In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hub id of the checkpoint that the cells below load
model_name = "sinarashidi/llama-2-7b-chat-persian"
# Adapter / Hub repository name consumed by the merge and push_to_hub cells.
# It must be defined here, otherwise those cells raise NameError on a fresh kernel.
new_model = "llama-2-7b-chat-persian"
# Passed as `device_map` to from_pretrained (presumably places the whole model on GPU 0 — confirm)
device_map = {"": 0}

In [2]:
# --- bitsandbytes 4-bit quantization settings ---

# Load the base model with 4-bit weights
use_4bit = True

# Name of the torch dtype used for computation in the 4-bit layers
bnb_4bit_compute_dtype = "float16"

# 4-bit quantization scheme: "fp4" or "nf4"
bnb_4bit_quant_type = "nf4"

# Whether to enable nested (double) quantization
use_nested_quant = False

# Resolve the dtype name to the torch dtype object of the same name
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the settings above into the config handed to from_pretrained
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

In [4]:
# Load the checkpoint named by `model_name`, applying the quantization config
# and the device map defined in the earlier cells
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the matching tokenizer; the EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
tokenizer.pad_token = tokenizer.eos_token


Downloading (…)lve/main/config.json: 100%|██████████| 632/632 [00:00<00:00, 6.02MB/s]
Downloading (…)model.bin.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 2.62MB/s]
Downloading (…)l-00001-of-00002.bin: 100%|██████████| 9.98G/9.98G [23:21<00:00, 7.12MB/s]
Downloading (…)l-00002-of-00002.bin: 100%|██████████| 3.50G/3.50G [08:09<00:00, 7.16MB/s]
Downloading shards: 100%|██████████| 2/2 [31:32<00:00, 946.50s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.31s/it]
Downloading (…)neration_config.json: 100%|██████████| 174/174 [00:00<00:00, 2.03MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 695/695 [00:00<00:00, 7.40MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 3.07MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 434/434 [00:00<00:00, 4.67MB/s]


In [11]:
# Only let CRITICAL messages through the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Generate a completion for a Persian prompt wrapped in <s>[INST] ... [/INST] markers
prompt = "عملکرد مادربرد کامپیوتر را توضیح دهید"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=500,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] عملکرد مادربرد کامپیوتر را توضیح دهید [/INST] مادربرد کامپیوتر یک تکنیک برای تولید موجودی است که با موجودی دیگر مشابه است. این تکنیک برای تولید موجودی استفاده می شود که با موجودی دیگر مشابه است، اما با اندازه های مختلف یا موجودی های متفاوت تولید می شود. این می تواند به افزایش تعداد موجودی های تولید شده کمک کند و به کاهش هزینه های تولید کمک کند و به کاهش تعداد موجودی های تولید شده کمک کند که به عنوان موجودی های انبوه شناخته می شوند. با استفاده از مادربرد کامپیوتر، می توانیم موجودی های متفاوت را 


In [3]:
# Reload the checkpoint in half precision and fold the LoRA adapter weights into it.
# `new_model` (the adapter location) must already be defined when this cell runs —
# verify it is assigned in the configuration cell.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer instance so it can be saved alongside the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


In [4]:
# Upload the current `model` and `tokenizer` to the Hugging Face Hub under the
# repository name held in `new_model`.
# NOTE(review): `new_model` must be defined before this cell runs, and Hub
# credentials are presumably required — confirm. The recorded run of this cell
# ended in KeyboardInterrupt partway through the upload, so verify that the
# push actually completed.
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

pytorch_model-00001-of-00002.bin:  40%|███▉      | 3.96G/9.98G [20:17<55:51, 1.80MB/s]    

KeyboardInterrupt: 

pytorch_model-00001-of-00002.bin:  40%|███▉      | 3.96G/9.98G [20:32<55:50, 1.80MB/s]

In [11]:
# Only let CRITICAL messages through the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Generate a completion for an English prompt wrapped in <s>[INST] ... [/INST] markers
prompt = "What is a language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=500,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] What is a language model? [/INST]  A language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. everybody. Language models are typically trained on vast amounts of text data, such as books, articles, and websites, and are designed to learn the patterns and structures of language.

The goal of a language model is to be able to generate text that is similar to the training data, but not necessarily identical to it. This can involve generating text that is coherent and contextually appropriate, as well as producing novel and creative text that goes beyond the training data.

There are several types of language models, including:

1. Neural network-based language models: These models use deep neural networks to learn the patterns and structures of language. They are typically trained on large datasets of text and can generate text that is coherent and natural-soundi

In [32]:
import pandas as pd


# Accumulator for the prompt/response records built from the claim data
output = []

# Load the claim-detection training set and rename the label columns that
# contain spaces or slashes, so every label can be read as an attribute of an
# itertuples() row
claim_column_names = {
    'Not Claim': 'Not_Claim',
    'Support/Oppose': 'Support_Oppose',
    'Causation/Correlation': 'Causation_Correlation',
    'Rule/Law': 'Rule_Law',
    'Other Claim': 'Other',
}
data = pd.read_csv('./data/2.train_claim_data_text.csv').rename(columns=claim_column_names)
data.columns

Index(['local_id', 'tweet_id', 'data', 'Not_Claim', 'Trait', 'Action',
       'Support_Oppose', 'Prediction', 'Quantity', 'Causation_Correlation',
       'Rule_Law', 'Quote', 'Other', 'Comparision'],
      dtype='object')

In [33]:
# Label column -> Persian label, listed in the order the labels are emitted
CLAIM_LABELS = (
    ('Not_Claim', "غیر ادعا"),
    ('Trait', "صفت"),
    ('Action', "عمل"),
    ('Support_Oppose', "موافقت/مخالفت"),
    ('Prediction', "پیش بینی"),
    ('Quantity', "آمار"),
    ('Causation_Correlation', "علیت"),
    ('Rule_Law', "قوانین"),
    ('Quote', "نقل قول"),
    ('Other', "غیره"),
    ('Comparision', "مقایسه"),
)


def build_claim_answer(row):
    """Return the Persian labels of every claim column set to 1 on `row`.

    Labels are joined with the Persian comma "،", so the separator only appears
    between labels. Appending "label،" for each match instead left a dangling
    trailing separator on every answer whose last label was not the final one.

    Args:
        row: An object exposing the CLAIM_LABELS column names as attributes
            (here: a namedtuple produced by ``DataFrame.itertuples()``).

    Returns:
        The joined label string; empty when no column equals 1.
    """
    return "،".join(
        label for column, label in CLAIM_LABELS if getattr(row, column) == 1
    )


# Start from an empty list in this cell so that re-running it does not append
# a second copy of every record to a list created in an earlier cell.
output = []
for d in data.itertuples():
    question = f'Perform claim detection on the following sentence: {d.data}'
    answer = build_claim_answer(d)
    output.append({'text': f'### Human: \n{question} ### Assistant: \n{answer}'})

In [34]:
import json


# Write one JSON object per line. The records hold Persian text and are written
# unescaped (ensure_ascii=False), so the file encoding is pinned to UTF-8
# rather than left to the platform default, which may be unable to encode it.
with open('claim.jsonl', 'w', encoding='utf-8') as f:
    for o in output:
        json.dump(o, f, ensure_ascii=False)
        f.write('\n')

In [37]:
# Load the stance-detection training set; the cells below read its
# post_text, reply_text, Against, Support and Neither columns
stance_csv = './data/2.train_stance_data_text.csv'
data = pd.read_csv(stance_csv)
data.head()

Unnamed: 0,id,post_id,post_text,reply_id,reply_text,Against,Support,Neither
0,4471,1540977338040581888,وظیفه خود در اجرایی کردن بخشنامه‌ها و فرامین ح...,1540977420563513088,عزل حسین طائب از ریاست اطلاعات سپاه بعنوان بمب...,0,1,0
1,5319,1494077275997888512,کمیسیون اصل ۹۰ بعد از گذشت چندین ماه از طرح شک...,1494085073670447104,از کمیسیون اصل90 درخواست میکنیم درمورد درنظر...,0,1,0
2,3741,1543639992110653440,... انشاالله خدا او را به خاطر خدماتش بیامرزد....,1543654831537127424,انسان‌ها مجموعه ای از رفتارهای صحیح و ناصحیح ه...,0,1,0
3,6272,1492293577556336640,گزارش در مورد #عیسی_شریفی را در سال ۹۵ نوشتم و...,1492374723669413888,اما عیسی شریفی کرمانشاهی نیست!,0,0,1
4,1580,1545306680203812864,تکمیلی/ تزریق مستمر خون به نخست‌وزیر سابق ژاپن...,1545333584977879040,نخست‌وزیر سابق ژاپن جان باخت\n\n🔹 شبکه «اِن‌اِ...,0,1,0


In [38]:
# Stance column -> Persian label; labels of all columns equal to 1 are
# concatenated in this order without a separator
stance_labels = (
    ('Against', "مخالف"),
    ('Support', "موافق"),
    ('Neither', "هیچکدام"),
)


def _stance_record(row):
    """Build one '### Human / ### Assistant' record for a post/reply pair."""
    question = f'Perform stance detection on the following sentence and its reply sentence:\nSentence: {row.post_text}\nReply sentence: {row.reply_text}'
    answer = ''.join(
        label for column, label in stance_labels if getattr(row, column) == 1
    )
    return {'text': f'### Human: \n{question} ### Assistant: \n{answer}'}


# One record per row of the stance dataframe, in row order
output = [_stance_record(row) for row in data.itertuples()]

In [40]:
# Write one JSON object per line. The records hold Persian text and are written
# unescaped (ensure_ascii=False), so the file encoding is pinned to UTF-8
# rather than left to the platform default, which may be unable to encode it.
with open('stance.jsonl', 'w', encoding='utf-8') as f:
    for o in output:
        json.dump(o, f, ensure_ascii=False)
        f.write('\n')

In [43]:
import json

def combine_jsonl_files(input_files, output_file):
    """Concatenate several JSONL files into a single JSONL file.

    Every input is read completely, in the order given, before the output is
    opened. Each non-blank line is parsed with ``json.loads`` and written back
    as one JSON object per line with non-ASCII characters left unescaped.

    Args:
        input_files: Iterable of paths of the JSONL files to read.
        output_file: Path of the combined JSONL file; opened in 'w' mode, so
            existing content is replaced.

    Raises:
        OSError: If a file cannot be opened (e.g. an input does not exist).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    combined_data = []

    # Read data from input JSONL files
    for input_file in input_files:
        # UTF-8 is set explicitly: the records contain unescaped non-ASCII
        # text, which the platform default encoding may not decode correctly.
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines (e.g. stray empty lines at the end of
                    # a file) instead of failing inside json.loads('').
                    continue
                combined_data.append(json.loads(line))

    # Write combined data to a new JSONL file
    with open(output_file, 'w', encoding='utf-8') as f:
        for json_object in combined_data:
            f.write(json.dumps(json_object, ensure_ascii=False) + '\n')

# Source files to merge, in the order their records appear in the combined file.
# NOTE(review): 'sentiment.jsonl' is not written by any cell visible here —
# presumably it is produced elsewhere; confirm it exists before running.
input_files = [
    'stance.jsonl',
    'claim.jsonl',
    'sentiment.jsonl',
]

# Destination of the merged dataset
output_file = 'sentiment_claim_stance.jsonl'

# Merge the per-task files into the single combined file
combine_jsonl_files(input_files=input_files, output_file=output_file)
