In [1]:
!pip install kaggle



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!mkdir ~/.kaggle

In [5]:
!cp /content/drive/MyDrive/kaggle_api_credentials/kaggle.json ~/.kaggle/

In [6]:
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
!kaggle datasets download ankurzing/sentiment-analysis-for-financial-news

Downloading sentiment-analysis-for-financial-news.zip to /content
  0% 0.00/903k [00:00<?, ?B/s]
100% 903k/903k [00:00<00:00, 158MB/s]


In [8]:
!unzip /content/sentiment-analysis-for-financial-news.zip

Archive:  /content/sentiment-analysis-for-financial-news.zip
  inflating: FinancialPhraseBank/License.txt  
  inflating: FinancialPhraseBank/README.txt  
  inflating: FinancialPhraseBank/Sentences_50Agree.txt  
  inflating: FinancialPhraseBank/Sentences_66Agree.txt  
  inflating: FinancialPhraseBank/Sentences_75Agree.txt  
  inflating: FinancialPhraseBank/Sentences_AllAgree.txt  
  inflating: all-data.csv            


# Dataset Creation

In [9]:
!pip install -q -U datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
from sklearn.model_selection import train_test_split
import pandas as pd
filename = "/content/all-data.csv"

df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

In [11]:
df.head(10)

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
6,positive,"For the last quarter of 2010 , Componenta 's n..."
7,positive,"In the third quarter of 2010 , net sales incre..."
8,positive,Operating profit rose to EUR 13.1 mn from EUR ...
9,positive,"Operating profit totalled EUR 21.1 mn , up fro..."


In [12]:
X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment],
                                    train_size=300,
                                    test_size=300,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

In [13]:
def generate_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative"

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip()

In [14]:
from datasets import Dataset
def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative"

            [{data_point["text"]}] =

            """.strip()

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [25]:
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
import numpy as np
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

# Loading Model Gemma-7b using Unslot

In [16]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [17]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-7b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.57G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [18]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [19]:
modules = find_all_linear_names(model)
print(modules)

['o_proj', 'down_proj', 'up_proj', 'v_proj', 'gate_proj', 'q_proj', 'k_proj']


In [22]:
from tqdm import tqdm
def predict(X_test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**input_ids, max_new_tokens=1, temperature=0.0)
        result = tokenizer.decode(outputs[0])
        answer = result.split("=")[-1].lower()
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

In [23]:
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 900/900 [17:37<00:00,  1.17s/it]


In [26]:
evaluate(y_true, y_pred)

Accuracy: 0.667
Accuracy for label 0: 0.787
Accuracy for label 1: 0.307
Accuracy for label 2: 0.907

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.79      0.84       300
           1       0.57      0.31      0.40       300
           2       0.57      0.91      0.70       300

    accuracy                           0.67       900
   macro avg       0.68      0.67      0.65       900
weighted avg       0.68      0.67      0.65       900


Confusion Matrix:
[[236  45  19]
 [ 21  92 187]
 [  5  23 272]]


Fine Tuning the model  

Since accuracy is 66 percent

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = modules,
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [30]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset=train_data,
    eval_dataset=eval_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/150 [00:00<?, ? examples/s]

In [32]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 200,015,872


Step,Training Loss
1,2.7832
2,2.849
3,2.6049
4,2.4596
5,2.571
6,2.1233
7,2.0578
8,1.6862
9,1.5756
10,1.2853


In [41]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

504.5797 seconds used for training.
8.41 minutes used for training.
Peak reserved memory = 8.594 GB.
Peak reserved memory for training = 1.949 GB.
Peak reserved memory % of max memory = 58.272 %.
Peak reserved memory for training % of max memory = 13.215 %.


In [33]:
model.save_pretrained("gemma-7b-unsloth-bfsi-Sentiment-Analysis")

# After training it the accuracy increased to 85% from 60%

In [35]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████| 900/900 [04:36<00:00,  3.25it/s]

Accuracy: 0.859
Accuracy for label 0: 0.953
Accuracy for label 1: 0.853
Accuracy for label 2: 0.770

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       300
           1       0.77      0.85      0.81       300
           2       0.86      0.77      0.81       300

    accuracy                           0.86       900
   macro avg       0.86      0.86      0.86       900
weighted avg       0.86      0.86      0.86       900


Confusion Matrix:
[[286  11   3]
 [ 10 256  34]
 [  2  67 231]]





In [36]:
evaluation = pd.DataFrame({'text': X_test["text"],
                           'y_true':y_true,
                           'y_pred': y_pred},
                         )

In [37]:
evaluation.to_csv("test_predictions.csv", index=False)

In [38]:
filename = "/content/test_predictions.csv"

output_df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

In [39]:
output_df.head(10)

Unnamed: 0,sentiment,text
text,y_true,y_pred
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [The new agreement , which expands a long-established cooperation between the companies , involves the transfer of certain engineering and documentation functions from Larox to Etteplan .] =",positive,neutral
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [( ADP News ) - Finnish handling systems provider Cargotec Oyj ( HEL : CGCBV ) announced on Friday it won orders worth EUR 10 million ( USD 13.2 m ) to deliver linkspans to Jordan , Morocco and Ireland .] =",positive,positive
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [The world 's biggest magazine paper maker said the program to improve efficiency will include closing several of its least competitive mills and would cover all the company 's operations resulting in annual savings of some euro200 million US$ 240 million .] =",positive,positive
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [a January 11 , 2010 EPHC board of directors has approved an increase in the quarterly dividend from $ 0.03 to $ 0.05 per share .] =",positive,positive
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [With this appointment Kaupthing Bank aims to further co-ordinate Capital Markets activities within the Group and to improve the overall service to clients .] =",positive,neutral
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [ST. PETERSBURG , Oct 14 ( PRIME-TASS ) -- Finnish tire producer Nokian Tyres plans to invest about 50 million euros in the expansion of its tire plant in the city of Vsevolozhsk in Russia 's Leningrad Region in 2011 , the company 's President Kim Gran told reporters Thursday .] =",positive,positive
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [During the past decade it has gradually divested noncore assets and bought several sports equipment makers , including California-based Fitness Products International and Sparks , Nevada-based ATEC , a leading maker of baseball and softball pitching machines .] =",positive,neutral
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [- BEIJING XFN-ASIA - Hong Kong-listed Standard Chartered Bank said it has signed a China mobile phone dealer financing agreement with Nokia , making it the first foreign bank to offer financing to the country 's small and medium enterprise -LR] =",positive,positive
"Analyze the sentiment of the news headline enclosed in square brackets, \n determine if it is positive, neutral, or negative, and return the answer as \n the corresponding sentiment label ""positive"" or ""neutral"" or ""negative""\n\n [According to Schmardin , Nordea will most likely try to win customers over from other pension fund providers .] =",positive,positive


In [42]:
model.save_pretrained_merged("outputs", tokenizer, save_method = "merged_16bit",)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.42 out of 12.67 RAM for saving.


 32%|███▏      | 9/28 [00:00<00:01, 16.66it/s]We will save to Disk and not RAM now.
100%|██████████| 28/28 [01:05<00:00,  2.35s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving outputs/pytorch_model-00001-of-00004.bin...
Unsloth: Saving outputs/pytorch_model-00002-of-00004.bin...
Unsloth: Saving outputs/pytorch_model-00003-of-00004.bin...
Unsloth: Saving outputs/pytorch_model-00004-of-00004.bin...
Done.


In [44]:
from huggingface_hub import notebook_login, HfApi
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
model.push_to_hub_merged("gemma-7b-unsloth-bfsi-Sentiment-Analysis-merged", tokenizer, save_method = "merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.0 out of 12.67 RAM for saving.


100%|██████████| 28/28 [01:57<00:00,  4.19s/it]


Unsloth: Saving tokenizer...

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving gemma-7b-unsloth-bfsi-Sentiment-Analysis-merged/pytorch_model-00001-of-00004.bin...
Unsloth: Saving gemma-7b-unsloth-bfsi-Sentiment-Analysis-merged/pytorch_model-00002-of-00004.bin...
Unsloth: Saving gemma-7b-unsloth-bfsi-Sentiment-Analysis-merged/pytorch_model-00003-of-00004.bin...
Unsloth: Saving gemma-7b-unsloth-bfsi-Sentiment-Analysis-merged/pytorch_model-00004-of-00004.bin...


README.md:   0%|          | 0.00/576 [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

Done.
Saved merged model to https://huggingface.co/None/gemma-7b-unsloth-bfsi-Sentiment-Analysis-merged
