## Set Up Package

In [15]:
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/trl.git

## Set up imports and environment setting

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import transformers
import bitsandbytes as bnb
import huggingface_hub
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,TrainingArguments,pipeline,logging
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from huggingface_hub import notebook_login
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
huggingface_hub.login(token = "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", write_permission =True)


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\kensh\.cache\huggingface\token
Login successful


## Prompt Generation

In [2]:
def generate_prompt(data):
    return f"""
            Analyze the open price enclosed in the first square bracket, dayily high enclosed in the second
            square bracket, daily low in the third square bracket, close price in the fourth square barcket,
            volume on the fifth square bracket, and sentiments in the sixth square bracket, determine whether or not
            the sentiment embedded have a positive effect on the market price, a
            nd return the corresponding label of "Yes" or "No"

            [{data["open"]}] [{data["high"]}] [{data["low"]}] [{data["close"]}] [{data["volume"]}] [{data["sentiment_nltk"]}]={data["up"]}
            """.strip()

def generate_test_prompt(data):
    return f"""
             Analyze the open price enclosed in the first square bracket, dayily high enclosed in the second
            square bracket, daily low in the third square bracket, close price in the fourth square barcket,
            volume on the fifth square bracket, and sentiments in the sixth square bracket, determine whether or not
            the sentiment embedded have a positive effect on the market price, a
            nd return the corresponding label of "Yes" or "No"

            [{data["open"]}] [{data["high"]}] [{data["low"]}] [{data["close"]}] [{data["volume"]}] [{data["sentiment_nltk"]}]=
            """.strip()

## Set Up Dataset

In [3]:
file  = "../yahoo_news_preprocessed.csv"

df = pd.read_csv(file, names = ["date","open","high","low","close","volume","sentiment_nltk","up"], encoding = "utf-8", encoding_errors = "replace")
df["sentiment_nltk"] = df["sentiment_nltk"].replace({"1":"Positive","-1":"Negative","0":"Neutural"})
df["up"] = df["up"].replace({"0":"Yes","1":"No"})
df["index"] = df.index
x_train = []
x_test = []

# positive_count = (df["up"] == "Yes").sum()
# neg_count = len(df) - positive_count
# print(positive_count)
# print(neg_count)

for result in ["Yes", "No"]:
    train, test = train_test_split(df[df.up == result], train_size = 300, test_size=200, random_state=42)
    x_train.append(train)
    x_test.append(test)

x_train = pd.concat(x_train).sample(frac = 1)
x_test = pd.concat(x_test)
eval_index = [indx for indx in df.index if indx not in list(x_train.index) + list(x_test.index)]
x_eval = df[df.index.isin(eval_index)]
x_eval = x_eval.groupby("up",group_keys=False).apply(lambda x: x.sample(n=100,replace=True))
x_train = x_train.reset_index(drop=True)


x_train_prompts = pd.DataFrame(x_train.apply(generate_prompt, axis=1), columns=['prompt'])
x_eval_prompts = pd.DataFrame(x_eval.apply(generate_prompt, axis=1), columns=['prompt'])

y_test = x_test.up
x_test_prompts = pd.DataFrame(x_test.apply(generate_test_prompt, axis=1), columns=['prompt'])

train_data = Dataset.from_pandas(x_train_prompts)
eval_data = Dataset.from_pandas(x_eval_prompts)

## Evalutaion Function

In [4]:
def evaluate(y_true, y_pred):
    mapping = {"Yes":0,"No":1}
    y_true = np.vectorize(lambda x: mapping.get(x,1))(y_true)
    y_pred = np.vectorize(lambda x: mapping.get(x,1))(y_pred)

    accuracy = accuracy_score(y_true = y_true, y_pred = y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    unique_labels = set(y_pred)
    for label in unique_labels:
        label_indicies = [i for i in range(len(y_true)) if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indicies]
        label_y_pred = [y_pred[i] for i in label_indicies]
        accuracy = accuracy_score(label_y_true,label_y_pred)
        print(f"Accuracy for Label {label}: {accuracy:.3f}")

    print(f"\nClassification Report:\n{classification_report(y_true = y_true, y_pred=y_pred)}")
    print(f"\nConfusion Matrix:\n{confusion_matrix(y_true = y_true, y_pred = y_pred, labels = [0,1])}")
    

## Loading Model

In [5]:
model_name = "google/gemma-7b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=getattr(torch, "float16")
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Prediction

In [6]:
def predict(x_test_prompts, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(x_test_prompts))):
        prompt = x_test_prompts.iloc[i]["prompt"]
        input = tokenizer(prompt, return_tensors="pt").to("cuda")
        output = model.generate(**input, max_new_tokens = 1, temperature = 0.0)
        
        result = tokenizer.decode(output[0])
        answer = result.split("=")[-1]
        y_pred.append(answer)
       
       
    return y_pred

## Test the model Before fine-tunning

In [7]:
y_pred = predict(x_test_prompts,model,tokenizer)
evaluate(y_test,y_pred)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:51<00:00,  7.70it/s]

Accuracy: 0.470
Accuracy for Label 0: 0.805
Accuracy for Label 1: 0.135

Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.81      0.60       200
           1       0.41      0.14      0.20       200

    accuracy                           0.47       400
   macro avg       0.45      0.47      0.40       400
weighted avg       0.45      0.47      0.40       400


Confusion Matrix:
[[161  39]
 [173  27]]





## Training and Fine-tunning

In [10]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
)
training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=5,
    gradient_checkpointing=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    evaluation_strategy='steps',
    eval_steps = 112,
    eval_accumulation_steps=1,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="prompt",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
)

ValueError: Target module Identity() is not supported. Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`.