In [None]:
!pip install accelerate
!pip install pandas
!pip install evaluate
!pip install numpy
!pip install torch
!pip install transformers

In [1]:
import accelerate
import pandas as pd
from transformers import pipeline
from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau
import evaluate
import numpy as np
import torch
import warnings

  from .autonotebook import tqdm as notebook_tqdm
2024-04-15 02:25:05.324941: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-15 02:25:05.578229: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

import os

In [3]:
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Silence Python warnings for the rest of the session to keep the logs short.
warnings.simplefilter("ignore")

# drive.mount("/content/gdrive")

# file = "/content/gdrive/MyDrive/GDSC_AI_STOCK/training_set/training_set.csv"
file = "training_set.csv"
df = pd.read_csv(file)

# Label <-> class-id mappings handed to the model config further down.
# Ids are the contiguous range 0..2 so they can be used directly as class
# indices (the model is built with num_labels=3).
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# Convert the textual labels to class ids. Series.map yields NaN for any value
# that is not a key of label2id, so fail loudly here instead of letting NaN
# labels leak into the train/validation split and the training loop.
raw_labels = df["label"]
df["label"] = raw_labels.map(label2id)
unknown_labels = raw_labels[df["label"].isna()].unique().tolist()
if unknown_labels:
    raise ValueError(
        f"Unexpected label values in {file!r}: {unknown_labels}; "
        f"expected one of {sorted(label2id)}"
    )

In [4]:
# Hold out 30% of the rows for validation; the split is stratified on the
# label column and seeded so it is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.3,
    random_state=1,
    stratify=df["label"],
)

# Plain Python lists of texts and labels for the tokenizer / dataset wrapper.
train_text, train_label = train_df["text"].tolist(), train_df["label"].tolist()
val_text, val_label = val_df["text"].tolist(), val_df["label"].tolist()

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained checkpoint shared by the tokenizer and the classifier.
bert = "google-bert/bert-large-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert)
model = AutoModelForSequenceClassification.from_pretrained(
    bert,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)


class GDSCDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            entry per example (read through ``.items()``).
        labels: Indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


# Both splits are tokenized with identical settings: truncate to at most 512
# tokens and pad the encoded sequences.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encoding = tokenizer(train_text, **tokenize_kwargs)
val_encoding = tokenizer(val_text, **tokenize_kwargs)

# Wrap encodings and labels so the Trainer can index individual examples.
train_ds = GDSCDataset(train_encoding, train_label)
val_ds = GDSCDataset(val_encoding, val_label)

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation hyper-parameters.
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Step-based logging/evaluation; reload the best checkpoint when done.
    logging_steps=500,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions
        compared against ``labels``.
    """
    scores, references = eval_pred
    # The highest-scoring entry along the last axis is the predicted class.
    predicted_classes = np.asarray(scores).argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


# Collect the Trainer inputs in one place: model, hyper-parameters, both
# dataset splits and the metric callback.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_ds,
    "eval_dataset": val_ds,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning.
trainer.train()

# Save the model and its tokenizer side by side in one directory.
model_path = "./My_model/"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


Some weights of the model checkpoint at google-bert/bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-lar

Step,Training Loss,Validation Loss,Accuracy
500,0.1837,0.028318,0.992958
1000,0.0291,0.017011,0.996391
1500,0.0089,0.014985,0.996743
2000,0.002,0.011047,0.997623


***** Running Evaluation *****
  Num examples = 11361
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 11361
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 11361
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 11361
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_m

('./My_model/tokenizer_config.json',
 './My_model/special_tokens_map.json',
 './My_model/vocab.txt',
 './My_model/added_tokens.json',
 './My_model/tokenizer.json')

In [5]:
# Ask PyTorch to release the GPU memory it is caching but no longer using,
# now that training and saving have finished.
torch.cuda.empty_cache()