In [1]:
!pip install transformers




In [2]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, AdamW, GPT2Config
from tqdm import tqdm
import torch.nn as nn


In [3]:
torch.cuda.empty_cache()

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


steps:
Text Lowercasing
Tokenization
Stop Word Removal
Special Character and Number Removal
Removing URLs and Email Addresses
Lemmatization or Stemming
Handle Domain-Specific Abbreviations and Jargon
Language Detection and Removal
Removing Duplicates
Handle Missing Values
Text Length Limitation
Data Normalization
Quality Control
Feature Engineering

In [5]:
def clean_text(df):
  df['Sentence'] = df['Sentence'].str.lower() # convert all text to lower
  df['Sentence'] = df['Sentence'].str.replace(r"https?://\S+|www\.\S+"," ",regex = True) # remove all URLs
  df['Sentence'] = df['Sentence'].str.replace(r"#[A-Za-z0-9_]+"," ", regex = True) #remove all hashtags
  df['Sentence'] = df['Sentence'].str.replace(r"@","at", regex = True) #replacing @ with at
  df['Sentence'] = df['Sentence'].str.replace(r"[^A-Za-z(),!?@\'\"_\n]"," ", regex = True)
  return df


In [6]:
def preprocess_data(path):
  df = pd.read_csv(path)
  df = clean_text(df)
  return df

In [8]:
data = preprocess_data('/content/gdrive/MyDrive/Market Data/Stock-Market-News-Dataset.csv')

label_encoder = LabelEncoder()
data['Sentiment'] = label_encoder.fit_transform(data['Sentiment'])

# Check the mapping between classes and numerical labels
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Class Mapping:", class_mapping)


Class Mapping: {'negative': 0, 'neutral': 1, 'positive': 2}


In [9]:
data

Unnamed: 0,Sentence,Sentiment
0,the geosolutions technology will leverage bene...,2
1,"esi on lows, down to bk a real po...",0
2,"for the last quarter of , componenta 's n...",2
3,according to the finnish russian chamber of co...,1
4,the swedish buyout firm has sold its remaining...,1
...,...,...
5837,rising costs have forced packaging producer hu...,0
5838,nordic walking was first used as a summer trai...,1
5839,"according shipping company viking line , the e...",1
5840,"in the building and home improvement trade , s...",1


Splitting data into train test validation data using stratified sampling to avoid heavy bias towards one class

In [10]:
from sklearn.model_selection import train_test_split

train_size = 0.8
val_size = 0.1
test_size = 0.1

documents = data['Sentence']
labels = data['Sentiment']

X_train, X_temp, y_train, y_temp = train_test_split(documents, labels, test_size= 1 - train_size, random_state=42, stratify = labels)

# Split the temporary set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_size/(val_size + test_size), random_state=42, stratify = y_temp)

print("X_train size: ", X_train.size)
print("y_train size: ", y_train.size)
print("X_val size: ", X_val.size)
print("y_val size: ", y_val.size)
print("X_test size: ", X_test.size)
print("y_test size: ", y_test.size)


X_train size:  4673
y_train size:  4673
X_val size:  584
y_val size:  584
X_test size:  585
y_test size:  585


Creating Dataset class to pass into dataloader

In [11]:
class MarketSentimentDataset(Dataset):
  def __init__(self, documents, labels, tokenizer, max_len):
          self.documents = documents
          self.labels = labels
          self.tokenizer = tokenizer
          self.max_len = max_len

  def __len__(self):
      return len(self.labels)

  def __getitem__(self, idx):
      text = self.documents.iloc[idx]
      label = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

      # Tokenize the text using RoBERTa tokenizer
      inputs = self.tokenizer.encode_plus(
          text,
          None,
          add_special_tokens=True,
          max_length=self.max_len,
          pad_to_max_length=True,
          return_token_type_ids=True,
          truncation=True)
      input_ids = inputs['input_ids']
      attention_mask = inputs['attention_mask']

      return {
          'input_ids': torch.tensor(input_ids, dtype=torch.long),
          'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
          'label': label
      }





In [12]:
#Parameters for training

MAX_LEN = 256

In [13]:
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
#configuration = GPT2Config()


train_dataset = MarketSentimentDataset(X_train, y_train, tokenizer,MAX_LEN)
val_dataset = MarketSentimentDataset(X_val, y_val, tokenizer, MAX_LEN)
test_dataset = MarketSentimentDataset(X_test, y_test, tokenizer, MAX_LEN)

# Create data loaders
batch_size = 1
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=3)
#model = GPT2ForSequenceClassification(configuration).from_pretrained(model_name).to(device)
#model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

Zero Shot Learning using GPT-2

In [None]:
#Training data accuracy
model.eval()
train_labels = []
train_preds = []

with torch.no_grad():
    for batch in tqdm(train_loader, desc="Training Accuracy:"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, 1)

        train_labels.extend(labels.cpu().numpy())
        train_preds.extend(predicted_labels.cpu().numpy())

val_accuracy = accuracy_score(train_labels, train_preds)
print(f'Training Accuracy: {val_accuracy:.4f}')

Training Accuracy:: 100%|██████████| 4673/4673 [01:35<00:00, 48.81it/s]

Training Accuracy: 0.1472





In [None]:
# Validation data accuracy
model.eval()
val_labels = []
val_preds = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, 1)

        val_labels.extend(labels.cpu().numpy())
        val_preds.extend(predicted_labels.cpu().numpy())

val_accuracy = accuracy_score(val_labels, val_preds)
print(f'Validation Accuracy: {val_accuracy:.4f}')

Validation: 100%|██████████| 584/584 [00:10<00:00, 53.47it/s]

Validation Accuracy: 0.1473





In [None]:
# Test Data Accuracy
model.eval()
test_labels = []
test_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, 1)

        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(predicted_labels.cpu().numpy())

test_accuracy = accuracy_score(test_labels, test_preds)
print(f'Test Accuracy: {test_accuracy:.4f}')

Test: 100%|██████████| 585/585 [00:11<00:00, 51.81it/s]

Test Accuracy: 0.1470





In [None]:
torch.cuda.empty_cache()

**Few Shot Learning**

In [None]:
X_fs_train, X_rem, y_fs_train, y_rem = train_test_split(X_train, y_train, test_size = 1 - 0.01, random_state = 42, stratify = y_train)
print("X_fs_train size: ", X_fs_train.size)
print("y_fs_train size: ", y_fs_train.size)

X_fs_train size:  46
y_fs_train size:  46


In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load pre-trained RoBERTa model and tokenizer
model_name = 'gpt2'  # You can change this to a different RoBERTa variant if needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 classes: positive, negative, neutral


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
MAX_LEN = 256



In [None]:
X_fs_dataset = MarketSentimentDataset(X_fs_train, y_fs_train, tokenizer, max_len = MAX_LEN)
fs_train_dataloader = DataLoader(X_fs_dataset, batch_size=1, shuffle=True)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(fs_train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
    #for batch in fs_train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")

    average_loss = total_loss / len(fs_train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")


Epoch 1/3: 100%|██████████| 46/46 [00:04<00:00, 10.96it/s]
Validation: 100%|██████████| 584/584 [00:10<00:00, 54.14it/s]


Validation Accuracy: 50.68%
Epoch 1/3, Average Loss: 2.673862696463323


Epoch 2/3: 100%|██████████| 46/46 [00:04<00:00, 11.18it/s]
Validation: 100%|██████████| 584/584 [00:10<00:00, 53.45it/s]


Validation Accuracy: 53.25%
Epoch 2/3, Average Loss: 0.9891215932110081


Epoch 3/3: 100%|██████████| 46/46 [00:04<00:00, 11.30it/s]
Validation: 100%|██████████| 584/584 [00:11<00:00, 52.75it/s]

Validation Accuracy: 53.60%
Epoch 3/3, Average Loss: 0.7937531171609526





In [None]:
#Evaluation on th fs train set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(fs_train_dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

Training: 100%|██████████| 46/46 [00:00<00:00, 54.31it/s]

Train Accuracy: 69.57%





In [None]:
# Evaluation on the val set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation: 100%|██████████| 584/584 [00:11<00:00, 51.43it/s]

Validation Accuracy: 53.60%





In [None]:
# Evaluation on the test set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test: 100%|██████████| 585/585 [00:11<00:00, 52.21it/s]

Test Accuracy: 53.85%





In [None]:
torch.cuda.empty_cache()

## Fine tuning and increasing dropout to reduce overfitting




In [None]:
class GPT2ForMarketSentimentClassification(torch.nn.Module):
    def __init__(self, model_name):
        super(GPT2ForMarketSentimentClassification,self).__init__()
        config = GPT2Config.from_pretrained(model_name)
        config.num_labels = 3
        config.attention_dropout = 0.15  # Set custom dropout value for attention layers
        config.output_dropout = 0.15
        self.gpt2 = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)

        # Classifier with additional intermediate layers
        '''self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 1024),  # Additional layer 1
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 512),  # Additional layer 2
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 3)  # Output layer
        )'''

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        return logits


In [None]:
#train model function
def train_model(lr, num_epochs, model):
  optimizer = AdamW(model.parameters(), lr)
  MAX_LEN = 256
  for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0

      for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
      #for batch in train_loader:
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['label'].to(device)

          optimizer.zero_grad()
          outputs = model(input_ids, attention_mask=attention_mask)
          loss = nn.CrossEntropyLoss()(outputs, labels)
          total_loss += loss

          loss.backward()
          optimizer.step()

      average_loss = total_loss / len(train_loader)
      print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")


In [None]:
# Evaluation function

def evaluate_model(dataloader, description, model):
  model.eval()
  all_preds = []
  all_labels = []

  with torch.no_grad():
      for batch in tqdm(dataloader, desc= description):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['label'].to(device)

          outputs = model(input_ids, attention_mask=attention_mask)
          preds = torch.argmax(outputs, dim=1).cpu().numpy()

          all_preds.extend(preds)
          all_labels.extend(labels.cpu().numpy())

  accuracy = accuracy_score(all_labels, all_preds)
  print(f"{description} accuracy: {accuracy * 100:.2f}%")

In [None]:
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
#model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = GPT2ForMarketSentimentClassification(model_name)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPT2ForMarketSentimentClassification(
  (gpt2): GPT2ForSequenceClassification(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (score): Linear(in_features=768, out_features=3, bias=False)
  )
)

In [None]:
train_model(2e-5, 3, model)
evaluate_model(train_loader, 'Training', model)
evaluate_model(val_loader, 'Validation', model)
evaluate_model(test_loader, 'Testing', model)


Epoch 1/3: 100%|██████████| 4673/4673 [06:57<00:00, 11.20it/s]


Epoch 1/3, Average Loss: 0.7184796929359436


Epoch 2/3: 100%|██████████| 4673/4673 [06:57<00:00, 11.20it/s]


Epoch 2/3, Average Loss: 0.44391295313835144


Epoch 3/3: 100%|██████████| 4673/4673 [06:56<00:00, 11.21it/s]


Epoch 3/3, Average Loss: 0.30793675780296326


Training: 100%|██████████| 4673/4673 [01:29<00:00, 51.93it/s]


Training accuracy: 89.34%


Validation: 100%|██████████| 584/584 [00:11<00:00, 51.74it/s]


Validation accuracy: 77.91%


Testing: 100%|██████████| 585/585 [00:11<00:00, 50.86it/s]

Testing accuracy: 80.17%





In [None]:
torch.save(model.state_dict(), '/content/gdrive/MyDrive/Market_Sentiment_Analysis/market_sentiment_analysis_weights.csv')

In [None]:
torch.cuda.empty_cache()

## Fine Tuning by adding a layer

In [15]:
class GPT2ForMarketSentimentClassification(torch.nn.Module):
    def __init__(self, model_name):
        super(GPT2ForMarketSentimentClassification,self).__init__()
        config = GPT2Config.from_pretrained(model_name)
        config.num_labels = 3
        config.attention_dropout = 0.15  # Set custom dropout value for attention layers
        config.output_dropout = 0.15
        self.gpt2 = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)
        self.gpt2.score = nn.Sequential(nn.Linear(in_features=768, out_features=364, bias=False),
                                        nn.Linear(in_features=364, out_features=3, bias=False))

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        return logits


In [16]:
#train model function
def train_model(lr, num_epochs, model):
  optimizer = AdamW(model.parameters(), lr)
  MAX_LEN = 256
  for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0

      for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
      #for batch in train_loader:
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['label'].to(device)

          optimizer.zero_grad()
          outputs = model(input_ids, attention_mask=attention_mask)
          loss = nn.CrossEntropyLoss()(outputs, labels)
          total_loss += loss

          loss.backward()
          optimizer.step()

      average_loss = total_loss / len(train_loader)
      print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")


In [17]:
# Evaluation function

def evaluate_model(dataloader, description, model):
  model.eval()
  all_preds = []
  all_labels = []

  with torch.no_grad():
      for batch in tqdm(dataloader, desc= description):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['label'].to(device)

          outputs = model(input_ids, attention_mask=attention_mask)
          preds = torch.argmax(outputs, dim=1).cpu().numpy()

          all_preds.extend(preds)
          all_labels.extend(labels.cpu().numpy())

  accuracy = accuracy_score(all_labels, all_preds)
  print(f"{description} accuracy: {accuracy * 100:.2f}%")

In [18]:
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
#model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = GPT2ForMarketSentimentClassification(model_name)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPT2ForMarketSentimentClassification(
  (gpt2): GPT2ForSequenceClassification(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (score): Sequential(
      (0): Linear(in_features=768, out_features=

In [25]:
learning_rates = [ 2e-3, 2e-5, 2e-6]

for lr in learning_rates:
  print( "Learning Rate:"  + str(lr))
  train_model(lr, 3, model)
  evaluate_model(train_loader, 'Training', model)
  evaluate_model(val_loader, 'Validation', model)
  evaluate_model(test_loader, 'Testing', model)



Learning Rate:0.002


Epoch 1/3: 100%|██████████| 4673/4673 [06:57<00:00, 11.18it/s]


Epoch 1/3, Average Loss: 1.0600528717041016


Epoch 2/3: 100%|██████████| 4673/4673 [06:52<00:00, 11.32it/s]


Epoch 2/3, Average Loss: 1.0221596956253052


Epoch 3/3: 100%|██████████| 4673/4673 [07:00<00:00, 11.10it/s]


Epoch 3/3, Average Loss: 0.935643196105957


Training: 100%|██████████| 4673/4673 [01:27<00:00, 53.35it/s]


Training accuracy: 62.66%


Validation: 100%|██████████| 584/584 [00:10<00:00, 53.66it/s]


Validation accuracy: 58.22%


Testing: 100%|██████████| 585/585 [00:11<00:00, 52.54it/s]


Testing accuracy: 57.61%
Learning Rate:2e-05


Epoch 1/3: 100%|██████████| 4673/4673 [06:54<00:00, 11.28it/s]


Epoch 1/3, Average Loss: 0.8159314393997192


Epoch 2/3: 100%|██████████| 4673/4673 [07:09<00:00, 10.87it/s]


Epoch 2/3, Average Loss: 0.8027114272117615


Epoch 3/3: 100%|██████████| 4673/4673 [06:51<00:00, 11.36it/s]


Epoch 3/3, Average Loss: 0.797171413898468


Training: 100%|██████████| 4673/4673 [01:27<00:00, 53.44it/s]


Training accuracy: 66.00%


Validation: 100%|██████████| 584/584 [00:11<00:00, 52.98it/s]


Validation accuracy: 57.19%


Testing: 100%|██████████| 585/585 [00:10<00:00, 53.38it/s]


Testing accuracy: 58.63%
Learning Rate:2e-06


Epoch 1/3: 100%|██████████| 4673/4673 [07:12<00:00, 10.82it/s]


Epoch 1/3, Average Loss: 0.791845440864563


Epoch 2/3: 100%|██████████| 4673/4673 [06:49<00:00, 11.40it/s]


Epoch 2/3, Average Loss: 0.7937092185020447


Epoch 3/3: 100%|██████████| 4673/4673 [06:48<00:00, 11.43it/s]


Epoch 3/3, Average Loss: 0.7930886149406433


Training: 100%|██████████| 4673/4673 [01:26<00:00, 54.12it/s]


Training accuracy: 66.23%


Validation: 100%|██████████| 584/584 [00:10<00:00, 53.50it/s]


Validation accuracy: 57.36%


Testing: 100%|██████████| 585/585 [00:10<00:00, 55.07it/s]

Testing accuracy: 58.63%





In [26]:
epochs = [3,5]

for num_epochs in epochs:
  train_model(2e-6, num_epochs, model)
  evaluate_model(train_loader, 'Training', model)
  evaluate_model(val_loader, 'Validation', model)
  evaluate_model(test_loader, 'Testing', model)


Epoch 1/3: 100%|██████████| 4673/4673 [07:03<00:00, 11.03it/s]


Epoch 1/3, Average Loss: 0.7889467477798462


Epoch 2/3: 100%|██████████| 4673/4673 [06:50<00:00, 11.37it/s]


Epoch 2/3, Average Loss: 0.7951905131340027


Epoch 3/3: 100%|██████████| 4673/4673 [06:50<00:00, 11.39it/s]


Epoch 3/3, Average Loss: 0.7895658016204834


Training: 100%|██████████| 4673/4673 [01:26<00:00, 53.81it/s]


Training accuracy: 66.25%


Validation: 100%|██████████| 584/584 [00:11<00:00, 52.86it/s]


Validation accuracy: 57.88%


Testing: 100%|██████████| 585/585 [00:10<00:00, 54.17it/s]


Testing accuracy: 58.63%


Epoch 1/5: 100%|██████████| 4673/4673 [06:49<00:00, 11.42it/s]


Epoch 1/5, Average Loss: 0.7870808839797974


Epoch 2/5: 100%|██████████| 4673/4673 [06:49<00:00, 11.40it/s]


Epoch 2/5, Average Loss: 0.7861886620521545


Epoch 3/5: 100%|██████████| 4673/4673 [06:50<00:00, 11.38it/s]


Epoch 3/5, Average Loss: 0.7835069298744202


Epoch 4/5: 100%|██████████| 4673/4673 [06:48<00:00, 11.44it/s]


Epoch 4/5, Average Loss: 0.7887039184570312


Epoch 5/5: 100%|██████████| 4673/4673 [06:48<00:00, 11.43it/s]


Epoch 5/5, Average Loss: 0.7883259057998657


Training: 100%|██████████| 4673/4673 [01:25<00:00, 54.37it/s]


Training accuracy: 66.47%


Validation: 100%|██████████| 584/584 [00:10<00:00, 53.31it/s]


Validation accuracy: 58.22%


Testing: 100%|██████████| 585/585 [00:10<00:00, 54.84it/s]

Testing accuracy: 58.63%





In [None]:
torch.save(model.state_dict(), '/content/gdrive/MyDrive/Market_Sentiment_Analysis/market_sentiment_analysis_weights.csv')

In [22]:
torch.cuda.empty_cache()