<a href="https://colab.research.google.com/github/srini11govind/skills-github-pages/blob/main/milestone3%264.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from nltk.corpus import stopwords
from collections import Counter
import nltk

In [None]:
df = pd.read_csv("/content/train.csv")
print("Dataset Shape:", df.shape)
df.head()

Dataset Shape: (6827, 8)


Unnamed: 0,id,text,anger,fear,joy,sadness,surprise,emotions
0,0,the dentist that did the work apparently did a...,1,0,0,1,0,['anger' 'sadness']
1,1,i'm gonna absolutely ~~suck~~ be terrible duri...,0,1,0,1,0,['fear' 'sadness']
2,2,"bridge: so leave me drowning calling houston, ...",0,1,0,1,0,['fear' 'sadness']
3,3,after that mess i went to see my now ex-girlfr...,1,1,0,1,0,['anger' 'fear' 'sadness']
4,4,"as he stumbled i ran off, afraid it might some...",0,1,0,0,0,['fear']


In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [None]:
original_char_count = df['text'].apply(len).sum()
df['clean_text'] = df['text'].apply(clean_text)
clean_char_count = df['clean_text'].apply(len).sum()
char_reduction_pct = ((original_char_count - clean_char_count) / original_char_count) * 100
print(f"Percentage reduction in total character count after removing punctuation: {char_reduction_pct:.2f}%")

Percentage reduction in total character count after removing punctuation: 3.26%


In [None]:
stop_words = set(stopwords.words('english'))
all_words = " ".join(df['clean_text']).split()
unique_words = set(all_words)
stopword_in_unique = [w for w in unique_words if w in stop_words]
stopword_percentage = (len(stopword_in_unique) / len(unique_words)) * 100
print(f"Percentage of unique words that are stop words: {stopword_percentage:.2f}%")

Percentage of unique words that are stop words: 1.54%


In [None]:

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)

df['text_clean'] = df['text'].apply(clean_text)

In [None]:
labels = ["anger", "fear", "joy", "sadness", "surprise"]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import wandb
from sklearn.metrics import f1_score


wandb.init(project="emotion-classification", name="SimpleNN-Embedding")


class EmotionDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=50):
        self.texts = texts
        self.labels = labels.values.astype(float)
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def text_to_seq(self, text):
        tokens = text.lower().split()[:self.max_len]
        seq = [self.vocab.get(tok, self.vocab['<UNK>']) for tok in tokens]

        seq += [self.vocab['<PAD>']] * (self.max_len - len(seq))
        return torch.tensor(seq)

    def __getitem__(self, idx):
        seq = self.text_to_seq(self.texts[idx])
        label = torch.tensor(self.labels[idx])
        return seq, label

class SimpleNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_labels):
        super(SimpleNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim * max_len, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embeds = self.embedding(x)
        embeds = embeds.view(embeds.size(0), -1)
        out = self.relu(self.fc1(embeds))
        out = self.sigmoid(self.fc2(out))
        return out


from collections import Counter
all_tokens = ' '.join(df['text_clean']).split()
counter = Counter(all_tokens)
vocab = {word: idx+2 for idx, word in enumerate(counter)}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

max_len = 50
embed_dim = 50
num_labels = 5

# Dataset and dataloaders
dataset = EmotionDataset(df['text_clean'].tolist(), df[labels], vocab, max_len)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_set, val_set = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32)

# Model, loss, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleNN(len(vocab), embed_dim, num_labels).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device).float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # Validation
    model.eval()
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device).float()
            outputs = model(inputs)
            val_preds.append(outputs.cpu())
            val_targets.append(targets.cpu())

    val_preds = torch.cat(val_preds).numpy()
    val_preds_bin = (val_preds > 0.5).astype(int)
    val_targets = torch.cat(val_targets).numpy()

    val_f1 = f1_score(val_targets, val_preds_bin, average='macro')

    # Log metrics to W&B
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_macro_f1": val_f1
    })

    print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Macro F1={val_f1:.4f}")

wandb.finish()

Epoch 1: Train Loss=0.5778, Val Macro F1=0.2072
Epoch 2: Train Loss=0.4570, Val Macro F1=0.3804
Epoch 3: Train Loss=0.3580, Val Macro F1=0.4697
Epoch 4: Train Loss=0.2702, Val Macro F1=0.5309
Epoch 5: Train Loss=0.2009, Val Macro F1=0.5771


0,1
epoch,▁▃▅▆█
train_loss,█▆▄▂▁
val_macro_f1,▁▄▆▇█

0,1
epoch,5.0
train_loss,0.20088
val_macro_f1,0.57709


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import wandb
from sklearn.metrics import f1_score

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels, max_len):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # Last timestep output
        out = self.fc(out)
        return self.sigmoid(out)

# Initialize W&B
#wandb.init(project="emotion-classification", name="LSTM-Model")

# Hyperparameters
hidden_dim = 64
embed_dim = 50
max_len = 50
num_labels = 5
batch_size = 32
epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate model
model = LSTMModel(len(vocab), embed_dim, hidden_dim, num_labels, max_len).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Assuming train_loader and val_loader are DataLoaders as from previous steps

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device).float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    model.eval()
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device).float()
            outputs = model(inputs)
            val_preds.append(outputs.cpu())
            val_targets.append(targets.cpu())

    val_preds = torch.cat(val_preds).numpy()
    val_preds_bin = (val_preds > 0.5).astype(int)
    val_targets = torch.cat(val_targets).numpy()
    val_f1 = f1_score(val_targets, val_preds_bin, average='macro')

    """wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_macro_f1": val_f1
    })"""

    print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Macro F1={val_f1:.4f}")

#wandb.finish()

Epoch 1: Train Loss=0.5815, Val Macro F1=0.1435
Epoch 2: Train Loss=0.5685, Val Macro F1=0.1435
Epoch 3: Train Loss=0.5686, Val Macro F1=0.1435
Epoch 4: Train Loss=0.5687, Val Macro F1=0.1435
Epoch 5: Train Loss=0.5683, Val Macro F1=0.1435


In [None]:
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels, max_len):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return self.sigmoid(out)

In [None]:

model = GRUModel(len(vocab), embed_dim, hidden_dim, num_labels, max_len).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



for epoch in range(epochs):
    model.train()
    train_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device).float()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    model.eval()
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device).float()
            outputs = model(inputs)
            val_preds.append(outputs.cpu())
            val_targets.append(targets.cpu())

    val_preds = torch.cat(val_preds).numpy()
    val_preds_bin = (val_preds > 0.5).astype(int)
    val_targets = torch.cat(val_targets).numpy()
    val_f1 = f1_score(val_targets, val_preds_bin, average='macro')

    """wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_macro_f1": val_f1
    })"""

    print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Macro F1={val_f1:.4f}")

Epoch 1: Train Loss=0.5816, Val Macro F1=0.1435
Epoch 2: Train Loss=0.5684, Val Macro F1=0.1435
Epoch 3: Train Loss=0.5655, Val Macro F1=0.1435
Epoch 4: Train Loss=0.5470, Val Macro F1=0.1281
Epoch 5: Train Loss=0.5180, Val Macro F1=0.1972


In [4]:
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-uncased')
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Total parameters: 109482240
