## Imports

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import re
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


!pip install emoji

import re
import emoji
nltk.download('words')
words = set(nltk.corpus.words.words())

## Data Pre-Processing

In [None]:
def cleaner(tweet):
    """Strip Twitter noise from a tweet and keep only dictionary words.

    Steps, in order: remove @mentions and URLs, collapse whitespace, drop
    emoji characters, remove '#' (keeping the tag text) and turn '_' into
    spaces, then keep only the tokens whose lowercase form is in the
    module-level `words` set.

    Args:
        tweet: Raw tweet text. Any non-string value (e.g. NaN coming from an
            empty CSV cell) is treated as an empty tweet.

    Returns:
        The cleaned tweet as one space-separated string (possibly empty).
    """
    if not isinstance(tweet, str):
        return ""
    # `\w` also matches '_', so a handle such as "@some_user" is removed whole
    # instead of leaving "_user" behind to be turned into a word further down.
    tweet = re.sub(r"@\w+", "", tweet)  # Remove @mentions
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)  # Remove links and leftover @...
    tweet = " ".join(tweet.split())  # Collapse runs of whitespace
    tweet = ''.join(c for c in tweet if not emoji.is_emoji(c))  # Remove emojis
    tweet = tweet.replace("#", "").replace("_", " ")  # Remove hashtag sign but keep the text
    # Keep only tokens found in the English word list (case-insensitive lookup,
    # original casing preserved in the output).
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet)
                     if w.lower() in words)
    return tweet


# Load the raw train / test splits.
train_df = pd.read_csv("train (1).csv")
test_df = pd.read_csv("test (1).csv")

# Clean the tweet text. Missing tweets become empty strings first so that
# `cleaner` always receives a str.
train_df['tweet'] = train_df['tweet'].fillna("").map(cleaner)
test_df['tweet'] = test_df['tweet'].fillna("").map(cleaner)

# Whitespace-tokenize the cleaned text. Tokens are lowercased because they are
# later looked up in the glove.6B vectors, whose vocabulary is uncased; a token
# such as "Happy" would otherwise never match and would end up as <UNK>.
train_df['tokens'] = train_df['tweet'].apply(lambda text: text.lower().split())
test_df['tokens'] = test_df['tweet'].apply(lambda text: text.lower().split())

# Binary target: Negative -> 0, Positive -> 1.
# NOTE(review): any other value in 'sentiment' maps to NaN here — confirm the
# CSVs only contain these two classes.
train_df['label'] = train_df['sentiment'].map({'Negative': 0, 'Positive': 1})
test_df['label'] = test_df['sentiment'].map({'Negative': 0, 'Positive': 1})


## GloVe Embedding setup

In [None]:
glove_path = "glove.6B.200d.txt"
EMBED_DIM = 200  # must match the vector size stored in `glove_path`

# Parse the GloVe text file: every line is "<word> <v1> <v2> ... <vN>".
glove_embeddings = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split()
        word = parts[0]
        vector = torch.tensor([float(val) for val in parts[1:]], dtype=torch.float)
        glove_embeddings[word] = vector


# Create vocab from training tokens only (tokens without a GloVe vector are
# left out and therefore resolve to <UNK> at lookup time).
# The unique tokens are sorted so that the token -> index assignment is the
# same on every run; plain set iteration order over strings changes between
# interpreter sessions because of string hash randomization.
vocab = {'<PAD>': 0, '<UNK>': 1}
for token in sorted({t for tokens in train_df['tokens'] for t in tokens}):
    if token in glove_embeddings:
        vocab[token] = len(vocab)

# Row i of the matrix is the vector for the token with index i.
# Row 0 (<PAD>) stays all-zero; row 1 (<UNK>) gets a random vector.
embedding_matrix = torch.zeros(len(vocab), EMBED_DIM)
embedding_matrix[vocab['<UNK>']] = torch.randn(EMBED_DIM)  # For UNK
for word, idx in vocab.items():
    if word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]

## Dataset Preparation

In [None]:
class TweetDataset(Dataset):
    """Tokenized tweets encoded as index tensors, optionally paired with labels.

    Args:
        tokens: Iterable of token lists, one list per tweet (e.g. a pandas
            Series whose values are lists of str).
        labels: Optional iterable of numeric labels aligned with `tokens`.
            When omitted, items are returned without a label.
        token_to_idx: Optional mapping from token to integer index. Defaults
            to the module-level `vocab`. Tokens missing from the mapping are
            encoded with the mapping's '<UNK>' index (1 if it has none).
    """

    def __init__(self, tokens, labels=None, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = vocab
        unk_idx = token_to_idx.get('<UNK>', 1)
        self.sequences = [
            torch.tensor([token_to_idx.get(t, unk_idx) for t in tok], dtype=torch.long)
            for tok in tokens
        ]
        # list(...) iterates the label values in order, so a pandas Series with
        # a non-default index (e.g. after filtering rows) is handled the same
        # way as a plain list.
        self.labels = torch.tensor(list(labels), dtype=torch.float32) if labels is not None else None

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return `(sequence, label)` when labels were given, else just `sequence`."""
        if self.labels is not None:
            return self.sequences[idx], self.labels[idx]
        return self.sequences[idx]

def _pad_to_longest(sequences):
    """Right-pad 1-D index tensors with 0 and return them as (seq_len, batch)."""
    # Always keep at least one time step: a batch made only of empty sequences
    # would otherwise produce a (0, batch) tensor.
    max_len = max(1, max(len(seq) for seq in sequences))
    padded = torch.zeros(max_len, len(sequences), dtype=torch.long)
    for col, seq in enumerate(sequences):
        padded[:len(seq), col] = seq
    return padded


def collate_fn(batch):
    """Collate variable-length index sequences into one padded batch.

    Args:
        batch: List of dataset items, either `(sequence, label)` tuples or
            bare `sequence` tensors (unlabeled data). Must be non-empty.

    Returns:
        `(padded, labels)` for labeled batches, or `padded` alone for
        unlabeled ones. `padded` is a long tensor of shape (seq_len, batch),
        right-padded with index 0; `labels` has shape (batch,).
    """
    if isinstance(batch[0], tuple):
        sequences, labels = zip(*batch)
        if all(isinstance(label, torch.Tensor) for label in labels):
            targets = torch.stack(labels)
        else:
            targets = torch.tensor(labels)
        return _pad_to_longest(sequences), targets
    return _pad_to_longest(batch)





# Wrap the tokenized splits as datasets and mini-batch loaders.
train_dataset = TweetDataset(train_df['tokens'], train_df['label'])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dataset = TweetDataset(test_df['tokens'], test_df['label'])
# The evaluation metrics do not depend on sample order, so the test split is
# iterated in a fixed order rather than reshuffled every epoch.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

## Model Definition, Training and Evaluation

In [None]:
class RNNModel(nn.Module):
    """Binary classifier: pretrained embeddings -> nn.RNN -> linear logit.

    Args:
        embedding_matrix: 2-D float tensor (vocab_size, embed_dim) used to
            initialise the embedding layer. Index 0 is the padding index and
            the embeddings are fine-tuned during training (freeze=False).
        hidden_size: Size of the RNN hidden state.
    """

    def __init__(self, embedding_matrix, hidden_size=25):
        super().__init__()
        _, embed_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_size)  # batch_first=False is default
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Compute one logit per sequence.

        Args:
            x: Long tensor of token indices, shape (seq_len, batch), where
               shorter sequences are right-padded with the padding index.

        Returns:
            Float tensor of shape (batch,) with the raw (pre-sigmoid) logits.
        """
        embedded = self.embedding(x)  # (seq_len, batch, embed_dim)
        out, _ = self.rnn(embedded)   # (seq_len, batch, hidden_size)

        # Read each sequence at its own last real token. Taking out[-1] for the
        # whole batch would return states computed after the padding steps, so
        # a prediction would depend on how much padding the batch added.
        lengths = (x != self.embedding.padding_idx).sum(dim=0)  # (batch,)
        last_step = (lengths - 1).clamp(min=0)  # all-padding sequences fall back to step 0
        batch_idx = torch.arange(x.size(1), device=x.device)
        last_out = out[last_step, batch_idx]    # (batch, hidden_size)
        return self.fc(last_out).squeeze(1)





device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNNModel(embedding_matrix).to(device)
criterion = nn.BCEWithLogitsLoss()  # takes the raw logits returned by the model
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# Per-epoch history, plotted after training.
train_losses = []
train_accuracies = []
test_accuracies = []


for epoch in range(1, 11):
    # --- Training pass ---
    model.train()
    total_loss = 0  # sum of the per-batch loss values over the epoch
    correct_train, total_train = 0, 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # view(-1) keeps the predictions 1-D even for a batch of one sample;
        # a bare squeeze() would collapse that case to a 0-d tensor.
        preds = (torch.sigmoid(outputs) > 0.5).long().view(-1)
        correct_train += (preds == batch_y).sum().item()
        total_train += batch_y.size(0)

    train_acc = 100 * correct_train / total_train
    train_losses.append(total_loss)
    train_accuracies.append(train_acc)

    # --- Evaluation on Test Set ---
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            # 1-D predictions are required here: list.extend() cannot iterate
            # the 0-d array that a squeezed single-sample batch would produce.
            preds = (torch.sigmoid(outputs) > 0.5).long().view(-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    test_acc = accuracy_score(all_labels, all_preds) * 100
    test_accuracies.append(test_acc)

    # Optional: F1, Precision, Recall per epoch (print or store)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    print(f"Epoch {epoch}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%, F1: {f1:.2f}")







# Plot Train Loss and Accuracy
# Epochs are numbered from 1 in the training log, so the x axis uses the same
# numbering instead of the 0-based list positions.
epochs = range(1, len(train_losses) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(epochs, train_losses, label="Train Loss")
loss_ax.set_title("Train Loss per Epoch")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()

acc_ax.plot(epochs, train_accuracies, label="Train Acc")
acc_ax.plot(epochs, test_accuracies, label="Test Acc")
acc_ax.set_title("Accuracy per Epoch")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy (%)")
acc_ax.legend()

fig.tight_layout()
plt.show()


## Imbalanced Data -> Balanced Data (Undersampling the Majority Class)

In [None]:
def undersample_to_balance(df, label_col='label', seed=42):
    """Return a shuffled copy of `df` with equally many rows labelled 0 and 1.

    The larger of the two classes is randomly undersampled (without
    replacement) to the size of the smaller one; rows whose label is neither
    0 nor 1 are dropped.

    Args:
        df: DataFrame with a binary label column (0 = Negative, 1 = Positive).
        label_col: Name of the label column.
        seed: Random state used for both the undersampling and the shuffle.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    neg_rows = df[df[label_col] == 0]
    pos_rows = df[df[label_col] == 1]
    # Shrink whichever class is larger; sampling more rows than a class holds
    # would raise, so the direction must not be assumed.
    if len(pos_rows) > len(neg_rows):
        pos_rows = pos_rows.sample(n=len(neg_rows), random_state=seed)
    else:
        neg_rows = neg_rows.sample(n=len(pos_rows), random_state=seed)
    return pd.concat([pos_rows, neg_rows]).sample(frac=1, random_state=seed).reset_index(drop=True)


# Balance both splits by undersampling the majority class.
balanced_train_df = undersample_to_balance(train_df)
balanced_test_df = undersample_to_balance(test_df)


train_dataset = TweetDataset(balanced_train_df['tokens'], balanced_train_df['label'])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dataset = TweetDataset(balanced_test_df['tokens'], balanced_test_df['label'])
# Evaluation metrics do not depend on sample order, so no shuffling here.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


# Fresh model and optimizer so the balanced run starts from the same
# pretrained embeddings rather than from the previously trained weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNNModel(embedding_matrix).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Per-epoch history for the balanced run, plotted after training.
train_losses = []
train_accuracies = []
test_accuracies = []


for epoch in range(1, 21):
    # --- Training pass ---
    model.train()
    total_loss = 0  # sum of the per-batch loss values over the epoch
    correct_train, total_train = 0, 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # view(-1) keeps the predictions 1-D even for a batch of one sample;
        # a bare squeeze() would collapse that case to a 0-d tensor.
        preds = (torch.sigmoid(outputs) > 0.5).long().view(-1)
        correct_train += (preds == batch_y).sum().item()
        total_train += batch_y.size(0)

    train_acc = 100 * correct_train / total_train
    train_losses.append(total_loss)
    train_accuracies.append(train_acc)

    # --- Evaluation on Test Set ---
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            # 1-D predictions are required here: list.extend() cannot iterate
            # the 0-d array that a squeezed single-sample batch would produce.
            preds = (torch.sigmoid(outputs) > 0.5).long().view(-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_y.cpu().numpy())

    test_acc = accuracy_score(all_labels, all_preds) * 100
    test_accuracies.append(test_acc)

    # Optional: F1, Precision, Recall per epoch (print or store)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    print(f"Epoch {epoch}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%, F1: {f1:.2f}")




# Plot Train Loss and Accuracy
# Epochs are numbered from 1 in the training log, so the x axis uses the same
# numbering instead of the 0-based list positions.
epochs = range(1, len(train_losses) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(epochs, train_losses, label="Train Loss")
loss_ax.set_title("Train Loss per Epoch")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()

acc_ax.plot(epochs, train_accuracies, label="Train Acc")
acc_ax.plot(epochs, test_accuracies, label="Test Acc")
acc_ax.set_title("Accuracy per Epoch")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy (%)")
acc_ax.legend()

fig.tight_layout()
plt.show()
