In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
cd drive

[Errno 2] No such file or directory: 'drive'
/content/drive/My Drive/NLP


In [8]:
cd My \Drive

[Errno 2] No such file or directory: 'My Drive'
/content/drive/My Drive/NLP


In [9]:
cd NLP/

[Errno 2] No such file or directory: 'NLP/'
/content/drive/My Drive/NLP


In [10]:
# Load libraries
import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('./Data/KaggleData.csv')

# Convert to lowercase, remove punctuation, extra spaces, URLs, mentions, and hashtags
df['tweet'] = df['tweet'].str.lower().replace(r'[^\w\s]', '', regex=True).replace(' {2,}', ' ', regex=True).replace('"', '')
df['tweet'] = df['tweet'].replace(r'http\S+|www.\S+|@\w+|#\w+', '', regex=True)

# Tokenization
nltk.download('punkt')
df['tweet'] = df['tweet'].apply(nltk.word_tokenize)

# Lemmatization
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['tweet'] = df['tweet'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Removing stop-words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x if word not in stop_words]))

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Encode the labels
# 0 - hate speech, 1 - offensive language, 2 - neither as positive or negative
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class'])

# Splitting the Data using Stratified split
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], y, test_size=0.3, stratify=y, random_state=42)

# Tokenize and pad the input sequences
def tokenize_and_pad(texts, maxlen=100):
    tokenized_texts = [nltk.word_tokenize(text) for text in texts]
    return pad_sequence([torch.tensor([word_to_index[word] for word in text if word in word_to_index][:maxlen]) for text in tokenized_texts], batch_first=True, padding_value=len(word_to_index))

word_to_index = {word: i for i, word in enumerate(set(df['tweet'].str.cat(sep=' ').split()), 1)}
X_train = tokenize_and_pad(X_train)
X_test = tokenize_and_pad(X_test)

# Create PyTorch Datasets and DataLoaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Create a PyTorch LSTM model
class LSTMBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        packed_output, (hidden, cell) = self.lstm(x)
        x = self.fc(hidden[-1])
        return x

# Initialize the model, optimizer, and loss function
model = LSTMBaseline(len(word_to_index) + 1, 50, 100, len(set(y)))
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
    
# Train the model
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")

    # Save Model
    torch.save(model, './Weights/KaggleLSTM.pth')

# Test the model and collect predictions and true labels
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.numpy())
        true_labels.extend(labels.numpy())

# Calculate accuracy, precision, recall, F1-score, and confusion matrix
accuracy = np.mean(np.array(predictions) == np.array(true_labels))
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
conf_mat = confusion_matrix(true_labels, predictions)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1_score)
print("Confusion Matrix:\n", conf_mat)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10, Loss: 0.6682774611160461
Epoch 2/10, Loss: 0.6648849582035458
Epoch 3/10, Loss: 0.4339797409391974
Epoch 4/10, Loss: 0.34845726736040844
Epoch 5/10, Loss: 0.30326250335458893
Epoch 6/10, Loss: 0.26482036016181687
Epoch 7/10, Loss: 0.2358285127341418
Epoch 8/10, Loss: 0.20508714285413315
Epoch 9/10, Loss: 0.18264081944500543
Epoch 10/10, Loss: 0.1611840561234682
Accuracy:  0.8664425016812374
Precision:  0.8530193150878194
Recall:  0.8664425016812374
F1-score:  0.8577859260533811
Confusion Matrix:
 [[  91  281   57]
 [ 100 5398  259]
 [  33  263  953]]
