**Preprocesssing**

In [1]:
import re
import numpy as np
import gensim.downloader as api
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

word2vec_model = api.load("word2vec-google-news-300")



In [2]:
import pandas as pd
file_path = 'Combined Data.csv'
df = pd.read_csv(file_path)
def preprocess_and_tokenize(text):
    if not isinstance(text, str):
        return []

    text = re.sub(r"[^A-Za-z0-9\s]", "", text.lower())
    tokens = simple_preprocess(text, deacc=True)
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    return tokens

df['tokens'] = df['statement'].apply(preprocess_and_tokenize)
def get_average_word2vec(tokens, model, vector_size=300):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

df['word2vec_vector'] = df['tokens'].apply(lambda x: get_average_word2vec(x, word2vec_model))


df['status'] = df['status'].apply(lambda x: 0 if x == 'Normal' else 1)

df['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
1,36692
0,16351


**BASELINE MODEL**

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np

X = np.stack(df['word2vec_vector'].values)
y = df['status'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

X_train = X_train.unsqueeze(1)
X_test = X_test.unsqueeze(1)

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(F.relu(self.fc1(out[:, -1, :])))
        out = torch.sigmoid(self.fc2(out))
        return out

input_size = X_train.shape[2]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        predicted = (outputs > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        running_loss += loss.item()

    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%")

model.eval()
correct = 0
total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)

        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_true, y_pred))
roc_auc = roc_auc_score(y_true, y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")


Epoch [1/10], Loss: 0.3525, Accuracy: 86.07%
Epoch [2/10], Loss: 0.2756, Accuracy: 89.58%
Epoch [3/10], Loss: 0.2440, Accuracy: 90.70%
Epoch [4/10], Loss: 0.2238, Accuracy: 91.40%
Epoch [5/10], Loss: 0.2120, Accuracy: 91.92%
Epoch [6/10], Loss: 0.2038, Accuracy: 92.16%
Epoch [7/10], Loss: 0.1956, Accuracy: 92.43%
Epoch [8/10], Loss: 0.1924, Accuracy: 92.52%
Epoch [9/10], Loss: 0.1848, Accuracy: 92.72%
Epoch [10/10], Loss: 0.1799, Accuracy: 92.93%
Test Accuracy: 92.69%
              precision    recall  f1-score   support

         0.0       0.91      0.86      0.88      3327
         1.0       0.94      0.96      0.95      7282

    accuracy                           0.93     10609
   macro avg       0.92      0.91      0.91     10609
weighted avg       0.93      0.93      0.93     10609

ROC-AUC: 0.9079


**FOCAL LOSS**

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np

X = np.stack(df['word2vec_vector'].values)
y = df['status'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

X_train = X_train.unsqueeze(1)
X_test = X_test.unsqueeze(1)

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(F.relu(self.fc1(out[:, -1, :])))
        out = torch.sigmoid(self.fc2(out))
        return out

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        bce_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

input_size = X_train.shape[2]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size).to(device)
criterion = FocalLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        predicted = (outputs > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        running_loss += loss.item()

    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%")

model.eval()
correct = 0
total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)

        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_true, y_pred))
roc_auc = roc_auc_score(y_true, y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")



Epoch [1/10], Loss: 0.0234, Accuracy: 86.01%
Epoch [2/10], Loss: 0.0185, Accuracy: 89.80%
Epoch [3/10], Loss: 0.0164, Accuracy: 90.70%
Epoch [4/10], Loss: 0.0151, Accuracy: 91.27%
Epoch [5/10], Loss: 0.0145, Accuracy: 91.64%
Epoch [6/10], Loss: 0.0141, Accuracy: 91.95%
Epoch [7/10], Loss: 0.0135, Accuracy: 92.17%
Epoch [8/10], Loss: 0.0130, Accuracy: 92.40%
Epoch [9/10], Loss: 0.0126, Accuracy: 92.59%
Epoch [10/10], Loss: 0.0123, Accuracy: 92.76%
Test Accuracy: 92.62%
              precision    recall  f1-score   support

         0.0       0.89      0.87      0.88      3327
         1.0       0.94      0.95      0.95      7282

    accuracy                           0.93     10609
   macro avg       0.92      0.91      0.91     10609
weighted avg       0.93      0.93      0.93     10609

ROC-AUC: 0.9119


**WEIGHTED BCE**

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np

X = np.stack(df['word2vec_vector'].values)
y = df['status'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

X_train = X_train.unsqueeze(1)
X_test = X_test.unsqueeze(1)

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(F.relu(self.fc1(out[:, -1, :])))
        out = torch.sigmoid(self.fc2(out))
        return out

class WeightedBCELoss(nn.Module):
    def __init__(self, pos_weight):
        super(WeightedBCELoss, self).__init__()
        self.pos_weight = torch.tensor(pos_weight, dtype=torch.float32)

    def forward(self, inputs, targets):
        return F.binary_cross_entropy(inputs, targets, weight=self.pos_weight.to(inputs.device))

pos_weight = (y_tensor == 0).sum().float() / (y_tensor == 1).sum().float()
criterion = WeightedBCELoss(pos_weight)

input_size = X_train.shape[2]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        predicted = (outputs > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
        running_loss += loss.item()

    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%")

model.eval()
correct = 0
total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)

        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_true, y_pred))
roc_auc = roc_auc_score(y_true, y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")


  self.pos_weight = torch.tensor(pos_weight, dtype=torch.float32)


Epoch [1/10], Loss: 0.1584, Accuracy: 86.02%
Epoch [2/10], Loss: 0.1233, Accuracy: 89.55%
Epoch [3/10], Loss: 0.1088, Accuracy: 90.60%
Epoch [4/10], Loss: 0.1003, Accuracy: 91.32%
Epoch [5/10], Loss: 0.0957, Accuracy: 91.73%
Epoch [6/10], Loss: 0.0911, Accuracy: 92.05%
Epoch [7/10], Loss: 0.0881, Accuracy: 92.30%
Epoch [8/10], Loss: 0.0845, Accuracy: 92.65%
Epoch [9/10], Loss: 0.0830, Accuracy: 92.63%
Epoch [10/10], Loss: 0.0807, Accuracy: 92.92%
Test Accuracy: 92.71%
              precision    recall  f1-score   support

         0.0       0.88      0.89      0.88      3327
         1.0       0.95      0.94      0.95      7282

    accuracy                           0.93     10609
   macro avg       0.91      0.92      0.92     10609
weighted avg       0.93      0.93      0.93     10609

ROC-AUC: 0.9178


**Batch Size Variation**

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np

X = np.stack(df['word2vec_vector'].values)
y = df['status'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

X_train = X_train.unsqueeze(1)
X_test = X_test.unsqueeze(1)

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

batch_sizes = [32, 64, 256, 1024, 3200]
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(F.relu(self.fc1(out[:, -1, :])))
        out = torch.sigmoid(self.fc2(out))
        return out

class WeightedBCELoss(nn.Module):
    def __init__(self, pos_weight):
        super(WeightedBCELoss, self).__init__()
        self.pos_weight = torch.tensor(pos_weight, dtype=torch.float32)

    def forward(self, inputs, targets):
        return F.binary_cross_entropy(inputs, targets, weight=self.pos_weight.to(inputs.device))

pos_weight = (y_tensor == 0).sum().float() / (y_tensor == 1).sum().float()
criterion = WeightedBCELoss(pos_weight)

for batch_size in batch_sizes:
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    model = LSTMModel(input_size=X_train.shape[2]).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            running_loss += loss.item()

        accuracy = 100 * correct / total
        print(f"Batch Size: {batch_size}, Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%")

    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0
    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device).unsqueeze(1)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    test_accuracy = 100 * correct / total
    print(f"Batch Size: {batch_size}, Test Loss: {test_loss/len(test_loader):.4f}, Test Accuracy: {test_accuracy:.2f}%")

from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_true, y_pred))
roc_auc = roc_auc_score(y_true, y_pred)
print(f"ROC-AUC: {roc_auc:.4f}")


  self.pos_weight = torch.tensor(pos_weight, dtype=torch.float32)


Batch Size: 32, Epoch [1/10], Loss: 0.1589, Accuracy: 85.73%
Batch Size: 32, Epoch [2/10], Loss: 0.1236, Accuracy: 89.57%
Batch Size: 32, Epoch [3/10], Loss: 0.1080, Accuracy: 90.63%
Batch Size: 32, Epoch [4/10], Loss: 0.1001, Accuracy: 91.47%
Batch Size: 32, Epoch [5/10], Loss: 0.0954, Accuracy: 91.83%
Batch Size: 32, Epoch [6/10], Loss: 0.0906, Accuracy: 92.24%
Batch Size: 32, Epoch [7/10], Loss: 0.0878, Accuracy: 92.33%
Batch Size: 32, Epoch [8/10], Loss: 0.0856, Accuracy: 92.61%
Batch Size: 32, Epoch [9/10], Loss: 0.0835, Accuracy: 92.83%
Batch Size: 32, Epoch [10/10], Loss: 0.0804, Accuracy: 92.92%
Batch Size: 32, Test Loss: 0.0816, Test Accuracy: 92.72%
Batch Size: 64, Epoch [1/10], Loss: 0.1667, Accuracy: 84.61%
Batch Size: 64, Epoch [2/10], Loss: 0.1295, Accuracy: 89.07%
Batch Size: 64, Epoch [3/10], Loss: 0.1143, Accuracy: 90.22%
Batch Size: 64, Epoch [4/10], Loss: 0.1058, Accuracy: 90.92%
Batch Size: 64, Epoch [5/10], Loss: 0.0986, Accuracy: 91.44%
Batch Size: 64, Epoch [6/10