**PREPROCESSING**

In [1]:
import re
import numpy as np
import gensim.downloader as api
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

word2vec_model = api.load("word2vec-google-news-300")

import pandas as pd
file_path = 'Combined Data.csv'
df = pd.read_csv(file_path)
def preprocess_and_tokenize(text):
    if not isinstance(text, str):
        return []

    text = re.sub(r"[^A-Za-z0-9\s]", "", text.lower())
    tokens = simple_preprocess(text, deacc=True)
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    return tokens

df['tokens'] = df['statement'].apply(preprocess_and_tokenize)
def get_average_word2vec(tokens, model, vector_size=300):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

df['word2vec_vector'] = df['tokens'].apply(lambda x: get_average_word2vec(x, word2vec_model))


df['status'] = df['status'].apply(lambda x: 0 if x == 'Normal' else 1)

df['status'].value_counts()



Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
1,22926
0,11324


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import torch
import numpy as np

X = np.stack(df['word2vec_vector'].values)
y = df['status'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)



**LSTM+DT**

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMFeatureExtractor(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.5):
        super(LSTMFeatureExtractor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

class SoftDecisionTree(nn.Module):
    def __init__(self, input_dim, num_classes, num_nodes=10):
        super(SoftDecisionTree, self).__init__()
        self.inner_nodes = nn.Linear(input_dim, num_nodes)
        self.leaf_nodes = nn.Linear(num_nodes, num_classes)

    def forward(self, x):
        decision_weights = torch.sigmoid(self.inner_nodes(x))
        leaf_outputs = self.leaf_nodes(decision_weights)
        return leaf_outputs

class LSTMDecisionTree(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, num_nodes=10):
        super(LSTMDecisionTree, self).__init__()
        self.feature_extractor = LSTMFeatureExtractor(input_dim, hidden_size=hidden_dim)
        self.decision_tree = SoftDecisionTree(hidden_dim, num_classes, num_nodes)

    def forward(self, x):
        features = self.feature_extractor(x)
        output = self.decision_tree(features)
        return output

class CustomDataset(data.Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def train_model(model, train_loader, test_loader, epochs=10, lr=0.001):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

        train_acc = 100 * correct / total
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, Train Accuracy: {train_acc:.2f}%")

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

batch_size = 64
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

input_dim = X_train.shape[1]
num_classes = len(torch.unique(torch.tensor(y_train)))
model = LSTMDecisionTree(input_dim=input_dim, hidden_dim=128, num_classes=num_classes, num_nodes=10)

# Train the model
train_model(model, train_loader, test_loader, epochs=10, lr=0.001)


  self.X = torch.tensor(X, dtype=torch.float32)
  self.y = torch.tensor(y, dtype=torch.long)
  num_classes = len(torch.unique(torch.tensor(y_train)))


Epoch 1, Loss: 0.3881, Train Accuracy: 84.33%
Epoch 2, Loss: 0.2908, Train Accuracy: 89.41%
Epoch 3, Loss: 0.2568, Train Accuracy: 90.67%
Epoch 4, Loss: 0.2328, Train Accuracy: 91.42%
Epoch 5, Loss: 0.2136, Train Accuracy: 92.20%
Epoch 6, Loss: 0.2017, Train Accuracy: 92.50%
Epoch 7, Loss: 0.1938, Train Accuracy: 92.75%
Epoch 8, Loss: 0.1850, Train Accuracy: 93.18%
Epoch 9, Loss: 0.1776, Train Accuracy: 93.24%
Epoch 10, Loss: 0.1740, Train Accuracy: 93.58%
Test Accuracy: 92.55%


**LSTM+SVM**

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMFeatureExtractor(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.5):
        super(LSTMFeatureExtractor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

class SoftSVM(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(SoftSVM, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        return self.fc(x)

class LSTMSVM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(LSTMSVM, self).__init__()
        self.feature_extractor = LSTMFeatureExtractor(input_dim, hidden_dim)
        self.svm = SoftSVM(hidden_dim, num_classes)

    def forward(self, x):
        features = self.feature_extractor(x)
        output = self.svm(features)
        return output

class CustomDataset(data.Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def train_model(model, train_loader, test_loader, epochs=10, lr=0.001):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

        train_acc = 100 * correct / total
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, Train Accuracy: {train_acc:.2f}%")

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()

    print(f"Test Accuracy: {100 * correct / total:.2f}%")

batch_size = 64
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

input_dim = X_train.shape[1]
num_classes = len(torch.unique(y_train))
model = LSTMSVM(input_dim=input_dim, hidden_dim=128, num_classes=num_classes)

train_model(model, train_loader, test_loader, epochs=10, lr=0.001)

  self.X = torch.tensor(X, dtype=torch.float32)
  self.y = torch.tensor(y, dtype=torch.long)


Epoch 1, Loss: 0.3709, Train Accuracy: 85.02%
Epoch 2, Loss: 0.2755, Train Accuracy: 89.68%
Epoch 3, Loss: 0.2411, Train Accuracy: 90.86%
Epoch 4, Loss: 0.2219, Train Accuracy: 91.54%
Epoch 5, Loss: 0.2078, Train Accuracy: 91.94%
Epoch 6, Loss: 0.1981, Train Accuracy: 92.45%
Epoch 7, Loss: 0.1900, Train Accuracy: 92.78%
Epoch 8, Loss: 0.1810, Train Accuracy: 92.95%
Epoch 9, Loss: 0.1754, Train Accuracy: 93.19%
Epoch 10, Loss: 0.1716, Train Accuracy: 93.34%
Test Accuracy: 92.16%


**LSTM + RF**

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
from sklearn.ensemble import RandomForestClassifier
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMFeatureExtractor(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.5):
        super(LSTMFeatureExtractor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out.detach().cpu().numpy()

class CustomDataset(data.Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def extract_features(model, data_loader):
    model.to(device)
    model.eval()
    features, labels = [], []

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch = X_batch.to(device)
            batch_features = model(X_batch)
            features.append(batch_features)
            labels.append(y_batch.numpy())

    return np.vstack(features), np.concatenate(labels)

batch_size = 64
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

input_dim = X_train.shape[1]
lstm_model = LSTMFeatureExtractor(input_dim, hidden_size=128)

X_train_feats, y_train_feats = extract_features(lstm_model, train_loader)
X_test_feats, y_test_feats = extract_features(lstm_model, test_loader)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_feats, y_train_feats)

y_pred = rf_model.predict(X_test_feats)
accuracy = np.mean(y_pred == y_test_feats) * 100
print(f"Random Forest Test Accuracy: {accuracy:.2f}%")


  self.X = torch.tensor(X, dtype=torch.float32)
  self.y = torch.tensor(y, dtype=torch.long)


Random Forest Test Accuracy: 90.42%
