**PREPROCESSING**

In [1]:
import re
import numpy as np
import gensim.downloader as api
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

word2vec_model = api.load("word2vec-google-news-300")

import pandas as pd
file_path = 'Combined Data.csv'
df = pd.read_csv(file_path)
def preprocess_and_tokenize(text):
    if not isinstance(text, str):
        return []

    text = re.sub(r"[^A-Za-z0-9\s]", "", text.lower())
    tokens = simple_preprocess(text, deacc=True)
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    return tokens

df['tokens'] = df['statement'].apply(preprocess_and_tokenize)
def get_average_word2vec(tokens, model, vector_size=300):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

df['word2vec_vector'] = df['tokens'].apply(lambda x: get_average_word2vec(x, word2vec_model))


df['status'] = df['status'].apply(lambda x: 0 if x == 'Normal' else 1)

df['status'].value_counts()



Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
1,36692
0,16351


**ENSEMBLE TECHNIQUES**

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import torch
import numpy as np

X = np.stack(df['word2vec_vector'].values)
y = df['status'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

class TransformerFeatureExtractor(nn.Module):
    def __init__(self, input_size, d_model=128, num_heads=4, num_layers=2):
        super(TransformerFeatureExtractor, self).__init__()
        self.embedding = nn.Linear(input_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dim_feedforward=256, dropout=0.1, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = TransformerFeatureExtractor(input_size=X_train.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, _ in train_loader:
        inputs = inputs.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, outputs)
        loss.backward()
        optimizer.step()

model.eval()
train_features = []
train_labels = []
test_features = []
test_labels = []

with torch.no_grad():
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        features = model(inputs).cpu().numpy()
        train_features.extend(features)
        train_labels.extend(labels.numpy())

    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        features = model(inputs).cpu().numpy()
        test_features.extend(features)
        test_labels.extend(labels.numpy())

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(train_features, train_labels)
svm_preds = svm_classifier.predict(test_features)
svm_acc = accuracy_score(test_labels, svm_preds)
print(f"SVM Accuracy: {svm_acc:.2f}")

dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(train_features, train_labels)
dt_preds = dt_classifier.predict(test_features)
dt_acc = accuracy_score(test_labels, dt_preds)
print(f"Decision Tree Accuracy: {dt_acc:.2f}")

rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(train_features, train_labels)
rf_preds = rf_classifier.predict(test_features)
rf_acc = accuracy_score(test_labels, rf_preds)
print(f"Random Forest Accuracy: {rf_acc:.2f}")


SVM Accuracy: 0.90
Decision Tree Accuracy: 0.83
Random Forest Accuracy: 0.90
