In [2]:
!pip install -q transformers datasets xgboost scikit-learn tensorflow


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional, Input
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel
import seaborn as sns
import matplotlib.pyplot as plt
import gc


In [None]:
# Mount Google Drive at /content/drive. The dataset read below and every artifact
# saved later in this notebook live under '/content/drive/My Drive/thesis'.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import pandas as pd

# Location of the labelled corpus on the mounted Drive.
file_path = '/content/drive/My Drive/thesis/thesis 2.csv'
df = pd.read_csv(file_path)

# Keep only the text/label pair and drop rows where either is missing.
# This is chained rather than in-place: dropna(inplace=True) on a column subset
# mutates a derived frame and triggers pandas' SettingWithCopyWarning.
df = df[['Text', 'Label']].dropna()
print("Dataset shape:", df.shape)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def encode_texts(texts, max_len=128):
    """Tokenize a batch of texts with the module-level BERT tokenizer.

    Args:
        texts: Iterable of texts; it is materialised into a list before tokenizing.
        max_len: Maximum token length; longer inputs are truncated to this size.

    Returns:
        The tokenizer output with TensorFlow tensors (``return_tensors="tf"``),
        padded to the longest sequence in the batch (``padding=True``).
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="tf",
    )
    text_batch = list(texts)
    return tokenizer(text_batch, **tokenizer_options)

In [None]:
def get_bert_embeddings(model, texts, batch_size=32, max_len=128):
    """Compute one embedding vector per text by running ``model`` in batches.

    Each batch is tokenized with ``encode_texts`` and passed through ``model``;
    the hidden state at position 0 (the CLS token) is kept for every text.

    Args:
        model: Callable encoder accepting ``input_ids`` and ``attention_mask`` and
            returning an object with a ``last_hidden_state`` tensor.
        texts: Sliceable sequence of texts (list, NumPy array, ...).
        batch_size: Number of texts encoded per forward pass; must be positive.
        max_len: Maximum token length forwarded to ``encode_texts``.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input an array of
        shape ``(0, hidden_size)`` is returned, where ``hidden_size`` is read from
        ``model.config.hidden_size`` when available and falls back to 0.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        encoded = encode_texts(batch_texts, max_len)
        batch_output = model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
        batch_embeddings = batch_output.last_hidden_state[:, 0, :].numpy()  # CLS token
        embeddings.append(batch_embeddings)
        # Release the per-batch tensors before the next forward pass to cap peak memory.
        del encoded, batch_output
        gc.collect()

    if not embeddings:
        # np.vstack([]) raises on an empty list; return a well-formed empty matrix instead.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.vstack(embeddings)


In [None]:
# Targets are taken straight from the label column; the features are the CLS
# embeddings produced by the pretrained (not fine-tuned here) BERT encoder.
y = df['Label'].values
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
X = get_bert_embeddings(
    bert_model,
    df['Text'].values,
    batch_size=32,
)

In [None]:
# Drop the notebook-level reference to the BERT encoder now that the embeddings
# are stored in X, then run a garbage-collection pass so its memory can be reclaimed.
# NOTE(review): re-running this cell on its own raises NameError, because
# bert_model no longer exists after the first run.
del bert_model
gc.collect()

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified, so class proportions may differ between
# the train and test parts — confirm whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Balanced class weights, computed from the training labels only.
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Key the lookup by the actual label values, in the same sorted order that was passed
# as `classes`. Later cells index this dict with raw labels from y_train, so positional
# keys from enumerate() were only correct when the labels happened to be exactly 0..k-1.
class_weight_dict = dict(zip(np.unique(y_train).tolist(), class_weights))


In [None]:
# Model 1: CNN + BERT + XGBoost
# Seed Python, NumPy and TensorFlow before building the network. The layers below are
# randomly initialised and no training step is applied to them in the cells shown here,
# so without a seed the extracted features (and every downstream score) change on each run.
# Side effect: this also reseeds the global `random` and `numpy.random` generators.
tf.keras.utils.set_random_seed(42)

# Each embedding vector is fed as a sequence of X_train.shape[1] steps with one channel.
input_layer = Input(shape=(X_train.shape[1], 1))
x = Conv1D(128, kernel_size=3, activation='relu')(input_layer)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# The model's output is the 128-unit dense activation; it has no classification head.
cnn_model = Model(inputs=input_layer, outputs=x)

In [None]:
# Prepare CNN input
# Append a trailing axis of size 1 so the 2-D embedding matrices match the
# (steps, 1) input shape declared for the network.
X_train_cnn = X_train[:, :, np.newaxis]
X_test_cnn = X_test[:, :, np.newaxis]

# Forward passes only: the network output becomes the feature matrix for XGBoost.
cnn_features_train = cnn_model.predict(
    X_train_cnn,
    batch_size=32,
)
cnn_features_test = cnn_model.predict(
    X_test_cnn,
    batch_size=32,
)

In [None]:
# Hyperparameter tuning for XGBoost
# Search grid: 3 depths x 2 learning rates x 2 ensemble sizes = 12 candidates,
# each evaluated with 3-fold cross-validation.
xgb_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost and
# recent releases only emit an "unused parameter" warning for it on every fit.
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')
# NOTE(review): candidates are ranked by plain accuracy although the fit is
# class-weighted — confirm this is the intended selection metric for imbalanced labels.
grid_search = GridSearchCV(xgb_model, xgb_params, cv=3, scoring='accuracy', n_jobs=-1)
# One weight per training sample, looked up from its label, passed as an ndarray
# so it can be sliced per CV fold alongside the features.
grid_search.fit(
    cnn_features_train,
    y_train,
    sample_weight=np.array([class_weight_dict[label] for label in y_train]),
)

In [None]:
# Best XGBoost model
# Take the estimator for the winning parameter combination and use it to
# label the held-out CNN features.
xgb_model_cnn = grid_search.best_estimator_
y_pred_cnn = xgb_model_cnn.predict(cnn_features_test)
print("Best CNN + XGBoost Params:", grid_search.best_params_)

In [None]:
# Save CNN models
# Persist both stages of the pipeline to Drive: the tuned XGBoost classifier as a
# .json file and the Keras feature extractor as an .h5 file.
# NOTE(review): cnn_model is not compiled or fit in the cells shown here, so the
# saved weights are the ones it was initialised with.
xgb_model_cnn.save_model('/content/drive/My Drive/thesis/cnn_xgboost_model.json')
cnn_model.save('/content/drive/My Drive/thesis/cnn_model.h5')

In [None]:
# Model 2: BiLSTM + BERT + XGBoost
# Seed Python, NumPy and TensorFlow before building the network. The layers below are
# randomly initialised and no training step is applied to them in the cells shown here,
# so without a seed the extracted features (and every downstream score) change on each run.
# Side effect: this also reseeds the global `random` and `numpy.random` generators.
tf.keras.utils.set_random_seed(42)

# Each embedding vector is fed as a sequence of X_train.shape[1] steps with one feature.
input_layer = Input(shape=(X_train.shape[1], 1))
# return_sequences=False keeps only the final state of each direction.
x = Bidirectional(LSTM(128, return_sequences=False))(input_layer)
x = Dense(128, activation='relu')(x)
lstm_model = Model(inputs=input_layer, outputs=x)

# X_train_cnn / X_test_cnn are the embeddings with a trailing size-1 axis built for the
# CNN branch; the same arrays are reused here because the input shape is identical.
lstm_features_train = lstm_model.predict(X_train_cnn, batch_size=32)
lstm_features_test = lstm_model.predict(X_test_cnn, batch_size=32)


In [None]:
# Hyperparameter tuning for XGBoost
# Reuses the `xgb_params` grid defined in the CNN tuning cell.
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost and
# recent releases only emit an "unused parameter" warning for it on every fit.
xgb_model_lstm = xgb.XGBClassifier(eval_metric='mlogloss')
grid_search_lstm = GridSearchCV(xgb_model_lstm, xgb_params, cv=3, scoring='accuracy', n_jobs=-1)
# One weight per training sample, looked up from its label, passed as an ndarray
# so it can be sliced per CV fold alongside the features.
grid_search_lstm.fit(
    lstm_features_train,
    y_train,
    sample_weight=np.array([class_weight_dict[label] for label in y_train]),
)

In [None]:
# Best XGBoost model
# Rebind xgb_model_lstm from the unfitted template to the estimator for the winning
# parameter combination, then label the held-out BiLSTM features with it.
xgb_model_lstm = grid_search_lstm.best_estimator_
y_pred_lstm = xgb_model_lstm.predict(lstm_features_test)
print("Best BiLSTM + XGBoost Params:", grid_search_lstm.best_params_)

In [None]:
# Save BiLSTM models
# Persist both stages of the pipeline to Drive: the tuned XGBoost classifier as a
# .json file and the Keras feature extractor as an .h5 file.
# NOTE(review): lstm_model is not compiled or fit in the cells shown here, so the
# saved weights are the ones it was initialised with.
xgb_model_lstm.save_model('/content/drive/My Drive/thesis/lstm_xgboost_model.json')
lstm_model.save('/content/drive/My Drive/thesis/lstm_model.h5')

In [None]:
# Evaluation and visualization function
def evaluate_and_plot(y_true, y_pred, model_name, classes):
    """Print classification metrics for one model and draw its confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Display name used in the printed header and the plot title.
        classes: Label values, in display order. They fix both the row/column order
            of the confusion matrix and the heatmap tick labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1 are
        support-weighted averages over the classes.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 scores a class with no predicted (or no true) samples as 0,
    # which matches the default numeric result but without the warning.
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    # Pin the label order to `classes` so the matrix always has one row/column per
    # tick label, even when a class is absent from both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=classes)

    # The header emoji was mis-encoded (UTF-8 bytes decoded as cp1252); restored here.
    print(f"\n📊 Evaluation for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:\n", cm)

    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.show()

    return accuracy, precision, recall, f1

In [None]:
# Evaluate both models
# Both pipelines are scored on the same held-out labels; the class list comes from
# the full label vector so both confusion matrices share the same axes.
classes = np.unique(y)
cnn_metrics = evaluate_and_plot(
    y_true=y_test,
    y_pred=y_pred_cnn,
    model_name="CNN + BERT + XGBoost",
    classes=classes,
)
lstm_metrics = evaluate_and_plot(
    y_true=y_test,
    y_pred=y_pred_lstm,
    model_name="BiLSTM + BERT + XGBoost",
    classes=classes,
)

In [None]:
# Plot metric comparison
# Grouped bar chart: one group per metric, one bar per pipeline. The metric order
# must match the tuple order returned by evaluate_and_plot.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
# Shift each series by half a bar width so the two bars sit side by side.
ax.bar(x - width/2, cnn_metrics, width, label='CNN + BERT + XGBoost', color='#1f77b4')
ax.bar(x + width/2, lstm_metrics, width, label='BiLSTM + BERT + XGBoost', color='#ff7f0e')
ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Save results to CSV
# One row per metric, one column per pipeline; `metrics` supplies the row labels
# in the same order as the score tuples.
results = pd.DataFrame(
    {
        'Metric': metrics,
        'CNN + BERT + XGBoost': cnn_metrics,
        'BiLSTM + BERT + XGBoost': lstm_metrics,
    }
)
results.to_csv(
    '/content/drive/My Drive/thesis/model_results.csv',
    index=False,
)
print("Results saved to /content/drive/My Drive/thesis/model_results.csv")