In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# ----------------------
# MLP Classifier with TF-IDF
# ----------------------
mlp_tfidf = MLPClassifier(
    hidden_layer_sizes=(100,),   # 1 hidden layer with 100 neurons
    activation='relu',
    alpha=1.5,                   # L2 regularization
    learning_rate='adaptive',
    early_stopping=True,
    max_iter=200,
    random_state=42
)

mlp_tfidf.fit(X_train_tfidf, y_train)

train_acc_tfidf = accuracy_score(y_train, mlp_tfidf.predict(X_train_tfidf))
test_acc_tfidf = accuracy_score(y_test, mlp_tfidf.predict(X_test_tfidf))

print("MLP Classifier with TF-IDF")
print("Training Accuracy:", round(train_acc_tfidf * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_tfidf * 100, 2), "%")

# ----------------------
# MLP Classifier with Count Vectorizer
# ----------------------
mlp_count = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    alpha=1.5,
    learning_rate='adaptive',
    early_stopping=True,
    max_iter=200,
    random_state=42
)

mlp_count.fit(X_train_count, y_train)

train_acc_count = accuracy_score(y_train, mlp_count.predict(X_train_count))
test_acc_count = accuracy_score(y_test, mlp_count.predict(X_test_count))

print("\nMLP Classifier with Count Vectorizer")
print("Training Accuracy:", round(train_acc_count * 100, 2), "%")
print("Testing Accuracy:", round(test_acc_count * 100, 2), "%")


MLP Classifier with TF-IDF
Training Accuracy: 74.5 %
Testing Accuracy: 74.47 %

MLP Classifier with Count Vectorizer
Training Accuracy: 92.44 %
Testing Accuracy: 86.63 %


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Transform
X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_train_tfidf_res, y_train_tfidf_res = smote.fit_resample(X_train_tfidf, y_train)
X_train_count_res, y_train_count_res = smote.fit_resample(X_train_count, y_train)

# ----------------------
# MLP Classifier with TF-IDF + SMOTE
# ----------------------
mlp_tfidf = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    alpha=1.5,
    learning_rate='adaptive',
    early_stopping=True,
    max_iter=200,
    random_state=42
)

mlp_tfidf.fit(X_train_tfidf_res, y_train_tfidf_res)

y_pred_train_tfidf = mlp_tfidf.predict(X_train_tfidf_res)
y_pred_test_tfidf = mlp_tfidf.predict(X_test_tfidf)

print("MLP Classifier with TF-IDF + SMOTE")
print("Training Accuracy:", round(accuracy_score(y_train_tfidf_res, y_pred_train_tfidf) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_test_tfidf) * 100, 2), "%")
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test_tfidf))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_test_tfidf))

# ----------------------
# MLP Classifier with Count Vectorizer + SMOTE
# ----------------------
mlp_count = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    alpha=1.5,
    learning_rate='adaptive',
    early_stopping=True,
    max_iter=200,
    random_state=42
)

mlp_count.fit(X_train_count_res, y_train_count_res)

y_pred_train_count = mlp_count.predict(X_train_count_res)
y_pred_test_count = mlp_count.predict(X_test_count)

print("\nMLP Classifier with Count Vectorizer + SMOTE")
print("Training Accuracy:", round(accuracy_score(y_train_count_res, y_pred_train_count) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_test_count) * 100, 2), "%")
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test_count))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_test_count))



MLP Classifier with TF-IDF + SMOTE
Training Accuracy: 88.78 %
Testing Accuracy: 82.41 %
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.92      0.83      0.88      1482
           1       0.62      0.80      0.70       508

    accuracy                           0.82      1990
   macro avg       0.77      0.82      0.79      1990
weighted avg       0.85      0.82      0.83      1990

Confusion Matrix (Test):
 [[1235  247]
 [ 103  405]]

MLP Classifier with Count Vectorizer + SMOTE
Training Accuracy: 90.45 %
Testing Accuracy: 80.5 %
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.91      0.82      0.86      1482
           1       0.59      0.75      0.66       508

    accuracy                           0.81      1990
   macro avg       0.75      0.79      0.76      1990
weighted avg       0.83      0.81      0.81      1990

Confusion Matrix (Test):
 [[1222  260]
 [ 128

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import numpy as np

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)

# ----------------------
# Tuned MLP Classifier with TF-IDF + SMOTE
# ----------------------
mlp_tfidf = MLPClassifier(
    hidden_layer_sizes=(150, 50),
    activation='relu',
    alpha=1.0,
    learning_rate='adaptive',
    early_stopping=True,
    max_iter=400,
    random_state=42
)

mlp_tfidf.fit(X_train_res, y_train_res)

# Predict probabilities and apply threshold adjustment
y_proba_test = mlp_tfidf.predict_proba(X_test_tfidf)[:,1]
threshold = 0.4  # Lower threshold to increase recall of minority class
y_pred_test = (y_proba_test >= threshold).astype(int)

# Training predictions
y_pred_train = mlp_tfidf.predict(X_train_res)

# ----------------------
# Evaluation
# ----------------------
print("Tuned MLP Classifier with TF-IDF + SMOTE")
print("Training Accuracy:", round(accuracy_score(y_train_res, y_pred_train) * 100, 2), "%")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_test) * 100, 2), "%")
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_test))


Tuned MLP Classifier with TF-IDF + SMOTE
Training Accuracy: 98.2 %
Testing Accuracy: 88.24 %
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.96      0.88      0.92      1482
           1       0.72      0.89      0.79       508

    accuracy                           0.88      1990
   macro avg       0.84      0.88      0.86      1990
weighted avg       0.90      0.88      0.89      1990

Confusion Matrix (Test):
 [[1306  176]
 [  58  450]]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import numpy as np

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data/Binned/ankasa.csv")

# Extract features and labels
texts = df['content'].astype(str)
labels = df['label']

# Vectorizers
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

X_tfidf = tfidf_vectorizer.fit_transform(texts)
X_count = count_vectorizer.fit_transform(texts)

# Train-test split (80/20)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.2, random_state=42, stratify=labels
)
X_train_count, X_test_count, _, _ = train_test_split(
    X_count, labels, test_size=0.2, random_state=42, stratify=labels
)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_tfidf_res, y_train_tfidf_res = smote.fit_resample(X_train_tfidf, y_train)
X_train_count_res, y_train_count_res = smote.fit_resample(X_train_count, y_train)

# ----------------------
# TF-IDF MLP
# ----------------------
mlp_tfidf = MLPClassifier(
    hidden_layer_sizes=(150, 50),
    activation='relu',
    alpha=1.0,
    learning_rate='adaptive',
    early_stopping=True,
    max_iter=400,
    random_state=42
)
mlp_tfidf.fit(X_train_tfidf_res, y_train_tfidf_res)

# ----------------------
# Count Vectorizer MLP
# ----------------------
mlp_count = MLPClassifier(
    hidden_layer_sizes=(150, 50),
    activation='relu',
    alpha=1.0,
    learning_rate='adaptive',
    early_stopping=True,
    max_iter=400,
    random_state=42
)
mlp_count.fit(X_train_count_res, y_train_count_res)

# ----------------------
# Ensemble Prediction (Soft Voting)
# ----------------------
# Get predicted probabilities
proba_tfidf = mlp_tfidf.predict_proba(X_test_tfidf)[:,1]
proba_count = mlp_count.predict_proba(X_test_count)[:,1]

# Weighted average of probabilities (TF-IDF slightly higher weight)
ensemble_proba = (0.6 * proba_tfidf + 0.4 * proba_count)

# Threshold adjustment for minority class
threshold = 0.4
y_pred_ensemble = (ensemble_proba >= threshold).astype(int)

# ----------------------
# Evaluation
# ----------------------
print("Ensemble MLP (TF-IDF + Count Vectorizer + SMOTE)")
print("Testing Accuracy:", round(accuracy_score(y_test, y_pred_ensemble) * 100, 2), "%")
print("Classification Report (Test):\n", classification_report(y_test, y_pred_ensemble))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_pred_ensemble))


Ensemble MLP (TF-IDF + Count Vectorizer + SMOTE)
Testing Accuracy: 89.35 %
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.95      0.90      0.93      1482
           1       0.75      0.87      0.81       508

    accuracy                           0.89      1990
   macro avg       0.85      0.89      0.87      1990
weighted avg       0.90      0.89      0.90      1990

Confusion Matrix (Test):
 [[1337  145]
 [  67  441]]


In [None]:
import os
import joblib

# Define folder and file path
folder_path = "/content/drive/MyDrive/Colab Notebooks/Checkpoints/MLP"
save_path = os.path.join(folder_path, "Ensemble_MLP.pkl")

# Create folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save the models and vectorizers
joblib.dump({
    'mlp_tfidf': mlp_tfidf,
    'mlp_count': mlp_count,
    'tfidf_vectorizer': tfidf_vectorizer,
    'count_vectorizer': count_vectorizer
}, save_path)

print(f"Ensemble MLP model and vectorizers saved to: {save_path}")


Ensemble MLP model and vectorizers saved to: /content/drive/MyDrive/Colab Notebooks/Checkpoints/MLP/Ensemble_MLP.pkl


In [1]:
!cd "Colab Notebooks"

/bin/bash: line 1: cd: Colab Notebooks: No such file or directory
