## Building the Stack Ensemble

Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import warnings, string, nltk, joblib, time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# Silence every Python warning raised from here on.
# NOTE(review): this is a blanket filter, so library deprecation and
# convergence warnings are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')
# Fetch the NLTK corpora used by the stop-word list and the WordNet lemmatizer
# imported just below.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Defining text processing functions

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once, then reuse them."""
    return frozenset(stopwords.words("english")), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a review: lowercase, strip punctuation, drop stop words, lemmatize.

    The stop-word set and the lemmatizer are created on the first call and
    cached, so applying this function to a whole corpus does not reload the
    stop-word list for every row.

    Args:
        text: The raw review string.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    stop_words, lemmatizer = _text_resources()
    # Punctuation is deleted, not replaced by a space, so "don't" becomes "dont".
    cleaned = text.lower().translate(str.maketrans("", "", string.punctuation))
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stop_words
    )

def minimal_preprocess(text):
    """Return ``text`` converted to lowercase, with nothing else changed."""
    lowered = text.lower()
    return lowered

def whitespace_split(text):
    """Tokenize ``text`` on runs of whitespace.

    Case and punctuation are left untouched, so "Great!" and "great" are
    distinct tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-delimited tokens (empty for blank input).
    """
    tokens = text.split()
    return tokens

Loading the Dataset

In [None]:
# Location of the raw reviews CSV (Colab upload directory).
DATA_PATH = "/content/fake reviews dataset.csv"

# Load the reviews, drop rows with no text, and discard any review that
# contains the literal substring "http". The chain ends in .assign(), which
# returns a new frame, so the column is never written onto a filtered slice
# (the pattern that triggers pandas' chained-assignment warning).
# processed_text is a plain copy of the raw text: none of the preprocessing
# helpers is applied at this point.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=["text_"])
    .loc[lambda frame: ~frame["text_"].str.contains("http", regex=False, na=False)]
    .assign(processed_text=lambda frame: frame["text_"])
)


# No sub-sampling happens here: df_sample is just another name for the full frame.
df_sample = df
X, y = df_sample["processed_text"], df_sample["label"]

# Hold out 35% of the rows for testing; stratify on the label and fix the seed
# so the split is reproducible.
split_options = dict(test_size=0.35, random_state=77, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Token counts over a vocabulary capped at 5000 entries, then TF-IDF
# re-weighting. Both transformers are fitted on the training split only and
# merely applied to the test split.
vectorizer = CountVectorizer(analyzer=whitespace_split, max_features=5000)
tfidf_transformer = TfidfTransformer()

X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

import numpy as np
import pandas as pd
import warnings, string, nltk, joblib, time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

Defining and testing the models individually

In [None]:
# The two base learners, each trained and scored on its own below.
lr_clf = LogisticRegression(
    C=4,
    penalty='l2',
    solver="liblinear",
    random_state=42,
)
nb_clf = MultinomialNB()

In [None]:
# Fit logistic regression on the TF-IDF training matrix, then predict the held-out split.
lr_preds = lr_clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Per-class metrics first, overall accuracy last.
lr_accuracy = accuracy_score(y_test, lr_preds)
print("=== Logistic Regression Classification Report ===")
print(classification_report(y_test, lr_preds))
print("Logistic Regression Accuracy: {:.2%}".format(lr_accuracy))


=== Logistic Regression Classification Report ===
              precision    recall  f1-score   support

          CG       0.92      0.92      0.92      7060
          OR       0.92      0.92      0.92      7054

    accuracy                           0.92     14114
   macro avg       0.92      0.92      0.92     14114
weighted avg       0.92      0.92      0.92     14114

Logistic Regression Accuracy: 91.86%


In [None]:
# Fit multinomial naive Bayes on the TF-IDF training matrix, then predict the held-out split.
nb_preds = nb_clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Per-class metrics first, overall accuracy last.
nb_accuracy = accuracy_score(y_test, nb_preds)
print("=== Naive Bayes Classification Report ===")
print(classification_report(y_test, nb_preds))
print("Naive Bayes Accuracy: {:.2%}".format(nb_accuracy))


=== Naive Bayes Classification Report ===
              precision    recall  f1-score   support

          CG       0.88      0.91      0.89      7060
          OR       0.90      0.87      0.89      7054

    accuracy                           0.89     14114
   macro avg       0.89      0.89      0.89     14114
weighted avg       0.89      0.89      0.89     14114

Naive Bayes Accuracy: 89.06%


In [None]:
# Fresh, unfitted base learners for the ensemble, configured exactly like the
# individually evaluated models.
nb_clf = MultinomialNB()
lr_clf = LogisticRegression(C=4, penalty='l2', solver="liblinear", random_state=42)
estimators = [("LogReg", lr_clf), ("NB", nb_clf)]  # Base estimators

# Meta-learner that combines the base learners' outputs.
meta_clf = LogisticRegression(solver="liblinear", random_state=42)

# Build the stacking classifier.
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_clf,
    cv=25,             # 25-fold cross-validation produces the meta-learner's training inputs
    n_jobs=-1,         # use every available core
    passthrough=True,  # meta-learner also receives the original TF-IDF features
)

# Train on the TF-IDF training matrix.
stacking_model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out split.
y_pred = stacking_model.predict(X_test_tfidf)
stack_accuracy = accuracy_score(y_test, y_pred)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))
print("Accuracy: {:.2%}".format(stack_accuracy))


=== Classification Report ===
              precision    recall  f1-score   support

          CG       0.92      0.92      0.92      7060
          OR       0.92      0.92      0.92      7054

    accuracy                           0.92     14114
   macro avg       0.92      0.92      0.92     14114
weighted avg       0.92      0.92      0.92     14114

=== Confusion Matrix ===
[[6526  534]
 [ 573 6481]]
Accuracy: 92.16%


Saving the Stack Model

In [None]:
import os
import shutil

from google.colab import drive

# Colab-only: make Google Drive available under /content/drive.
drive.mount('/content/drive')

destination_folder = "/content/drive/MyDrive/Fake_Review_Model/"

# Create the target folder (and any missing parents) without shelling out;
# an already-existing folder is not an error.
os.makedirs(destination_folder, exist_ok=True)

# The fitted vectorizer, TF-IDF transformer and stacking model are stored as one
# tuple so that inference can reproduce the exact training-time feature pipeline.
# NOTE: the vectorizer holds a reference to whitespace_split, and pickle stores
# functions by name, so whitespace_split must be defined under the same name in
# whatever session loads this file.
# NOTE: joblib files are pickles; only load them from sources you trust.
model_filename = "stack_2_models_optimized.pkl"
joblib.dump((vectorizer, tfidf_transformer, stacking_model), model_filename)

# shutil.copy returns the path of the file it created; report that path rather
# than rebuilding it by string concatenation.
saved_path = shutil.copy(model_filename, destination_folder)

print(f"Model saved to Google Drive at: {saved_path}")


Mounted at /content/drive
✅ Model saved to Google Drive at: /content/drive/MyDrive/Fake_Review_Model/stack_2_models_optimized.pkl
