In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import joblib
import matplotlib.pyplot as plt

Loading the data.

In [3]:
# Read the feature table from 'feature_data.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('feature_data.csv')
# Print a heading, then leave the column index as the cell's last expression
# so the notebook renders it as the cell output.
print("Columns in feature_data.csv:")
df.columns

Columns in feature_data.csv:


Index(['Reviewer Name', 'Rating', '00', '000', '00pm', '01', '02', '03', '04',
       '05',
       ...
       'yourselves', 'youtube', 'yr', 'yrs', 'zero', 'zone', '𝐒𝐮𝐩𝐩𝐨𝐫𝐭_',
       '𝒻𝓇𝑒𝑒', '𝕙𝕖𝕝𝕡', '𝕟𝕦𝕞𝕓𝕖𝕣'],
      dtype='object', length=5002)

In [4]:
# Build the binary target: 1 when Rating is 2 or lower, otherwise 0.
# The comparison is vectorized instead of a row-wise `.apply(lambda ...)`;
# missing ratings compare as False and therefore map to 0, exactly as before.
# NOTE(review): the label is derived solely from Rating -- confirm that a low
# rating is the intended definition of a "fake" review.
df['Fake_Review'] = (df['Rating'] <= 2).astype('int64')

# Features are every remaining column. The reviewer name is an identifier,
# and Rating / Fake_Review must be removed because the target is computed
# directly from Rating (keeping either would leak the label).
X = df.drop(columns=['Reviewer Name', 'Fake_Review', 'Rating'])
y = df['Fake_Review']

Splitting the data into training and testing sets.

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable across runs. The split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 16844, Testing samples: 4211


Defining four candidate models for evaluation.

In [6]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are later trained and reported. Every estimator is seeded with
# the same random_state so repeated runs give the same results.
models = dict(
    RandomForest=RandomForestClassifier(random_state=42),
    # max_iter is raised to 1000 for the logistic-regression solver.
    LogisticRegression=LogisticRegression(random_state=42, max_iter=1000),
    SVM=SVC(random_state=42, kernel='linear'),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
)

In [7]:
# Accuracy recorded for every model, keyed by model name.
model_accuracies = dict()

# Running "best so far" state, filled in by the evaluation loop: the winning
# estimator, its display name, and its accuracy.
best_model, best_model_name, best_accuracy = None, '', 0.0

Training and evaluating each model, then selecting the one with the highest accuracy.

In [8]:
# Train each candidate on the training split, report its metrics on the test
# split, and keep track of whichever model has the highest test accuracy.
# NOTE(review): the winner is chosen using the same test split that is
# reported, so the reported best accuracy is likely optimistic.
# (Optional per-model confusion-matrix plots and 5-fold cross-validation were
# previously sketched here as commented-out code; they are not executed.)
for model_name, model in models.items():
    print(f"\nRunning model: {model_name}")

    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Per-class precision / recall / F1, then the raw confusion counts.
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Overall accuracy, recorded under the model's name.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    model_accuracies[model_name] = accuracy

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if best_accuracy < accuracy:
        best_accuracy, best_model, best_model_name = accuracy, model, model_name

    print('------------------------------------------------------------------------------------------------------------------------------------------------------------')


Running model: RandomForest
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.72      0.79      1290
           1       0.88      0.96      0.92      2921

    accuracy                           0.89      4211
   macro avg       0.89      0.84      0.86      4211
weighted avg       0.89      0.89      0.88      4211

Confusion Matrix:
[[ 923  367]
 [ 116 2805]]
Accuracy: 0.8853
------------------------------------------------------------------------------------------------------------------------------------------------------------

Running model: LogisticRegression
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.82      0.78      1290
           1       0.92      0.88      0.90      2921

    accuracy                           0.86      4211
   macro avg       0.84      0.85      0.84      4211
weighted avg       0.87      0.86      0.86      4211

Confusion Matrix:

In [9]:
# Persist the winning estimator, if one was selected.
# The check is an explicit `is not None` rather than relying on the object's
# truthiness: Python falls back to `__len__` when an object defines no
# `__bool__`, and scikit-learn ensemble estimators expose a length, so
# `if best_model:` would depend on the estimator's internals instead of on
# whether a model was actually chosen.
if best_model is not None:
    print(f"\nThe best model is {best_model_name} with accuracy: {best_accuracy:.4f}")
    # The file name embeds the model name, e.g. best_pro_rev_ana_model_<name>.pkl
    joblib.dump(best_model, f'best_pro_rev_ana_model_{best_model_name}.pkl')
    print(f"Best model '{best_model_name}' saved for future use.")


The best model is GradientBoosting with accuracy: 0.8910
Best model 'GradientBoosting' saved for future use.
