In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import joblib
import matplotlib.pyplot as plt

Loading the feature data.

In [None]:
# Read the feature table from 'feature_data.csv' (path is relative to the
# notebook's working directory) into the DataFrame used by the later cells.
df = pd.read_csv(filepath_or_buffer='feature_data.csv')

print("Columns in feature_data.csv:")
# Leaving the column Index as the cell's final expression makes the notebook
# display it underneath the printed header.
df.columns

In [None]:
# Build the binary target: 1 when the rating is 2 or lower, otherwise 0.
# A missing rating compares as False and is therefore labelled 0.
# NOTE(review): this treats a low rating as a proxy for a fake review --
# confirm that this is the intended label definition.
df['Fake_Review'] = df['Rating'].le(2).astype('int64')

# Columns that must not be fed to the models: the reviewer's name, the target
# itself, and the rating the target is derived from (it would leak the label).
excluded_columns = ['Reviewer Name', 'Fake_Review', 'Rating']

y = df['Fake_Review']
X = df.drop(columns=excluded_columns)

Splitting the data into training and testing sets.

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The split is stratified on the label so that the share of each class is the
# same in the training and testing partitions; an unstratified split can leave
# the minority class under-represented in one of them.
# Stratification needs at least two rows per class, so fall back to a plain
# shuffled split when that is not the case instead of raising.
stratify_labels = y if y.value_counts().min() >= 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Defining the four models to evaluate.

In [None]:
# Candidate classifiers, keyed by the display name used in the printed reports
# and in the saved model's file name. Every estimator gets the same seed so
# that repeated runs give the same results.
models = dict(
    RandomForest=RandomForestClassifier(random_state=42),
    # max_iter is raised to 1000 to give the solver more iterations to converge.
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    SVM=SVC(kernel='linear', random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
)

In [None]:
# Bookkeeping for the model comparison:
#   model_accuracies -- maps each model name to its test-set accuracy
#   best_model       -- the best-scoring fitted estimator (None until one is chosen)
#   best_model_name  -- the name of that estimator
#   best_accuracy    -- the score a model has to beat to become the best one
model_accuracies = dict()
best_model, best_model_name, best_accuracy = None, '', 0.0

Training and evaluating each model, then selecting the one with the highest accuracy.

In [None]:
# Reset the selection state at the top of this cell. Without this, re-running
# the cell on its own would compare the new scores against the best accuracy
# left over from a previous run, and could report a model name together with a
# score that no longer belongs to it.
model_accuracies = {}
best_model = None
best_accuracy = 0.0
best_model_name = ''

# Fit every candidate on the training split, report its metrics on the test
# split, and remember the one with the highest test accuracy.
# NOTE(review): the winner is chosen using the same test split that is used to
# report its accuracy, so the reported figure is likely optimistic. Consider
# selecting on a validation split or on cross-validation scores instead.
for model_name, model in models.items():
    print(f"\nRunning model: {model_name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Per-class precision/recall/F1 and the raw confusion matrix.
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Overall accuracy, recorded per model for later comparison.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    model_accuracies[model_name] = accuracy

    # Keep the best model seen so far. The comparison is strict, so on a tie
    # the model that was evaluated first is kept.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = model_name

    # Optional diagnostic (disabled): plot the confusion matrix.
    # ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap='Blues')
    # plt.title(f'Confusion Matrix - {model_name}')
    # plt.show()

    # Optional diagnostic (disabled): 5-fold cross-validation on the full data.
    # cv_scores = cross_val_score(model, X, y, cv=5)
    # print(f'Cross-Validation Scores for {model_name}: {cv_scores}')
    # print(f'Mean CV Score: {cv_scores.mean():.4f}')
    print('------------------------------------------------------------------------------------------------------------------------------------------------------------')

In [None]:
# Persist the winning estimator so it can be reloaded without retraining.
# The check is an explicit `is not None` rather than a truthiness test:
# ensemble estimators define `__len__`, so `if best_model:` would depend on
# how many sub-estimators the model holds, not on whether one was selected.
if best_model is not None:
    print(f"\nThe best model is {best_model_name} with accuracy: {best_accuracy:.4f}")
    # The model name is part of the file name, so different winners are
    # written to different files.
    joblib.dump(best_model, f'best_pro_rev_ana_model_{best_model_name}.pkl')
    print(f"Best model '{best_model_name}' saved for future use.")
else:
    # Reached when no model scored above the starting accuracy of 0.0.
    print("No model was selected, so nothing was saved.")