# In [None]:
import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy import sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# SentimentIntensityAnalyzer needs the VADER lexicon. If it is not installed
# yet, run nltk.download('vader_lexicon') once.

# Load Data
# Both CSV paths are relative, so they resolve against the current working directory.
book_data = pd.read_csv(filepath_or_buffer="books_data.csv")
review_data = pd.read_csv(filepath_or_buffer="Books_rating.csv")

# Data Preprocessing
def calculate_helpfulness_ratio(value):
    """
    Convert a helpfulness string such as ``"7/9"`` into a ratio.

    Args:
        value: Raw helpfulness cell, expected in the form
            ``"<numerator>/<denominator>"``. Any other input (a non-string
            such as NaN or None, or malformed text) is tolerated.

    Returns:
        float: ``numerator / denominator``, or ``0.0`` when the denominator
        is zero or the value cannot be parsed.
    """
    # Non-strings (e.g. NaN or None for a missing cell) carry nothing to parse.
    if not isinstance(value, str):
        return 0.0

    numerator_text, slash, denominator_text = value.partition('/')
    if not slash:
        return 0.0

    try:
        numerator, denominator = int(numerator_text), int(denominator_text)
        # Guard the division explicitly instead of catching ZeroDivisionError.
        return numerator / denominator if denominator != 0 else 0.0
    except (ValueError, OverflowError):
        # ValueError: a side is not an integer (this also covers "1/2/3",
        # where the right-hand side is "2/3").
        # OverflowError: the quotient does not fit in a float.
        return 0.0

# Apply the function to calculate helpfulness ratio
review_data['helpfulness_ratio'] = review_data['review/helpfulness'].apply(calculate_helpfulness_ratio)

# Handle Missing Values
# Cast to str after filling so a stray non-text cell cannot break the
# string-based features computed below.
review_data['review/text'] = review_data['review/text'].fillna('').astype(str)
review_data['review/score'] = review_data['review/score'].fillna(review_data['review/score'].mean())

# Sentiment Analysis Feature
# SentimentIntensityAnalyzer raises LookupError when the VADER lexicon is not
# installed, so fetch it on demand instead of relying on a manual download.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
# 'compound' is the single summary score returned by polarity_scores.
review_data['sentiment'] = review_data['review/text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Additional Features
# Vectorised string accessors replace the per-row Python callbacks.
review_data['text_length'] = review_data['review/text'].str.len()
review_data['word_count'] = review_data['review/text'].str.split().str.len()

# Label Fake/Real
# 1 when the helpfulness ratio is above 0.5, otherwise 0. Rows whose ratio
# could not be computed (ratio 0) therefore land in class 0.
review_data['label'] = (review_data['helpfulness_ratio'] > 0.5).astype(int)

# Feature Engineering
# NOTE: 'helpfulness_ratio' is deliberately NOT a model input. The label is
# defined as helpfulness_ratio > 0.5, so feeding the ratio to the classifier
# leaks the target and makes every evaluation metric meaningless.
# Consumers of the saved scaler must therefore supply the four columns listed
# in numerical_features below.
y = review_data['label']

# Train-Test Split
# Split row positions first so the vectorizer and the scaler are fitted on the
# training rows only; fitting them on all rows leaks test-set vocabulary
# statistics and value ranges into training.
train_idx, test_idx = train_test_split(
    np.arange(len(review_data)),
    test_size=0.2,
    random_state=42,
    stratify=y,
)

tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
tfidf.fit(review_data['review/text'].iloc[train_idx])
# Keep the TF-IDF matrix sparse: a dense copy needs n_reviews x (up to) 5000 floats.
X_text = tfidf.transform(review_data['review/text'])

# Combine Features
numerical_features = review_data[['review/score', 'sentiment', 'text_length', 'word_count']]
scaler = MinMaxScaler()
scaler.fit(numerical_features.iloc[train_idx])
# Test rows may fall slightly outside [0, 1] because the range comes from the
# training rows only.
scaled_features = scaler.transform(numerical_features)

# CSR format supports the row indexing used for the split below.
X = sparse.hstack([X_text, sparse.csr_matrix(scaled_features)], format='csr')

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Hyperparameter Tuning
# Exhaustive search over forest size, tree depth and split/leaf minimums,
# using 3-fold cross-validation scored by F1 on all available cores.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best Model
# The estimator with the best cross-validated score found by the search.
model = grid_search.best_estimator_

# Evaluation
# Hard class predictions feed the report and the confusion matrix; the second
# predict_proba column is used as the score for AUC-ROC.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

print("AUC-ROC:", roc_auc_score(y_true=y_test, y_score=y_pred_prob))

print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

# Save Model and Preprocessor
# The classifier, the TF-IDF vectorizer and the scaler are pickled together as
# one (model, tfidf, scaler) tuple so they can be restored as a matching set.
# NOTE: only unpickle this file from a trusted source; pickle can run code on load.
artifacts = (model, tfidf, scaler)
with open("review_model.pkl", "wb") as model_file:
    pickle.dump(artifacts, model_file)
