In [7]:
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split



KeyError: 'Label'

In [10]:
# Load Data
books_reviews = pd.read_csv("Books_rating.csv")
books_details = pd.read_csv("books_data.csv")

# Merge Data on Title
data = pd.merge(books_reviews, books_details, on="Title", how="inner")

# Feature Engineering
# 1. Extract helpfulness ratio
# Safely parse and calculate helpfulness ratio
def calculate_helpfulness_ratio(value):
    try:
        # Ensure the value contains a fraction-like format (e.g., "2/3")
        if isinstance(value, str) and '/' in value:
            numerator, denominator = map(int, value.split('/'))
            return numerator / max(denominator, 1)  # Avoid division by zero
        else:
            return 0  # Assign 0 if the format is invalid
    except Exception:
        return 0  # Fallback for any unexpected errors

# Apply the function to the column
data['helpfulness_ratio'] = data['review/helpfulness'].apply(calculate_helpfulness_ratio)


# 2. Word Count in Review Text
data['word_count'] = data['review/text'].fillna("").apply(lambda x: len(x.split()))

# 3. Sentiment Score (example placeholder)
data['sentiment_score'] = data['review/text'].fillna("").apply(lambda x: len([word for word in x.split() if word.lower() in ['good', 'excellent', 'bad', 'poor']]))

# 4. Score Deviation
data['score_deviation'] = abs(data['review/score'] - data['ratingsCount'].mean())

# Dropping irrelevant columns
data = data.drop(['Id', 'profileName', 'review/time', 'review/summary', 'image', 'previewLink', 'infoLink'], axis=1)

# Target Label (Assuming the dataset has a "Label" column for real/fake reviews)
data['Label'] = data['Label'].apply(lambda x: 1 if x == 'real' else 0)

# Split Data
X = data[['helpfulness_ratio', 'word_count', 'sentiment_score', 'score_deviation']]
y = data['Label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Text Vectorization (for review/text if used)
tfidf = TfidfVectorizer(max_features=1000)
X_text = tfidf.fit_transform(data['review/text'].fillna("")).toarray()

# Combine Text Features with Numeric Features
X_train = np.hstack([X_train, X_text[:X_train.shape[0]]])
X_val = np.hstack([X_val, X_text[X_train.shape[0]:]])


KeyError: 'Label'

In [12]:


# Load Data
books_reviews = pd.read_csv("Books_rating.csv")
books_details = pd.read_csv("books_data.csv")

# Merge Data on Title
data = pd.merge(books_reviews, books_details, on="Title", how="inner")

# Preprocessing and Feature Engineering
# 1. Extract helpfulness ratio
def calculate_helpfulness_ratio(value):
    try:
        if isinstance(value, str) and '/' in value:
            numerator, denominator = map(int, value.split('/'))
            return numerator / max(denominator, 1)  # Avoid division by zero
        else:
            return 0
    except:
        return 0

data['helpfulness_ratio'] = data['review/helpfulness'].apply(calculate_helpfulness_ratio)

# 2. Word Count in Review Text
data['word_count'] = data['review/text'].fillna("").apply(lambda x: len(x.split()))

# 3. Sentiment Score (example placeholder)
data['sentiment_score'] = data['review/text'].fillna("").apply(lambda x: len([word for word in x.split() if word.lower() in ['good', 'excellent', 'bad', 'poor']]))

# 4. Score Deviation
data['score_deviation'] = abs(data['review/score'] - data['ratingsCount'].mean())

# Dropping irrelevant columns
data = data.drop(['Id', 'profileName', 'review/time', 'review/summary', 'image', 'previewLink', 'infoLink'], axis=1, errors='ignore')

# Simulate 'Label' column (replace this with real logic if available)
np.random.seed(42)  # For reproducibility
data['Label'] = np.random.choice([0, 1], size=len(data), p=[0.5, 0.5])  # 50% real, 50% fake

# Split Data into Features and Target
X = data[['helpfulness_ratio', 'word_count', 'sentiment_score', 'score_deviation']]
y = data['Label']

# Split into Train and Validation Sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Text Vectorization (for 'review/text')
tfidf = TfidfVectorizer(max_features=1000)
X_text_train = tfidf.fit_transform(data['review/text'].fillna("")).toarray()
X_text_val = tfidf.transform(data['review/text'].fillna("")).toarray()

# Combine Text Features with Numeric Features
X_train_combined = np.hstack([X_train, X_text_train[:X_train.shape[0]]])
X_val_combined = np.hstack([X_val, X_text_val[:X_val.shape[0]]])

# Model Training
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_combined, y_train)

# Validation
y_pred = clf.predict(X_val_combined)
print("Classification Report:")
print(classification_report(y_val, y_pred))
print(f"Accuracy: {accuracy_score(y_val, y_pred):.2f}")

# Save Model and TFIDF Vectorizer
with open("review_checker_model.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Function to Predict New Data
def predict_review(new_data):
    # Load Model and Vectorizer
    with open("review_checker_model.pkl", "rb") as f:
        model = pickle.load(f)
    with open("tfidf_vectorizer.pkl", "rb") as f:
        tfidf = pickle.load(f)
    
    # Extract Features
    new_data['helpfulness_ratio'] = new_data['review/helpfulness'].apply(calculate_helpfulness_ratio)
    new_data['word_count'] = new_data['review/text'].fillna("").apply(lambda x: len(x.split()))
    new_data['sentiment_score'] = new_data['review/text'].fillna("").apply(lambda x: len([word for word in x.split() if word.lower() in ['good', 'excellent', 'bad', 'poor']]))
    new_data['score_deviation'] = abs(new_data['review/score'] - data['ratingsCount'].mean())
    
    # Text Vectorization
    text_features = tfidf.transform(new_data['review/text'].fillna("")).toarray()
    
    # Combine Features
    numeric_features = new_data[['helpfulness_ratio', 'word_count', 'sentiment_score', 'score_deviation']].values
    features = np.hstack([numeric_features, text_features])
    
    # Predict
    predictions = model.predict(features)
    return ["Real" if pred == 1 else "Fake" for pred in predictions]

# Test Prediction
new_test_data = pd.DataFrame({
    "review/helpfulness": ["2/3"],
    "review/text": ["This is a great book! Highly recommended."],
    "review/score": [5],
    "ratingsCount": [500]
})

predictions = predict_review(new_test_data)
print(predictions)


KeyboardInterrupt: 

In [14]:

# Step 1: Load Data
print("Loading data...")
books_reviews = pd.read_csv("Books_rating.csv")
books_details = pd.read_csv("books_data.csv")
print("Data loaded successfully.")

# Step 2: Merge Data on Title
print("Merging datasets on Title...")
data = pd.merge(books_reviews, books_details, on="Title", how="inner")
print(f"Merged dataset contains {len(data)} rows and {len(data.columns)} columns.")

# Step 3: Preprocessing and Feature Engineering
print("Starting feature engineering...")

# 3.1 Extract helpfulness ratio
def calculate_helpfulness_ratio(value):
    try:
        if isinstance(value, str) and '/' in value:
            numerator, denominator = map(int, value.split('/'))
            return numerator / max(denominator, 1)  # Avoid division by zero
        else:
            return 0
    except:
        return 0

data['helpfulness_ratio'] = data['review/helpfulness'].apply(calculate_helpfulness_ratio)
print("Calculated helpfulness ratio.")

# 3.2 Word Count in Review Text
data['word_count'] = data['review/text'].fillna("").apply(lambda x: len(x.split()))
print("Calculated word count for reviews.")

# 3.3 Sentiment Score
data['sentiment_score'] = data['review/text'].fillna("").apply(
    lambda x: len([word for word in x.split() if word.lower() in ['good', 'excellent', 'bad', 'poor']])
)
print("Calculated sentiment score.")

# 3.4 Score Deviation
data['score_deviation'] = abs(data['review/score'] - data['ratingsCount'].mean())
print("Calculated score deviation.")

# Dropping irrelevant columns
columns_to_drop = ['Id', 'profileName', 'review/time', 'review/summary', 'image', 'previewLink', 'infoLink']
data = data.drop(columns=columns_to_drop, axis=1, errors='ignore')
print("Dropped irrelevant columns.")

# Step 4: Simulate 'Label' Column
print("Creating simulated 'Label' column...")
np.random.seed(42)  # For reproducibility
data['Label'] = np.random.choice([0, 1], size=len(data), p=[0.5, 0.5])  # 50% real, 50% fake
print("Simulated 'Label' column created.")

# Step 5: Split Data into Features and Target
print("Splitting data into features and target...")
X = data[['helpfulness_ratio', 'word_count', 'sentiment_score', 'score_deviation']]
y = data['Label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split into training and validation sets.")

# Step 6: Text Vectorization
print("Performing TF-IDF vectorization on review text...")
tfidf = TfidfVectorizer(max_features=1000)
X_text_train = tfidf.fit_transform(data['review/text'].fillna("")).toarray()
X_text_val = tfidf.transform(data['review/text'].fillna("")).toarray()
print("TF-IDF vectorization completed.")

# Combine Text Features with Numeric Features
print("Combining text features with numeric features...")
X_train_combined = np.hstack([X_train, X_text_train[:X_train.shape[0]]])
X_val_combined = np.hstack([X_val, X_text_val[:X_val.shape[0]]])
print("Features combined successfully.")

# Step 7: Model Training
print("Training Random Forest Classifier...")
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_combined, y_train)
print("Model training completed.")

# Step 8: Validation
print("Validating the model...")
y_pred = clf.predict(X_val_combined)
print("Validation completed. Results:")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print(f"Accuracy: {accuracy_score(y_val, y_pred):.2f}")

# Step 9: Save Model and Vectorizer
print("Saving the model and TF-IDF vectorizer...")
with open("review_checker_model.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)
print("Model and vectorizer saved successfully.")

# Step 10: Predict New Data
def predict_review(new_data):
    print("Loading model and vectorizer for prediction...")
    with open("review_checker_model.pkl", "rb") as f:
        model = pickle.load(f)
    with open("tfidf_vectorizer.pkl", "rb") as f:
        tfidf = pickle.load(f)
    print("Model and vectorizer loaded.")

    # Extract Features
    print("Extracting features from new data...")
    new_data['helpfulness_ratio'] = new_data['review/helpfulness'].apply(calculate_helpfulness_ratio)
    new_data['word_count'] = new_data['review/text'].fillna("").apply(lambda x: len(x.split()))
    new_data['sentiment_score'] = new_data['review/text'].fillna("").apply(
        lambda x: len([word for word in x.split() if word.lower() in ['good', 'excellent', 'bad', 'poor']])
    )
    new_data['score_deviation'] = abs(new_data['review/score'] - data['ratingsCount'].mean())
    
    # Text Vectorization
    text_features = tfidf.transform(new_data['review/text'].fillna("")).toarray()
    
    # Combine Features
    numeric_features = new_data[['helpfulness_ratio', 'word_count', 'sentiment_score', 'score_deviation']].values
    features = np.hstack([numeric_features, text_features])
    print("Features extracted.")

    # Predict
    print("Making predictions...")
    predictions = model.predict(features)
    return ["Real" if pred == 1 else "Fake" for pred in predictions]

# Test Prediction
print("Testing prediction on new data...")
new_test_data = pd.DataFrame({
    "review/helpfulness": ["2/3"],
    "review/text": ["This is a great book! Highly recommended."],
    "review/score": [5],
    "ratingsCount": [500]
})
predictions = predict_review(new_test_data)
print("Prediction result:", predictions)


Loading data...
Data loaded successfully.
Merging datasets on Title...
Merged dataset contains 3000000 rows and 19 columns.
Starting feature engineering...
Calculated helpfulness ratio.
Calculated word count for reviews.
Calculated sentiment score.
Calculated score deviation.
Dropped irrelevant columns.
Creating simulated 'Label' column...
Simulated 'Label' column created.
Splitting data into features and target...
Data split into training and validation sets.
Performing TF-IDF vectorization on review text...


MemoryError: Unable to allocate 22.4 GiB for an array with shape (3000000, 1000) and data type float64

In [None]:


# Step 1: Load Data
print("Loading data...")
books_reviews = pd.read_csv("Books_rating.csv")
books_details = pd.read_csv("books_data.csv")
print("Data loaded successfully.")

# Step 2: Merge Data on Title
print("Merging datasets on Title...")
data = pd.merge(books_reviews, books_details, on="Title", how="inner")
print(f"Merged dataset contains {len(data)} rows and {len(data.columns)} columns.")

# Step 3: Preprocessing and Feature Engineering
print("Starting feature engineering...")

# 3.1 Extract helpfulness ratio
def calculate_helpfulness_ratio(value):
    try:
        if isinstance(value, str) and '/' in value:
            numerator, denominator = map(int, value.split('/'))
            return numerator / max(denominator, 1)  # Avoid division by zero
        else:
            return 0
    except:
        return 0

data['helpfulness_ratio'] = data['review/helpfulness'].apply(calculate_helpfulness_ratio)
print("Calculated helpfulness ratio.")

# 3.2 Word Count in Review Text
data['word_count'] = data['review/text'].fillna("").apply(lambda x: len(x.split()))
print("Calculated word count for reviews.")

# 3.3 Sentiment Score
data['sentiment_score'] = data['review/text'].fillna("").apply(
    lambda x: len([word for word in x.split() if word.lower() in ['good', 'excellent', 'bad', 'poor']])
)
print("Calculated sentiment score.")

# 3.4 Score Deviation
data['score_deviation'] = abs(data['review/score'] - data['ratingsCount'].mean())
print("Calculated score deviation.")

# Dropping irrelevant columns
columns_to_drop = ['Id', 'profileName', 'review/time', 'review/summary', 'image', 'previewLink', 'infoLink']
data = data.drop(columns=columns_to_drop, axis=1, errors='ignore')
print("Dropped irrelevant columns.")

# Step 4: Simulate 'Label' Column
print("Creating simulated 'Label' column...")
np.random.seed(42)  # For reproducibility
data['Label'] = np.random.choice([0, 1], size=len(data), p=[0.5, 0.5])  # 50% real, 50% fake
print("Simulated 'Label' column created.")

# Step 5: Split Data into Features and Target
print("Splitting data into features and target...")
X_numeric = data[['helpfulness_ratio', 'word_count', 'sentiment_score', 'score_deviation']]
y = data['Label']
X_numeric_train, X_numeric_val, y_train, y_val = train_test_split(X_numeric, y, test_size=0.2, random_state=42)
print("Data split into training and validation sets.")

# Step 6: Text Vectorization
print("Performing TF-IDF vectorization on review text...")
tfidf = TfidfVectorizer(max_features=1000)
X_text_sparse = tfidf.fit_transform(data['review/text'].fillna(""))
X_text_train = X_text_sparse[:X_numeric_train.shape[0]]
X_text_val = X_text_sparse[X_numeric_train.shape[0]:]
print("TF-IDF vectorization completed.")

# Combine Text Features with Numeric Features (Sparse Matrices)
from scipy.sparse import hstack
print("Combining text features with numeric features...")
X_train_combined = hstack([X_numeric_train, X_text_train])
X_val_combined = hstack([X_numeric_val, X_text_val])
print("Features combined successfully.")

# Step 7: Model Training
print("Training Random Forest Classifier...")
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_combined, y_train)
print("Model training completed.")

# Step 8: Validation
print("Validating the model...")
y_pred = clf.predict(X_val_combined)
print("Validation completed. Results:")
print("Classification Report:")
print(classification_report(y_val, y_pred))
print(f"Accuracy: {accuracy_score(y_val, y_pred):.2f}")

# Step 9: Save Model and Vectorizer
print("Saving the model and TF-IDF vectorizer...")
with open("review_checker_model.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)
print("Model and vectorizer saved successfully.")

# Step 10: Predict New Data
def predict_review(new_data):
    print("Loading model and vectorizer for prediction...")
    with open("review_checker_model.pkl", "rb") as f:
        model = pickle.load(f)
    with open("tfidf_vectorizer.pkl", "rb") as f:
        tfidf = pickle.load(f)
    print("Model and vectorizer loaded.")

    # Extract Features
    print("Extracting features from new data...")
    new_data['helpfulness_ratio'] = new_data['review/helpfulness'].apply(calculate_helpfulness_ratio)
    new_data['word_count'] = new_data['review/text'].fillna("").apply(lambda x: len(x.split()))
    new_data['sentiment_score'] = new_data['review/text'].fillna("").apply(
        lambda x: len([word for word in x.split() if word.lower() in ['good', 'excellent', 'bad', 'poor']])
    )
    new_data['score_deviation'] = abs(new_data['review/score'] - data['ratingsCount'].mean())
    
    # Text Vectorization
    text_features = tfidf.transform(new_data['review/text'].fillna(""))
    
    # Combine Features (Sparse Matrices)
    numeric_features = new_data[['helpfulness_ratio', 'word_count', 'sentiment_score', 'score_deviation']].values
    numeric_sparse = hstack([numeric_features])  # Convert to sparse
    features = hstack([numeric_sparse, text_features])
    print("Features extracted.")

    # Predict
    print("Making predictions...")
    predictions = model.predict(features)
    return ["Real" if pred == 1 else "Fake" for pred in predictions]

# Test Prediction
print("Testing prediction on new data...")
new_test_data = pd.DataFrame({
    "review/helpfulness": ["2/3"],
    "review/text": ["This is a great book! Highly recommended."],
    "review/score": [5],
    "ratingsCount": [500]
})
predictions = predict_review(new_test_data)
print("Prediction result:", predictions)






Loading data...
Data loaded successfully.
Merging datasets on Title...
Merged dataset contains 3000000 rows and 19 columns.
Starting feature engineering...
Calculated helpfulness ratio.
Calculated word count for reviews.
Calculated sentiment score.
Calculated score deviation.
Dropped irrelevant columns.
Creating simulated 'Label' column...
Simulated 'Label' column created.
Splitting data into features and target...
Data split into training and validation sets.
Performing TF-IDF vectorization on review text...
TF-IDF vectorization completed.
Combining text features with numeric features...
Features combined successfully.
Training Random Forest Classifier...
