# 🛡️ Amazon Review Fraud Detection
Comprehensive EDA, feature engineering, and unsupervised anomaly detection to identify potentially fake reviews.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('../data/suspicious_reviews.csv')
# Missing review bodies are read as NaN. Replace them with an empty string
# before casting: astype(str) alone would turn NaN into the literal text
# "nan", which every text-derived feature would then treat as a real
# one-word review.
df['reviews.text'] = df['reviews.text'].fillna('').astype(str)


In [None]:
# Clean text
import re
def clean_text(text):
    """Return a lowercase, letters-only version of ``text``.

    The text is lowercased, then three regex substitutions are applied in
    order: tokens starting with "http" are removed, every character that is
    not a lowercase ASCII letter or whitespace is removed, and runs of
    whitespace are collapsed to a single space. Leading and trailing
    whitespace is stripped from the result.
    """
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # the "http\S+" token would already be broken apart.
    substitutions = (
        (r"http\S+", ""),
        (r"[^a-z\s]", ""),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalized text used only for sentiment analysis; the surface-level counts
# below are taken from the raw review so punctuation and casing survive.
df['cleaned_text'] = df['reviews.text'].apply(clean_text)

# Analyze each review once and read both scores from the same result.
# Building a second TextBlob per row just to get subjectivity repeats the
# whole sentiment analysis.
sentiments = [TextBlob(text).sentiment for text in df['cleaned_text']]
df['sentiment_polarity'] = [s.polarity for s in sentiments]
df['sentiment_subjectivity'] = [s.subjectivity for s in sentiments]

# Number of '!' characters in the raw review.
df['exclamation_count'] = df['reviews.text'].apply(lambda raw: raw.count('!'))
# Number of whitespace-separated tokens for which str.isupper() is true.
df['uppercase_word_count'] = df['reviews.text'].apply(
    lambda raw: sum(1 for word in raw.split() if word.isupper())
)
# Length in whitespace-separated tokens and in characters.
df['review_length'] = df['reviews.text'].apply(lambda raw: len(raw.split()))
df['review_char_length'] = df['reviews.text'].apply(len)


In [None]:
# Behavioral features
if 'reviews.username' in df.columns:
    # value_counts() ignores NaN, so reviews without a username map to NaN.
    # Give them the same default of 1 that is used when the column is missing
    # entirely, instead of leaving NaN in the feature.
    reviews_per_user = df['reviews.username'].value_counts()
    df['user_review_count'] = (
        df['reviews.username'].map(reviews_per_user).fillna(1).astype(int)
    )
else:
    df['user_review_count'] = 1  # default to 1 if column missing

# NOTE(review): assumes ratings are on a 1-5 scale -- TODO confirm. If so,
# rating / 5 spans 0.2-1.0 while the rescaled polarity below spans 0-1, so the
# two ends of the scales do not line up exactly.
df['normalized_rating'] = df['reviews.rating'] / 5
# Shift polarity by +1 and halve it so it is compared against the rating
# fraction on a similar footing.
df['scaled_sentiment'] = (df['sentiment_polarity'] + 1) / 2
# Absolute gap between the rating fraction and the rescaled sentiment.
df['sentiment_rating_mismatch'] = (
    df['normalized_rating'] - df['scaled_sentiment']
).abs()


In [None]:
# Anomaly detection
# Columns fed to the model: text-shape, sentiment, and reviewer-activity features.
features = [
    'review_length',
    'review_char_length',
    'sentiment_polarity',
    'sentiment_subjectivity',
    'exclamation_count',
    'uppercase_word_count',
    'user_review_count',
    'sentiment_rating_mismatch',
]

# Missing feature values are replaced with 0 before standardizing.
X = df[features].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fixed random_state keeps the fitted forest reproducible between runs.
model = IsolationForest(random_state=42, contamination=0.05).fit(X_scaled)

# Continuous score from decision_function, then the hard label from predict.
df['anomaly_score_value'] = model.decision_function(X_scaled)
df['anomaly_score'] = model.predict(X_scaled)  # -1 = anomaly, 1 = normal


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Synthesize placeholder usernames ("user_0", "user_1", ...) when the column
# is absent or does not contain a single non-null value.
username_column_absent = 'reviews.username' not in df.columns
if username_column_absent or not df['reviews.username'].notnull().any():
    df['reviews.username'] = [f'user_{position}' for position in range(len(df))]

# Anomaly score vs rating (continuous)
# One box per rating value showing the spread of the continuous anomaly score.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df,
    x='reviews.rating',
    y='anomaly_score_value',
    palette="coolwarm",
    ax=ax,
)
ax.set_title("📉 Anomaly Score vs. Review Rating")
ax.set_xlabel("Review Rating")
ax.set_ylabel("Trust Score (Higher = More Normal)")
ax.grid(True)
fig.tight_layout()
plt.show()

# Sentiment vs Rating
# Colour the points by a labelled copy of the prediction so the legend text
# comes straight from the data. Overriding the text with
# plt.legend(labels=[...]) and no handles binds the names to whatever artists
# are on the axes, in order, which can mislabel or drop hue entries.
hue_names = {-1: "Fake (-1)", 1: "Real (1)"}
hue_colors = {"Fake (-1)": 'red', "Real (1)": 'green'}

plt.figure(figsize=(8, 5))
sns.scatterplot(
    x='reviews.rating',
    y='sentiment_polarity',
    hue=df['anomaly_score'].map(hue_names),
    hue_order=list(hue_colors),  # fixed legend order: fake first, then real
    palette=hue_colors,
    data=df,
    alpha=0.6,
)
plt.title("🎯 Sentiment Polarity vs Rating (Anomalies in Red)")
plt.xlabel("Review Rating")
plt.ylabel("Sentiment Polarity")
# No labels passed: the legend reuses the handles/labels seaborn registered.
plt.legend(title="anomaly_score")
plt.grid(True)
plt.tight_layout()
plt.show()

# Top suspicious users
# Restrict to rows the model labelled as anomalies (-1), then chart the ten
# usernames that appear most often among them.
suspicious = df[df['anomaly_score'] == -1]
usernames_available = (
    'reviews.username' in suspicious.columns
    and suspicious['reviews.username'].notnull().any()
)

if not usernames_available:
    print("Column 'reviews.username' is missing or empty in suspicious reviews.")
else:
    flagged_names = suspicious['reviews.username'].dropna()
    top_users = flagged_names.value_counts().head(10)
    if top_users.empty:
        print("No suspicious users found with valid usernames.")
    else:
        # Horizontal bars: counts on the x-axis, usernames on the y-axis.
        plt.figure(figsize=(8, 4))
        sns.barplot(x=top_users.values, y=top_users.index, palette='Reds_r')
        plt.title("🚩 Top Suspicious Reviewers")
        plt.xlabel("Number of Suspicious Reviews")
        plt.tight_layout()
        plt.show()
