In [85]:
import re
import numpy as np
import joblib
import pandas as pd
import contractions
import spacy

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack


In [86]:
# Directory holding every serialized artifact loaded below. Kept in one place
# so the notebook can be pointed at a different location with a single edit.
MODEL_DIR = "../joblib"

# NOTE: joblib.load unpickles arbitrary Python objects, so MODEL_DIR must only
# contain artifacts that you produced yourself or otherwise trust.

# Classifier
clf = joblib.load(f"{MODEL_DIR}/logistic_fake_review_model.pkl")

# TF-IDF + NN models (one pair fitted on raw text, one on lemmatized text)
raw_tfidf = joblib.load(f"{MODEL_DIR}/raw_tfidf_vectorizer.pkl")
raw_nn = joblib.load(f"{MODEL_DIR}/raw_nn_model.pkl")

lemm_tfidf = joblib.load(f"{MODEL_DIR}/lemm_tfidf_vectorizer.pkl")
lemm_nn = joblib.load(f"{MODEL_DIR}/lemm_nn_model.pkl")

# Category centroids (looked up by category name in category_consistency)
category_centroids = joblib.load(f"{MODEL_DIR}/category_centroids.pkl")

# Numeric feature order (names, in the column order applied in predict_review)
numeric_feature_order = joblib.load(f"{MODEL_DIR}/numeric_feature_order.pkl")

# Scaler applied to the ordered numeric feature row before classification
scaler = joblib.load(f"{MODEL_DIR}/numeric_scaler.pkl")


# NLP tools
# SentimentIntensityAnalyzer needs the NLTK "vader_lexicon" resource and
# spacy.load needs the "en_core_web_sm" model to be installed beforehand.
sia = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_sm")

In [87]:
def capital_letter_ratio(text):
    """Return the share of alphabetic characters in ``text`` that are uppercase.

    Digits, whitespace and punctuation are ignored. A string without any
    alphabetic character yields 0.0.
    """
    alpha_total = 0
    upper_total = 0
    for ch in text:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1
    if alpha_total == 0:
        return 0.0
    return upper_total / alpha_total

def punctuation_ratio(text):
    """Return the fraction of characters in ``text`` that are punctuation.

    A character counts as punctuation when it is neither a word character
    (letters, digits, underscore) nor whitespace. Empty input yields 0.0.
    """
    if not text:
        return 0.0
    marks = re.findall(r"[^\w\s]", text)
    return len(marks) / len(text)

def repetition_score(text):
    """Return how repetitive the wording of ``text`` is, from 0.0 upwards.

    Words are compared case-insensitively. The score is one minus the ratio
    of distinct words to total words, so 0.0 means every word is unique.
    Text containing no words yields 0.0.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    if not tokens:
        return 0.0
    distinct_count = len(set(tokens))
    return 1 - (distinct_count / len(tokens))

def expand_contractions(text):
    """Return ``text`` with contractions expanded via ``contractions.fix``."""
    return contractions.fix(text)

def clean_text(text):
    """Normalize ``text`` to lowercase letters separated by single spaces.

    The passes run in a fixed order: lowercase, strip URLs, strip HTML-style
    tags, drop every character that is not a lowercase letter or whitespace,
    then collapse whitespace runs and trim the ends.
    """
    # Each pattern is deleted in turn; later passes see the earlier results.
    removal_patterns = (
        r"http\S+|www\S+",  # URLs
        r"<.*?>",           # HTML-style tags
        r"[^a-z\s]",        # anything other than a-z or whitespace
    )
    result = text.lower()
    for pattern in removal_patterns:
        result = re.sub(pattern, "", result)
    collapsed = re.sub(r"\s+", " ", result)
    return collapsed.strip()

def adjective_ratio(text):
    """Return the fraction of tokens in ``text`` tagged as adjectives.

    Tokens come from ``word_tokenize`` and tags from ``pos_tag``; a token is
    counted when its tag starts with "JJ". Text with no tokens yields 0.0.
    """
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    if not tokens:
        return 0.0
    adjective_count = 0
    for _word, tag in tagged:
        if tag.startswith("JJ"):
            adjective_count += 1
    return adjective_count / len(tokens)

def sentiment_score(text):
    """Return the "compound" value of ``sia.polarity_scores(text)``."""
    return sia.polarity_scores(text)["compound"]

def sentiment_confidence(text):
    """Return the larger of the "pos" and "neg" polarity scores for ``text``.

    The scores come from ``sia.polarity_scores``; the "neu" and "compound"
    entries are not considered.
    """
    polarity = sia.polarity_scores(text)
    positive, negative = polarity["pos"], polarity["neg"]
    return max(positive, negative)

def rating_sentiment_mismatch(sentiment, rating):
    """Return 1 when the rating and the text sentiment clearly disagree, else 0.

    A mismatch is a high rating (>= 4) with sentiment below -0.2, or a low
    rating (<= 2) with sentiment above 0.2.

    Note the argument order is (sentiment, rating), the reverse of
    ``rating_sentiment_mismatch_score``.
    """
    high_rating_negative_text = rating >= 4 and sentiment < -0.2
    low_rating_positive_text = rating <= 2 and sentiment > 0.2
    return 1 if (high_rating_negative_text or low_rating_positive_text) else 0

def rating_sentiment_mismatch_score(rating, sentiment):
    """Return a non-negative score for how strongly rating and sentiment disagree.

    The rating is centred on 3 and halved (so 1 maps to -1 and 5 to +1). The
    score is positive only when that centred rating and ``sentiment`` have
    opposite signs, and it is scaled by how far the rating is from 3.
    """
    centered_rating = (rating - 3) / 2
    opposition = -centered_rating * sentiment
    # Agreement (or a neutral rating/sentiment) contributes nothing.
    clipped = opposition if opposition > 0 else 0
    return clipped * abs(centered_rating)

def lemmatize(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is run through the ``nlp`` pipeline, whitespace tokens are
    dropped and the remaining lemmas are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_space:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def _neighbor_similarity(text, vectorizer, nn_model):
    """Vectorize ``text`` and score its similarity to the model's neighbours.

    Args:
        text: The review text to look up.
        vectorizer: Fitted vectorizer exposing ``transform``.
        nn_model: Fitted nearest-neighbour model exposing ``kneighbors``.

    Returns:
        One minus the largest neighbour distance once the closest hit has been
        dropped. Presumably the models use cosine distance, making this a
        cosine similarity -- confirm against how the NN models were fitted.
    """
    vec = vectorizer.transform([text])
    dist, _ = nn_model.kneighbors(vec)
    row = dist[0]
    # NOTE(review): dropping index 0 fits a query that is itself part of the
    # fitted corpus (distance 0 to itself). For an unseen review it discards
    # the true nearest neighbour -- confirm this mirrors how the training
    # feature was computed.
    # If the model returns a single neighbour the slice would be empty and
    # .max() would raise, so fall back to that one distance.
    others = row[1:] if len(row) > 1 else row
    return 1 - others.max()


def raw_review_similarity(text):
    """Similarity of raw ``text`` to its neighbours in the raw-text index."""
    return _neighbor_similarity(text, raw_tfidf, raw_nn)


def lemm_review_similarity(text):
    """Similarity of lemmatized ``text`` to its neighbours in the lemmatized index."""
    return _neighbor_similarity(text, lemm_tfidf, lemm_nn)

def category_consistency(text, category):
    """Return the cosine similarity between ``text`` and its category centroid.

    ``text`` is vectorized with ``lemm_tfidf`` and compared against
    ``category_centroids[category]``. A category with no stored centroid
    yields 0.0.
    """
    if category in category_centroids:
        text_vec = lemm_tfidf.transform([text])
        similarity_matrix = cosine_similarity(text_vec, category_centroids[category])
        # One text against one centroid: the result is the single cell.
        return similarity_matrix[0][0]
    return 0.0


In [88]:
def predict_review(review, rating, category):
    """Classify a single review as genuine or fake.

    Three views of the text are used: the raw string for surface statistics
    and raw-text similarity, the contraction-expanded and cleaned string for
    adjective and sentiment features, and the lemmatized string for the
    TF-IDF vector, lemmatized similarity and category consistency. The
    numeric features are ordered by ``numeric_feature_order``, scaled, and
    appended to the TF-IDF vector before ``clf`` is queried.

    Args:
        review: Review text as written.
        rating: Star rating; 1 and 5 are flagged as extreme ratings.
        category: Product category name used for the centroid lookup.

    Returns:
        dict with two keys:
            "prediction": "Genuine" when ``clf.predict`` returns 1, else "Fake".
            "probability": column 1 of ``clf.predict_proba`` rounded to three
                decimals. It is always column 1, never flipped to match the
                predicted label.
    """
    # Surface statistics from the untouched text.
    char_count = len(review)
    upper_share = capital_letter_ratio(review)
    punct_share = punctuation_ratio(review)
    repetition = repetition_score(review)

    # Cleaned view: contractions expanded, then normalized.
    normalized = clean_text(expand_contractions(review))

    adjective_share = adjective_ratio(normalized)
    polarity = sentiment_score(normalized)
    polarity_strength = sentiment_confidence(normalized)

    # Rating-versus-sentiment agreement features.
    mismatch_flag = rating_sentiment_mismatch(polarity, rating)
    mismatch_magnitude = rating_sentiment_mismatch_score(rating, polarity)
    extreme_flag = int(rating in (1, 5))

    # Lemmatized view and the similarity features built on it.
    lemmas = lemmatize(normalized)
    raw_similarity = raw_review_similarity(review)
    lemma_similarity = lemm_review_similarity(lemmas)
    category_score = category_consistency(lemmas, category)

    # Keyed by feature name; the model-facing column order is applied below.
    features = {
        'text_length': char_count,
        'capital_ratio': upper_share,
        'punctuation_ratio': punct_share,
        'adjective_ratio': adjective_share,
        'sentiment_score': polarity,
        'sentiment_confidence': polarity_strength,
        'rating_sentiment_mismatch': mismatch_flag,
        'rating_sentiment_mismatch_score': mismatch_magnitude,
        'weighted_rating_sentiment_mismatch': mismatch_magnitude * polarity_strength,
        'raw_review_similarity': raw_similarity,
        'category_consistency_score': category_score,
        'review_similarity_score': lemma_similarity,
        'repetition_score': repetition,
        'is_extreme_rating': extreme_flag,
    }

    ordered_row = [features[name] for name in numeric_feature_order]
    scaled_row = scaler.transform(np.array([ordered_row]))

    # Text columns first, scaled numeric columns after.
    model_input = hstack([lemm_tfidf.transform([lemmas]), scaled_row])

    label = clf.predict(model_input)[0]
    class_one_prob = clf.predict_proba(model_input)[0][1]

    verdict = "Fake"
    if label == 1:
        verdict = "Genuine"
    return {
        "prediction": verdict,
        "probability": round(float(class_one_prob), 3),
    }

In [None]:
# Show the column order applied to the numeric features in predict_review, so
# it can be cross-checked against the keys of the feature dictionary there.
print(numeric_feature_order)

['text_length', 'capital_ratio', 'punctuation_ratio', 'adjective_ratio', 'sentiment_score', 'sentiment_confidence', 'rating_sentiment_mismatch', 'rating_sentiment_mismatch_score', 'weighted_rating_sentiment_mismatch', 'raw_review_similarity', 'category_consistency_score', 'review_similarity_score', 'repetition_score', 'is_extreme_rating']


In [91]:
# Specific, positive wording paired with a matching 5-star rating.
predict_review(
    review="Amazing dress! Fits perfectly and the fabric is soft.",
    rating=5,
    category="Clothing"
)

{'prediction': 'Genuine', 'probability': 0.927}

In [None]:
# Positive, descriptive review with a 4-star rating.
predict_review(
    review="This book is very informative and well written. I learned a lot.",
    rating=4,
    category="Books"
)

{'prediction': 'Genuine', 'probability': 0.93}

In [None]:
# Concrete complaint paired with a low (2-star) rating.
predict_review(
    review="The phone overheats quickly and the battery drains too fast.",
    rating=2,
    category="Electronics"
)

{'prediction': 'Genuine', 'probability': 0.765}

In [None]:
# Mixed wording ("looked nice but ...") paired with a 1-star rating.
predict_review(
    review="Shoes looked nice but the sole started coming off after a week.",
    rating=1,
    category="Footwear"
)

{'prediction': 'Genuine', 'probability': 0.984}

In [None]:
# Generic superlatives with no product detail, 5-star rating.
predict_review(
    review="This is the best product ever! Everyone must buy it!",
    rating=5,
    category="Accessories"
)

{'prediction': 'Genuine', 'probability': 0.958}

In [None]:
# The same adjective ("amazing") used three times, 5-star rating; this gives
# a non-zero repetition_score.
predict_review(
    review="Amazing product, amazing quality, amazing experience!",
    rating=5,
    category="Beauty"
)

{'prediction': 'Fake', 'probability': 0.01}

In [None]:
# Vague recommendation containing a doubled word ("very very"), 5-star rating.
predict_review(
    review="Highly recommend this to everyone. Very very good.",
    rating=5,
    category="Toys"
)

{'prediction': 'Fake', 'probability': 0.401}

In [None]:
# Negative wording paired with a 5-star rating: probes the rating/sentiment
# mismatch features.
predict_review(
    review="The product is very bad and I am extremely disappointed.",
    rating=5,
    category="Accessories"
)

{'prediction': 'Fake', 'probability': 0.061}

In [None]:
# Complaint wording ("broke quickly", "feels cheap") paired with a 5-star rating.
predict_review(
    review="This item broke quickly and feels cheap.",
    rating=5,
    category="Home Appliances"
)

{'prediction': 'Genuine', 'probability': 0.885}

In [None]:
# Heavy repetition: only two distinct words across six, 5-star rating.
predict_review(
    review="Good good good product product product.",
    rating=5,
    category="Clothing"
)

{'prediction': 'Fake', 'probability': 0.0}

In [None]:
# Degenerate input: a single word repeated five times.
predict_review(
    review="Nice nice nice nice nice.",
    rating=4,
    category="Beauty"
)

{'prediction': 'Fake', 'probability': 0.0}

In [None]:
# Very short (two-word) positive review with a 4-star rating.
predict_review(
    review="Good product.",
    rating=4,
    category="Books"
)

{'prediction': 'Fake', 'probability': 0.386}

In [None]:
# Very short (two-word) negative review with a 1-star rating.
predict_review(
    review="Bad quality.",
    rating=1,
    category="Accessories"
)

{'prediction': 'Fake', 'probability': 0.376}

In [None]:
# Names specific product attributes (sound quality, bass), 5-star rating.
predict_review(
    review="The sound quality is crisp and bass is strong.",
    rating=5,
    category="Electronics"
)

{'prediction': 'Genuine', 'probability': 0.573}

In [None]:
# Names specific product attributes (fabric, stitching), 4-star rating.
predict_review(
    review="The fabric is soft and stitching is neat.",
    rating=4,
    category="Clothing"
)

{'prediction': 'Genuine', 'probability': 0.68}

In [None]:
# Strongly negative wording paired with a matching 1-star rating.
predict_review(
    review="I absolutely hate this product. Worst purchase ever.",
    rating=1,
    category="Accessories"
)

{'prediction': 'Genuine', 'probability': 0.924}

In [109]:
# Negative two-sentence review paired with a matching 1-star rating.
predict_review(
    review="Terrible experience. Completely useless item.",
    rating=1,
    category="Home Appliances"
)


{'prediction': 'Genuine', 'probability': 0.99}