In [2]:
import re
import numpy as np
import joblib
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import contractions
import spacy

In [4]:
# Load every persisted artifact and NLP tool the prediction helpers below read
# as module-level globals. All paths are relative to the notebook's working
# directory.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code on
# load — only point these paths at trusted, locally produced files.

# Classifier used by predict_review (via .predict / .predict_proba).
# Presumably a logistic-regression model, judging by the file name — confirm.
clf = joblib.load("../joblib/logistic_fake_review_model.pkl")

# TF-IDF vectorizer + nearest-neighbour index pairs.
# The raw_* pair is queried by raw_review_similarity with the untouched review.
raw_tfidf = joblib.load("../joblib/raw_tfidf_vectorizer.pkl")
raw_nn = joblib.load("../joblib/raw_nn_model.pkl")

# The lemm_* pair is queried with lemmatized text; lemm_tfidf also produces
# the text columns of the classifier input in predict_review.
lemm_tfidf = joblib.load("../joblib/lemm_tfidf_vectorizer.pkl")
lemm_nn = joblib.load("../joblib/lemm_nn_model.pkl")

# Per-category centroids, looked up by category name in category_consistency.
category_centroids = joblib.load("../joblib/category_centroids.pkl")

# Names of the numeric features, in the column order predict_review uses when
# it assembles the numeric part of the classifier input.
numeric_feature_order = joblib.load("../joblib/numeric_feature_order.pkl")

# NLP tools: NLTK sentiment analyzer (sentiment_score) and the spaCy pipeline
# (lemmatize).
sia = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_sm")

In [5]:
def capital_letter_ratio(text):
    """Return the share of alphabetic characters in ``text`` that are upper-case.

    Non-alphabetic characters are ignored. Returns 0.0 when ``text`` contains
    no alphabetic characters at all.
    """
    alpha_total = 0
    upper_total = 0
    for ch in text:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1
    if alpha_total == 0:
        return 0.0
    return upper_total / alpha_total

def punctuation_ratio(text):
    """Return the fraction of characters in ``text`` that are punctuation.

    A character counts as punctuation when it is neither a word character
    (``\\w``, which includes the underscore) nor whitespace. Falsy input
    (empty string, ``None``) yields 0.0.
    """
    if text:
        punct_chars = re.findall(r"[^\w\s]", text)
        return len(punct_chars) / len(text)
    return 0.0

def repetition_score(text):
    """Return how repetitive the wording of ``text`` is, in the range [0, 1).

    Computed as ``1 - distinct_words / total_words`` over the lower-cased word
    tokens; 0.0 means every word is unique (or there are no words).
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    total = len(tokens)
    if total == 0:
        return 0.0
    distinct = len(set(tokens))
    return 1 - distinct / total

def expand_contractions(text):
    """Expand English contractions in ``text`` via ``contractions.fix``.

    Any non-string input is mapped to the empty string.
    """
    if not isinstance(text, str):
        return ""
    return contractions.fix(text)

def clean_text(text):
    """Normalise ``text`` to lower-case ASCII letters separated by single spaces.

    Steps, applied in order: lower-case, drop URLs, drop HTML-like tags, drop
    every character that is not ``a-z`` or whitespace, collapse whitespace
    runs, and trim the ends.
    """
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r"http\S+|www\S+", ""),   # URLs
        (r"<.*?>", ""),            # HTML-like tags
        (r"[^a-z\s]", ""),         # anything but lower-case letters/whitespace
        (r"\s+", " "),             # whitespace runs -> single space
    )
    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

def adjective_ratio(text):
    """Return the fraction of tokens in ``text`` tagged as adjectives.

    Tokens come from NLTK's ``word_tokenize``; a token counts as an adjective
    when its ``pos_tag`` label starts with ``"JJ"``. Returns 0.0 when
    tokenization yields nothing.
    """
    tokens = word_tokenize(text)
    token_count = len(tokens)
    if token_count == 0:
        return 0.0
    adjective_count = 0
    for _word, tag in pos_tag(tokens):
        if tag.startswith("JJ"):
            adjective_count += 1
    return adjective_count / token_count

def sentiment_score(text):
    """Return the ``"compound"`` polarity score of ``text`` from the shared analyzer.

    Falsy input (empty string, ``None``) short-circuits to 0.0 without
    touching the analyzer.
    """
    if not text:
        return 0.0
    return sia.polarity_scores(text)["compound"]

def rating_polarity(r):
    """Map a rating to a coarse polarity.

    Returns 1 for ratings of 4 or more, -1 for ratings of 2 or less, and 0
    for anything in between.
    """
    polarity = 0
    if r >= 4:
        polarity = 1
    elif r <= 2:
        polarity = -1
    return polarity

def rating_sentiment_mismatch(sentiment, rating):
    """Return 1 when the rating and the text sentiment point in opposite directions.

    A mismatch is a positive rating (polarity 1) with sentiment below -0.2,
    or a negative rating (polarity -1) with sentiment above 0.2. Neutral
    ratings never count as a mismatch. Returns 0 otherwise.
    """
    direction = rating_polarity(rating)
    praised_but_negative = direction == 1 and sentiment < -0.2
    panned_but_positive = direction == -1 and sentiment > 0.2
    return 1 if (praised_but_negative or panned_but_positive) else 0

def lemmatize(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    Whitespace-only tokens are dropped and the remaining lemmas are joined
    with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_space:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [6]:
def raw_review_similarity(text):
    """Return the highest similarity between ``text`` and its indexed neighbours.

    ``text`` is vectorized with ``raw_tfidf`` and looked up in ``raw_nn``;
    similarity is taken as ``1 - distance`` and the first returned neighbour
    is excluded before taking the maximum.

    NOTE(review): assumes the index uses a distance for which ``1 - d`` is a
    similarity (e.g. cosine) — TODO confirm.
    NOTE(review): column 0 is skipped, presumably to drop a self-match when
    the text is already part of the fitted index; for unseen text this
    discards the closest real neighbour. Confirm against the training code.
    """
    query = raw_tfidf.transform([text])
    neighbor_distances = raw_nn.kneighbors(query)[0]
    similarities = 1 - neighbor_distances
    # Single query row; every neighbour except the first one.
    candidates = similarities[0, 1:]
    return candidates.max()

def lemm_review_similarity(text):
    """Return the highest similarity between lemmatized ``text`` and its indexed neighbours.

    ``text`` is vectorized with ``lemm_tfidf`` and looked up in ``lemm_nn``;
    similarity is taken as ``1 - distance`` and the first returned neighbour
    is excluded before taking the maximum.

    NOTE(review): assumes the index uses a distance for which ``1 - d`` is a
    similarity (e.g. cosine) — TODO confirm.
    NOTE(review): column 0 is skipped, presumably to drop a self-match when
    the text is already part of the fitted index; for unseen text this
    discards the closest real neighbour. Confirm against the training code.
    """
    query = lemm_tfidf.transform([text])
    neighbor_distances = lemm_nn.kneighbors(query)[0]
    similarities = 1 - neighbor_distances
    # Single query row; every neighbour except the first one.
    candidates = similarities[0, 1:]
    return candidates.max()

def category_consistency(text, category):
    """Return the cosine similarity between ``text`` and its category centroid.

    ``text`` is vectorized with ``lemm_tfidf`` and compared against
    ``category_centroids[category]``. Unknown categories yield 0.0.

    NOTE(review): assumes each stored centroid is a 2-D, single-row
    array/matrix as ``cosine_similarity`` requires — TODO confirm.
    """
    if category in category_centroids:
        doc_vector = lemm_tfidf.transform([text])
        similarity = cosine_similarity(doc_vector, category_centroids[category])
        return similarity[0][0]
    return 0.0

In [7]:
def predict_review(review, rating, category):
    """Classify a single review as "Genuine" or "Fake".

    Parameters
    ----------
    review : str
        Review text as written by the user.
    rating : int or float
        Rating attached to the review; 1 and 5 are treated as extreme.
    category : str
        Product category, used to look up the category centroid.

    Returns
    -------
    dict
        ``{"prediction": "Genuine" | "Fake", "probability": float}``.
        ``prediction`` is "Genuine" when the classifier predicts label 1.
        ``probability`` is column 1 of ``clf.predict_proba`` regardless of
        the predicted label.
        NOTE(review): presumably that column is P(label == 1), i.e.
        P(Genuine) — confirm against ``clf.classes_``.
    """
    # Surface statistics are measured on the untouched review text.
    features = {
        'text_length': len(review),
        'capital_ratio': capital_letter_ratio(review),
        'punctuation_ratio': punctuation_ratio(review),
        'repetition_score': repetition_score(review),
    }

    # Normalised text: contractions expanded first, then cleaned.
    normalized = clean_text(expand_contractions(review))
    adjective_share = adjective_ratio(normalized)
    polarity = sentiment_score(normalized)
    features.update({
        'adjective_ratio': adjective_share,
        'sentiment_score': polarity,
        'rating_sentiment_mismatch': rating_sentiment_mismatch(polarity, rating),
        'is_extreme_rating': int(rating in [1, 5]),
    })

    # Lemmatized form feeds both the TF-IDF columns and the lemma-based features.
    lemmatized = lemmatize(normalized)

    # Features that depend on the fitted indexes / centroids.
    features['raw_review_similarity'] = raw_review_similarity(review)
    features['review_similarity_score'] = lemm_review_similarity(lemmatized)
    features['category_consistency_score'] = category_consistency(lemmatized, category)

    # One row of numeric features, ordered as numeric_feature_order dictates.
    numeric_row = np.array([[features[name] for name in numeric_feature_order]])

    # Classifier input: TF-IDF columns first, numeric columns appended.
    model_input = hstack([lemm_tfidf.transform([lemmatized]), numeric_row])

    predicted_label = clf.predict(model_input)[0]
    class_one_probability = clf.predict_proba(model_input)[0, 1]

    if predicted_label == 1:
        verdict = "Genuine"
    else:
        verdict = "Fake"

    return {
        "prediction": verdict,
        "probability": float(class_one_probability)
    }

In [13]:
# Smoke test with deliberately conflicting input: strongly negative wording
# paired with the top rating (5).
# NOTE(review): the recorded output below labels this review "Genuine"
# (probability ~0.64) — confirm that is the intended behaviour for such a
# text/rating mismatch.
result = predict_review(
    review="The product is very bad and I am extremely disappointed with my purchase.",
    rating=5,
    category="Accessories"
)
print(result)


{'prediction': 'Genuine', 'probability': 0.6404654947180597}


In [10]:
import joblib
import pandas as pd

# Build a table of classifier coefficients, one row per input column, sorted
# from the most positive weight to the most negative.

# Load trained model
# NOTE(review): same pickle path as `clf` in the loading cell above; this
# reloads it under a second name.
lr = joblib.load("../joblib/logistic_fake_review_model.pkl")

# Load numeric feature names
# NOTE(review): same pickle path as `numeric_feature_order` above. The list
# concatenation further down presumably relies on this being a plain list —
# confirm.
numeric_features = joblib.load("../joblib/numeric_feature_order.pkl")

# Total number of features model was trained on
total_features = lr.coef_.shape[1]

# TF-IDF feature count: whatever is left after the numeric columns.
tfidf_dim = total_features - len(numeric_features)

# Create feature name list
# Layout assumption: TF-IDF columns first, numeric columns last — this matches
# the hstack order used in predict_review. TF-IDF columns get positional
# placeholder names ("tfidf_<index>"), not vocabulary terms.
all_features = (
    [f"tfidf_{i}" for i in range(tfidf_dim)] +
    numeric_features
)

# Create importance dataframe
# Only the first row of coef_ is used.
coef_df = pd.DataFrame({
    "feature": all_features,
    "weight": lr.coef_[0]
}).sort_values(by="weight", ascending=False)

# Show the 20 largest weights.
coef_df.head(20)



Unnamed: 0,feature,weight
40,tfidf_40,1.589861
5,tfidf_5,1.522377
9,tfidf_9,1.178359
44,tfidf_44,1.150399
34,tfidf_34,1.137174
28,tfidf_28,0.974208
21,tfidf_21,0.877822
24,tfidf_24,0.806413
49,tfidf_49,0.743794
60,punctuation_ratio,0.732988


In [11]:
# Coefficients of the hand-crafted numeric features only, strongest positive
# weight first.
(
    coef_df
    .loc[coef_df["feature"].isin(numeric_features)]
    .sort_values("weight", ascending=False)
)


Unnamed: 0,feature,weight
60,punctuation_ratio,0.732988
66,review_similarity_score,0.375728
61,adjective_ratio,0.244999
62,sentiment_score,0.230189
68,is_extreme_rating,0.021301
58,text_length,0.005975
63,rating_sentiment_mismatch,-0.478925
65,category_consistency_score,-0.712102
59,capital_ratio,-0.943445
67,repetition_score,-11.260963
