In [None]:
import pandas as pd
import numpy as np
import re
import contractions
import joblib
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack


In [None]:
# Load every persisted artifact used at inference time.
# NOTE(review): joblib.load unpickles its input, so these files must come
# from a trusted source.

# Text vectorizers (used via .transform() on raw and on cleaned text).
tfidf_raw = joblib.load("../joblib/raw_tfidf_vectorizer.pkl")
tfidf_lemm = joblib.load("../joblib/lemm_tfidf_vectorizer.pkl")

# Nearest-neighbour models (queried via .kneighbors() for similarity features).
nn_raw = joblib.load("../joblib/raw_nn_model.pkl")
nn_lemm = joblib.load("../joblib/lemm_nn_model.pkl")

# Per-category centroids, looked up by category key.
category_centroids = joblib.load("../joblib/category_centroids.pkl")

# Final classifier (queried via .predict_proba()).
classifier = joblib.load("../joblib/logistic_fake_review_model.pkl")

# Sentiment analyzer shared by sentiment_score(); relies on NLTK's VADER
# lexicon resource being installed.
sia = SentimentIntensityAnalyzer()

In [None]:
# Helper functions

def capital_letter_ratio(text):
    """Return the share of alphabetic characters in ``text`` that are uppercase.

    Non-alphabetic characters are ignored entirely. Returns 0 when ``text``
    contains no alphabetic characters.
    """
    alpha_total = 0
    upper_total = 0
    for ch in text:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1

    if alpha_total == 0:
        return 0
    return upper_total / alpha_total

def punctuation_ratio(text):
    """Return the fraction of characters in ``text`` that are punctuation.

    A character counts as punctuation when it is neither a word character
    (``\\w``, which includes the underscore) nor whitespace. Returns 0 for
    empty or otherwise falsy input.
    """
    if text:
        punctuation_marks = re.findall(r"[^\w\s]", text)
        return len(punctuation_marks) / len(text)
    return 0

def repetition_score(text):
    """Return how repetitive ``text`` is, as ``1 - unique_words / total_words``.

    ``text`` is converted with ``str()`` and lowercased before words are
    extracted, so the comparison is case-insensitive. Returns 0 when no words
    are found.
    """
    lowered = str(text).lower()
    tokens = re.findall(r'\b\w+\b', lowered)
    total = len(tokens)
    if total == 0:
        return 0
    distinct = len(set(tokens))
    return 1 - distinct / total

def expand_text(text):
    """Return ``text`` with contractions expanded via ``contractions.fix``.

    Missing values (anything ``pd.isna`` reports as NA) become an empty string.
    """
    return "" if pd.isna(text) else contractions.fix(text)

def clean_text(text):
    """Normalize ``text`` to lowercase ASCII letters separated by single spaces.

    Missing values (per ``pd.isna``) become an empty string. Otherwise the text
    is lowercased and then, in order: URLs are removed, ``<...>`` tags are
    removed, every character that is not ``a-z`` or whitespace is removed, and
    whitespace runs are collapsed to one space before trimming the ends.
    """
    if pd.isna(text):
        return ""

    # Applied sequentially; each step operates on the previous step's output.
    substitutions = (
        (r'http\S+|www\S+', ''),   # URLs
        (r'<.*?>', ''),            # markup tags
        (r'[^a-z\s]', ''),         # anything other than lowercase letters/whitespace
        (r'\s+', ' '),             # collapse whitespace runs
    )

    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

def adjective_ratio(text):
    """Return the fraction of tokens in ``text`` whose POS tag starts with ``JJ``.

    Tokens come from ``word_tokenize`` and tags from ``pos_tag``. Returns 0 for
    blank text or when tokenization yields no tokens.
    """
    if not text.strip():
        return 0

    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    adjectives = [word for word, tag in tagged if tag.startswith('JJ')]

    if not tokens:
        return 0
    return len(adjectives) / len(tokens)

def sentiment_score(text):
    """Return the ``'compound'`` polarity score of ``text`` from ``sia``.

    Blank (empty or whitespace-only) text scores 0 without calling the analyzer.
    """
    if text.strip():
        scores = sia.polarity_scores(text)
        return scores['compound']
    return 0

def rating_polarity(r):
    """Map a rating to a polarity: 1 for ``r >= 4``, -1 for ``r <= 2``, else 0."""
    if r >= 4:
        return 1
    if r <= 2:
        return -1
    return 0

def rating_sentiment_mismatch(sentiment, rating):
    """Return 1 when the rating's polarity contradicts the text sentiment, else 0.

    A mismatch is a positive rating (polarity 1) with ``sentiment < -0.2``, or
    a negative rating (polarity -1) with ``sentiment > 0.2``. Neutral ratings
    never count as a mismatch.
    """
    polarity = rating_polarity(rating)
    praised_but_negative_text = polarity == 1 and sentiment < -0.2
    panned_but_positive_text = polarity == -1 and sentiment > 0.2
    return 1 if (praised_but_negative_text or panned_but_positive_text) else 0

def is_extreme_rating(r):
    """Return 1 when the rating equals 1 or 5, otherwise 0."""
    return int(r in (1, 5))



In [None]:

def raw_review_similarity(review_text):
    """Return the highest ``1 - distance`` among the review's raw-text neighbours.

    The text is vectorized with ``tfidf_raw`` and looked up in ``nn_raw``; the
    first returned neighbour is skipped before taking the maximum.

    NOTE(review): skipping the first neighbour was originally described as
    "excluding self", which presumably only holds when the review is already
    part of the fitted index; confirm for unseen reviews. Reading
    ``1 - distance`` as a similarity also assumes a cosine-style metric; verify.
    """
    query_vector = tfidf_raw.transform([review_text])
    neighbour_distances, _ = nn_raw.kneighbors(query_vector)
    # Drop column 0 (the closest neighbour) before converting to similarity.
    other_distances = neighbour_distances[0][1:]
    return (1 - other_distances).max()

def clean_review_similarity(review_text):
    """Return the highest ``1 - distance`` among the review's cleaned-text neighbours.

    The text is vectorized with ``tfidf_lemm`` and looked up in ``nn_lemm``;
    the first returned neighbour is skipped before taking the maximum.

    NOTE(review): skipping the first neighbour presumably assumes the query is
    itself in the fitted index; confirm for unseen reviews.
    """
    query_vector = tfidf_lemm.transform([review_text])
    neighbour_distances, _ = nn_lemm.kneighbors(query_vector)
    # Drop column 0 (the closest neighbour) before converting to similarity.
    other_distances = neighbour_distances[0][1:]
    return (1 - other_distances).max()

def category_consistency_score(cleaned_text, category):
    """Return the cosine similarity between the text and its category centroid.

    The text is vectorized with ``tfidf_lemm`` and compared against
    ``category_centroids[category]``. Categories without a stored centroid
    score 0.0.
    """
    if category in category_centroids:
        text_vector = tfidf_lemm.transform([cleaned_text])
        similarity_matrix = cosine_similarity(text_vector, category_centroids[category])
        # A single query against a single centroid: take the only cell.
        return similarity_matrix[0][0]
    return 0.0

In [None]:
def preprocess_and_predict(review_text, rating, category):
    """Build the feature vector for one review and classify it as fake or genuine.

    The cleaned text is vectorized with ``tfidf_lemm`` and stacked column-wise
    with eleven hand-crafted numeric features before being passed to
    ``classifier``.

    Parameters
    ----------
    review_text : str
        Raw review text. Length, capitalization, punctuation, repetition and
        raw similarity are computed on it unmodified.
    rating : int or float
        Rating attached to the review; 1 and 5 count as extreme, ``>= 4`` as
        positive and ``<= 2`` as negative.
    category : hashable
        Key into ``category_centroids``. A category with no centroid yields a
        category score of 0.

    Returns
    -------
    tuple
        ``(label, prob)``, where ``prob`` is ``predict_proba(...)[0][1]`` (the
        classifier's second class, treated here as the fake class) and
        ``label`` is ``"FAKE"`` when ``prob >= 0.5``, otherwise ``"GENUINE"``.

    Raises
    ------
    AssertionError
        If the assembled feature vector's width differs from
        ``classifier.n_features_in_``.
    """
    # --- Stateless features (computed on the raw, unmodified text) ---
    text_length = len(review_text)
    capital_ratio = capital_letter_ratio(review_text)
    punct_ratio = punctuation_ratio(review_text)
    rep_score = repetition_score(review_text)
    # Shared helper instead of an inline copy, so the rule lives in one place.
    extreme_rating = is_extreme_rating(rating)

    # --- Features computed on the expanded + cleaned text ---
    # NOTE(review): the vectorizer/index names suggest they were fitted on
    # lemmatized text, but no lemmatization happens here; confirm this matches
    # the training-time preprocessing.
    cleaned_text = clean_text(expand_text(review_text))
    adj_ratio = adjective_ratio(cleaned_text)
    sent_score = sentiment_score(cleaned_text)
    mismatch = rating_sentiment_mismatch(sent_score, rating)

    # --- Raw similarity ---
    # NOTE(review): the helper skips the first neighbour ("excluding self"),
    # which presumably only holds if the review is in the fitted index; confirm
    # for unseen reviews.
    raw_sim = raw_review_similarity(review_text)

    # --- Clean similarity ---
    # Kept inline (rather than calling clean_review_similarity) so clean_vec is
    # computed once and reused for the category score and the final stack.
    clean_vec = tfidf_lemm.transform([cleaned_text])
    clean_dist, _ = nn_lemm.kneighbors(clean_vec)
    review_sim = (1 - clean_dist[:, 1:]).max()

    # --- Category consistency ---
    centroid = category_centroids.get(category)
    cat_score = cosine_similarity(clean_vec, centroid)[0][0] if centroid is not None else 0

    # --- Numeric features (ORDER MUST MATCH TRAINING) ---
    numeric_features = np.array([[
        text_length,
        capital_ratio,
        punct_ratio,
        adj_ratio,
        sent_score,
        mismatch,
        raw_sim,
        cat_score,
        review_sim,
        rep_score,
        extreme_rating
    ]])

    # --- FINAL FEATURE VECTOR: TF-IDF columns first, numeric columns after ---
    X_final = hstack([clean_vec, numeric_features])

    # --- Sanity check ---
    assert X_final.shape[1] == classifier.n_features_in_, (
        f"feature vector has {X_final.shape[1]} columns but the classifier "
        f"expects {classifier.n_features_in_}"
    )

    # --- Predict ---
    prob = classifier.predict_proba(X_final)[0][1]
    label = "FAKE" if prob >= 0.5 else "GENUINE"

    return label, prob