In [None]:
import pandas as pd
import re
import spacy
import nltk
import numpy as np
import joblib
import contractions

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack

In [None]:
# Download the NLTK data packages this notebook requests
# (same packages, same order as before, listed once).
NLTK_RESOURCES = (
    "punkt",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "punkt_tab",
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

#### Loading Dataset


In [None]:
# Read the raw reviews, keep only the first occurrence of each review text,
# and renumber the rows 0..n-1.
file_path = "../dataset/raw_dataset.csv"
df_r = (
    pd.read_csv(file_path)
    .drop_duplicates(subset="review")
    .reset_index(drop=True)
)

# Encode the label column as integers; any value other than "CG" / "OR"
# is turned into NaN by Series.map.
label_codes = {"CG": 0, "OR": 1}
df_r["label"] = df_r["label"].map(label_codes)

#### Raw-Text Features (Before Pre-processing)


##### Behavioral Features

In [None]:
def capital_letter_ratio(text):
    """Return the share of alphabetic characters in ``text`` that are uppercase.

    Non-string input and text without any alphabetic character yield 0.0.
    """
    if not isinstance(text, str):
        return 0.0

    alpha_total = 0
    upper_total = 0
    for ch in text:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1

    return upper_total / alpha_total if alpha_total else 0.0

# Element-wise uppercase ratio of the raw review text.
df_r["capital_ratio"] = df_r["review"].map(capital_letter_ratio)

In [None]:
def punctuation_ratio(text):
    """Return the fraction of characters in ``text`` that are neither word
    characters nor whitespace.

    Non-string or empty input yields 0.0.
    """
    if isinstance(text, str) and text:
        marks = re.findall(r"[^\w\s]", text)
        return len(marks) / len(text)
    return 0.0

# Element-wise punctuation ratio of the raw review text.
df_r["punctuation_ratio"] = df_r["review"].map(punctuation_ratio)

In [None]:
# Character count of each raw review. Missing reviews are filled with ""
# first (as the raw TF-IDF cell does), so they count as length 0 instead of
# the 3 characters of the string "nan" that astype(str) would produce.
df_r["text_length"] = df_r["review"].fillna("").astype(str).apply(len)

In [None]:
def repetition_score(text):
    """Return 1 minus the ratio of distinct words to total words.

    ``text`` is converted with ``str()`` and lowercased before words are
    extracted; input without any word yields 0.
    """
    tokens = re.findall(r"\b\w+\b", str(text).lower())
    total = len(tokens)
    if total == 0:
        return 0
    distinct = len(set(tokens))
    return 1 - (distinct / total)

# Element-wise word-repetition score of the raw review text.
df_r["repetition_score"] = df_r["review"].map(repetition_score)

In [None]:
# Raw TF-IDF Similarity
# Character n-gram TF-IDF over the raw review text; each review is scored by
# the highest cosine similarity among its returned neighbours.
reviews = df_r["review"].fillna("").astype(str).tolist()

raw_tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), max_features=5000)
X_raw = raw_tfidf.fit_transform(reviews)

raw_nn = NearestNeighbors(n_neighbors=5, metric="cosine", algorithm="brute")
raw_nn.fit(X_raw)

# kneighbors returns (distances, indices); only the distances are used here.
distances, _ = raw_nn.kneighbors(X_raw)
similarities = 1 - distances

# The first neighbour column is dropped (treated as the review matching itself).
df_r["raw_review_similarity"] = similarities[:, 1:].max(axis=1)

# Persist the fitted vectorizer and neighbour index.
for artifact, artifact_path in (
    (raw_tfidf, "../joblib/raw_tfidf_vectorizer.pkl"),
    (raw_nn, "../joblib/raw_nn_model.pkl"),
):
    joblib.dump(artifact, artifact_path)
print("RAW TF-IDF and NN model saved successfully")

#### Pre-processing


In [None]:
# contraction expansion
def expand_contractions(text):
    """Return ``text`` passed through ``contractions.fix``; missing values
    (anything ``pd.isna`` reports as NA) become an empty string."""
    return "" if pd.isna(text) else contractions.fix(text)

# Contraction-expanded copy of every raw review.
df_r['expanded_text'] = df_r['review'].map(expand_contractions)

In [None]:
# clean text
def clean_text(text):
    """Normalise ``text`` to lowercase a-z words separated by single spaces.

    Steps, in order: lowercase, remove URLs, remove ``<...>`` tags, remove
    every character that is not a-z or whitespace, collapse whitespace runs.
    Missing values become an empty string.
    """
    if pd.isna(text):
        return ""

    cleaned = text.lower()
    # Each pattern is deleted outright (replaced by nothing), in this order.
    for pattern in (r"http\S+|www\S+", r"<.*?>", r"[^a-z\s]"):
        cleaned = re.sub(pattern, "", cleaned)

    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()

# Cleaned version of the contraction-expanded text.
df_r["clean_text"] = df_r["expanded_text"].map(clean_text)

##### Linguistic Features

In [None]:
# adjective_ratio
def adjective_ratio(text):
    """Return the fraction of tokens in ``text`` whose POS tag starts with "JJ".

    Non-string or blank input, and text that yields no tokens, give 0.
    """
    if not isinstance(text, str) or not text.strip():
        return 0

    tokens = word_tokenize(text)
    if not tokens:
        return 0

    adjective_tags = [tag for _, tag in pos_tag(tokens) if tag.startswith("JJ")]
    return len(adjective_tags) / len(tokens)

# Adjective share of every cleaned review.
df_r["adjective_ratio"] = df_r["clean_text"].map(adjective_ratio)

In [None]:
# sentiment score
# One shared analyzer instance, created once and reused by sentiment_score()
# for every review.
sia = SentimentIntensityAnalyzer()

def sentiment_score(text):
    """Return the "compound" polarity score that ``sia`` assigns to ``text``.

    Non-string or blank input yields 0.0.
    """
    if isinstance(text, str) and text.strip() != "":
        scores = sia.polarity_scores(text)
        return scores["compound"]
    return 0.0

# Compound sentiment of every cleaned review.
df_r["sentiment_score"] = df_r["clean_text"].map(sentiment_score)

##### Rating-Based Features

In [None]:
# rating polarity
def rating_polarity(r):
    """Map a rating to a polarity: 1 for ``r >= 4``, -1 for ``r <= 2``, else 0."""
    if r <= 2:
        return -1
    if r >= 4:
        return 1
    # Everything in between, and values for which both comparisons are
    # False (e.g. NaN), is neutral.
    return 0

# Polarity bucket of every star rating.
df_r["rating_polarity"] = df_r["rating"].map(rating_polarity)

In [None]:
# rating sentiment mismatch
def rating_sentiment_mismatch(row):
    """Return 1 when the rating polarity and the text sentiment disagree, else 0.

    A positive rating (polarity 1) with sentiment below -0.2, or a negative
    rating (polarity -1) with sentiment above 0.2, counts as a mismatch.
    ``row`` must support lookup by "rating_polarity" and "sentiment_score".
    """
    polarity = row["rating_polarity"]
    if polarity == 1:
        contradicts = row["sentiment_score"] < -0.2
    elif polarity == -1:
        contradicts = row["sentiment_score"] > 0.2
    else:
        contradicts = False
    return 1 if contradicts else 0

# Row-wise flag: needs both rating_polarity and sentiment_score of each row.
df_r["rating_sentiment_mismatch"] = df_r.apply(rating_sentiment_mismatch, axis="columns")

In [None]:
# extreme rating
# 1 when the rating equals one of the two endpoint values (1 or 5), else 0.
extreme_ratings = [1, 5]
df_r["is_extreme_rating"] = df_r["rating"].map(
    lambda rating: int(rating in extreme_ratings)
)

In [None]:
# Lemmatization
# Load the spaCy "en_core_web_sm" pipeline once; lemmatize_text() calls it
# for every review.
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    Tokens flagged as whitespace are skipped; non-string or blank input
    yields an empty string.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    lemmas = []
    for token in nlp(text):
        if token.is_space:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Lemmatized form of every cleaned review.
df_r["lemmatized_text"] = df_r["clean_text"].map(lemmatize_text)

In [None]:
# Lemmatized TF-IDF Similarity
# Word uni-/bi-gram TF-IDF over the lemmatized reviews, with the built-in
# English stop-word list and a 5000-feature cap.
lemm_tfidf = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1, 2))

lemmatized_corpus = df_r["lemmatized_text"]
X_lemm = lemm_tfidf.fit_transform(lemmatized_corpus)

In [None]:
# category consistency using TF-IDF
# Each category is represented by the mean TF-IDF vector (centroid) of its
# reviews; a review's score is the cosine similarity between its own vector
# and the centroid of its category.
category_centroids = {}
category_scores = np.zeros(X_lemm.shape[0], dtype=float)

# dropna(): a missing category matches no rows, so there is nothing to average.
for cat in df_r["category"].dropna().unique():
    # A boolean mask selects rows of X_lemm by position, so the lookup does
    # not rely on df_r carrying a 0..n-1 index.
    in_category = (df_r["category"] == cat).to_numpy()
    category_rows = X_lemm[in_category]

    centroid = np.asarray(category_rows.mean(axis=0))
    category_centroids[cat] = centroid

    # X_lemm already holds the TF-IDF vector of every review, so the whole
    # category is scored in one call instead of re-vectorizing row by row.
    category_scores[in_category] = cosine_similarity(
        category_rows, centroid
    ).ravel()


def category_consistency(text, category):
    """Return the cosine similarity between ``text`` and a category centroid.

    ``text`` is vectorized with the fitted ``lemm_tfidf`` and compared with
    ``category_centroids[category]``. Returns 0.0 when ``category`` has no
    stored centroid. Kept for scoring a single (lemmatized) text; the
    training column below is computed in batch from ``X_lemm``.
    """
    if category not in category_centroids:
        return 0.0

    vec = lemm_tfidf.transform([text])
    centroid = category_centroids[category]

    return cosine_similarity(vec, centroid)[0][0]


# Rows with a missing category keep the 0.0 default, the same value
# category_consistency() returns for a category without a centroid.
df_r["category_consistency_score"] = category_scores

joblib.dump(category_centroids, "../joblib/category_centroids.pkl")
print("Category centroids saved (lemmatized TF-IDF space)")

In [None]:
# The previous cell already defines category_consistency() and fills
# df_r["category_consistency_score"]. This cell used to repeat both verbatim,
# which shadowed the function and recomputed the whole column a second time.
# It now only summarises the feature that was computed above.
df_r["category_consistency_score"].describe()

In [None]:
# Brute-force cosine nearest-neighbour index over the lemmatized TF-IDF matrix.
lemm_nn = NearestNeighbors(n_neighbors=5, metric="cosine", algorithm="brute")
lemm_nn.fit(X_lemm)

In [None]:
# Query the index with the same matrix it was fitted on.
distances, indices = lemm_nn.kneighbors(X_lemm)

# distances are cosine distances -> convert to similarity
similarities = 1.0 - distances

# ignore self-similarity (index 0) and keep the best remaining neighbour
df_r['review_similarity_score'] = np.max(similarities[:, 1:], axis=1)

In [None]:
# Save models
# Persist the fitted lemmatized vectorizer and its neighbour index.
lemm_artifacts = (
    (lemm_tfidf, "../joblib/lemm_tfidf_vectorizer.pkl"),
    (lemm_nn, "../joblib/lemm_nn_model.pkl"),
)
for artifact, artifact_path in lemm_artifacts:
    joblib.dump(artifact, artifact_path)

print("Lemmatized TF-IDF and NN model saved")

In [None]:
# Hand-crafted numeric features appended after the TF-IDF columns, in this order.
# NOTE(review): "category_consistency_score" and "rating_polarity" are computed
# earlier in this notebook but are not listed here -- confirm that is intentional.
numeric_feature_names = [
    "text_length",
    "capital_ratio",
    "punctuation_ratio",
    "adjective_ratio",
    "sentiment_score",
    "rating_sentiment_mismatch",
    "raw_review_similarity",
    "review_similarity_score",
    "repetition_score",
    "is_extreme_rating",
]

# Feature values as one float matrix, columns in numeric_feature_names order.
numeric_features = df_r[numeric_feature_names].to_numpy(dtype=float)

# format="csr": without it SciPy picks the result format itself (COO when a
# dense block is involved), and COO matrices cannot be row-indexed by code
# that later slices X_final into subsets.
X_final = hstack([X_lemm, numeric_features], format="csr")
y_final = df_r["label"].values

In [None]:
# Persist the final feature matrix, the labels, and the numeric feature order
# (the order is saved so features can be rebuilt in the same sequence later).
final_artifacts = (
    (X_final, "../joblib/X_final_features.pkl"),
    (y_final, "../joblib/y_final_labels.pkl"),
    (numeric_feature_names, "../joblib/numeric_feature_order.pkl"),
)
for artifact, artifact_path in final_artifacts:
    joblib.dump(artifact, artifact_path)

print("X_final, y_final, numeric features  saved successfully.")

In [None]:
# Save Preprocessed CSV
# Columns written to the preprocessed dataset, in output order.
# NOTE(review): "expanded_text", "rating_polarity" and
# "category_consistency_score" exist in df_r but are not exported -- confirm.
export_columns = [
    # original fields
    "category", "rating", "label", "review",
    # features computed on the raw text
    "capital_ratio", "punctuation_ratio", "text_length",
    "repetition_score", "raw_review_similarity",
    # cleaned text and the features derived from it
    "clean_text", "adjective_ratio", "sentiment_score",
    "rating_sentiment_mismatch", "is_extreme_rating",
    # lemmatized text and its nearest-neighbour similarity
    "lemmatized_text", "review_similarity_score",
]

pre_df = df_r[export_columns]
pre_df.to_csv("../dataset/preprocessed_dataset.csv", index=False)

print("Preprocessing completed successfully")