In [35]:
import pandas as pd
import re
import spacy
import nltk
import joblib

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

#### Loading Dataset


In [36]:
file_path = "../dataset/raw_dataset.csv"
df_r=pd.read_csv(file_path)

In [37]:
df_r = df_r.drop_duplicates(subset='review').reset_index(drop=True)

In [38]:
df_r['label'].value_counts()

label
OR    20215
CG    20197
Name: count, dtype: int64

In [39]:
df_r['label'] = df_r['label'].map({'CG': 0, 'OR': 1})

#### Pre Pre-processing


##### Behavioral Features

In [40]:
# capital letter ratio
def capital_letter_ratio(text):
    if not isinstance(text, str):
        return 0.0

    letters = [c for c in text if c.isalpha()]
    if len(letters) == 0:
        return 0.0

    capital_letters = [c for c in letters if c.isupper()]
    return len(capital_letters) / len(letters)

df_r['capital_ratio'] = df_r['review'].apply(capital_letter_ratio)

In [41]:
#punctuation_ratio
def punctuation_ratio(review):
    if not isinstance(review, str) or len(review) == 0:
        return 0.0

    punct_count = len(re.findall(r"[^\w\s]", review))
    return punct_count / len(review)
df_r['punctuation_ratio'] = df_r['review'].apply(punctuation_ratio)

In [42]:
df_r["text_length"] = df_r["review"].astype(str).apply(len)

In [43]:
# repetition score
def repetition_score(review):
    words = re.findall(r'\b\w+\b', review.lower())
    if len(words) == 0:
        return 0
    return 1 - (len(set(words)) / len(words))

df_r['repetition_score'] = df_r['review'].apply(repetition_score)

In [44]:
# Step 1: Prepare raw reviews
reviews = df_r['review'].fillna("").astype(str).tolist()

# Step 2: TF-IDF vectorization (raw text)
raw_tfidf = TfidfVectorizer(
    analyzer='char_wb',  
    ngram_range=(3,5),    
    max_features=5000,   
)

X_raw = raw_tfidf.fit_transform(reviews)  

# Using cosine distance, sparse matrix compatible
raw_nn = NearestNeighbors(
    n_neighbors=5, 
    metric='cosine', 
    algorithm='brute').fit(X_raw)

# Compute nearest neighbors distances for each review
distances, indices = raw_nn.kneighbors(X_raw)

# Step 4: Convert to similarity score
raw_review_similarity_score = 1 - distances

# Take **max similarity among neighbors (excluding self)**
max_sim = [max(row[1:]) for row in raw_review_similarity_score] 

# Step 5: Add to dataframe
df_r['raw_review_similarity'] = max_sim

In [45]:
# Save models
joblib.dump(raw_tfidf, "../joblib/raw_tfidf_vectorizer.pkl")
joblib.dump(raw_nn, "../joblib/raw_nn_model.pkl")

print("RAW TF-IDF and NN model saved successfully")

RAW TF-IDF and NN model saved successfully


#### Pre-processing


In [46]:
# contraction expansion
import contractions
def expand_contractions(review):
    if pd.isna(review):
        return ""
    return contractions.fix(review)
df_r['expanded_text'] = df_r['review'].apply(expand_contractions)

In [47]:
# cleaning text - lowercase, remove url, html tags, punctiation, whitespaces
def clean_text(review):
    if pd.isna(review):
        return ""
    
    review = review.lower()
    
    review = re.sub(r'http\S+|www\S+', '', review)
    review = re.sub(r'<.*?>', '', review)
    
    # remove punctuation (letters + spaces only)
    review = re.sub(r'[^a-z\s]', '', review)    
    review = re.sub(r'\s+', ' ', review).strip()
    
    return review

df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

##### Linguistic Features

In [48]:
#adjective ratio
def adjective_ratio(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0
    
    tokens = word_tokenize(text)        
    pos_tags = pos_tag(tokens)          
    
    adj_count = sum(1 for word, tag in pos_tags if tag.startswith('JJ'))
    total_words = len(tokens)
    
    return adj_count / total_words if total_words > 0 else 0
df_r['adjective_ratio'] = df_r['clean_text'].apply(adjective_ratio)

In [49]:
# sentiment score
sia = SentimentIntensityAnalyzer()

def sentiment_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0
    return sia.polarity_scores(text)["compound"]

df_r["sentiment_score"] = df_r["clean_text"].apply(sentiment_score)

##### Rating Based Features

In [50]:
# rating polarity
def rating_polarity(r):
    if r >= 4:
        return 1
    elif r <= 2:
        return -1
    return 0

df_r["rating_polarity"] = df_r["rating"].apply(rating_polarity)

In [51]:
# rating-sentiment mismatch
def rating_sentiment_mismatch(row):
    sentiment = row['sentiment_score']
    rating_pol = row['rating_polarity']

    if rating_pol == 1 and sentiment < -0.2:
        return 1
    if rating_pol == -1 and sentiment > 0.2:
        return 1
    return 0

df_r['rating_sentiment_mismatch'] = df_r.apply(
    rating_sentiment_mismatch, axis=1
)

In [52]:
# extreme rating
df_r['is_extreme_rating'] = df_r['rating'].apply(
    lambda x: 1 if x in [1, 5] else 0
)

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

clean_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=0.05,
    max_df=0.9,
    stop_words='english'
)

X_clean = clean_tfidf.fit_transform(df_r['clean_text'])

In [54]:
# category centroids
import numpy as np

category_centroids = {}

for cat in df_r['category'].unique():
    idx = df_r[df_r['category'] == cat].index
    category_centroids[cat] = np.asarray(X_clean[idx].mean(axis=0))

In [55]:
# category consistency using TF-IDF
from sklearn.metrics.pairwise import cosine_similarity

def category_consistency_tfidf(i, category):
    if category not in category_centroids:
        return 0.0

    review_vec = X_clean[i]
    centroid_vec = category_centroids[category]

    return cosine_similarity(review_vec, centroid_vec)[0][0]

In [56]:
df_r['category_consistency_score'] = [
    category_consistency_tfidf(i, cat)
    for i, cat in enumerate(df_r['category'])
]

joblib.dump(category_centroids, "../joblib/category_centroids.pkl")

print("Category centroids saved")

Category centroids saved


In [57]:
#  lemmatization

# nltk resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data.

True

In [58]:
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(review):
    if pd.isna(review) or review == "":
        return ""
    
    doc = nlp(review)
    
    lemmatized_words = [
        token.lemma_
        for token in doc
        if not token.is_space
    ]
    
    return " ".join(lemmatized_words)

df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [59]:
# TF-IDF on lemmatized text
lemm_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),       
    max_features=5000,     
    min_df=0.05,
    max_df=0.9,
    stop_words='english'
)
X_lemm = lemm_tfidf.fit_transform(df_r['lemmatized_text'])


In [60]:
lemm_nn = NearestNeighbors(
    n_neighbors=5,
    metric="cosine",
    algorithm="brute"
).fit(X_lemm)

In [61]:
distances, indices = lemm_nn.kneighbors(X_lemm)

# distances are cosine distances â†’ convert to similarity
similarities = 1 - distances

# ignore self-similarity (index 0)
df_r['review_similarity_score'] = similarities[:, 1:].max(axis=1)

In [62]:
# Save models
joblib.dump(lemm_tfidf, "../joblib/lemm_tfidf_vectorizer.pkl")
joblib.dump(lemm_nn, "../joblib/lemm_nn_model.pkl")

print("Lemmatized TF-IDF and NN model saved")

Lemmatized TF-IDF and NN model saved


In [63]:
from scipy.sparse import hstack

# Feature names (LIST)
numeric_feature_names = [
    'text_length',
    'capital_ratio',
    'punctuation_ratio',
    'adjective_ratio',
    'sentiment_score',
    'rating_sentiment_mismatch',
    'raw_review_similarity',
    'category_consistency_score',
    'review_similarity_score',
    'repetition_score',        
    'is_extreme_rating'
]

# Feature values (NUMPY ARRAY)
numeric_features = (
    df_r[numeric_feature_names]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .values
    .astype(np.float64)
)
# Combine with TF-IDF
X_final = hstack([X_lemm, numeric_features])

KeyError: "None of [Index(['text_length', 'capital_ratio', 'punctuation_ratio', 'adjective_ratio',\n       'sentiment_score', 'rating_sentiment_mismatch', 'raw_review_similarity',\n       'category_consistency_score', 'review_similarity_score',\n       'repetition_score', 'is_extreme_rating'],\n      dtype='object')] are in the [columns]"

In [64]:
# Save feature order for inference
joblib.dump(
    numeric_feature_names,
    "../joblib/numeric_feature_order.pkl"
)

['../joblib/numeric_feature_order.pkl']

In [65]:
df_r.columns

Index(['category', 'rating', 'label', 'review', 'capital_ratio',
       'punctuation_ratio', 'text_length', 'repetition_score',
       'raw_review_similarity', 'expanded_text', 'clean_text',
       'adjective_ratio', 'sentiment_score', 'rating_polarity',
       'rating_sentiment_mismatch', 'is_extreme_rating',
       'category_consistency_score', 'review_similarity_score'],
      dtype='object')

In [66]:
pre_df = df_r[
    ['category', 'rating', 'label', 'review', 'capital_ratio',
    'punctuation_ratio', 'text_length', 'repetition_score',
    'raw_review_similarity', 'expanded_text', 'clean_text',
    'adjective_ratio', 'sentiment_score', 'rating_polarity',
    'rating_sentiment_mismatch', 'is_extreme_rating',
    'category_consistency_score', 'lemmatized_text',
    'review_similarity_score']
]

print("X_final, y_final, numeric features  saved successfully.")

X_final, y_final, numeric features  saved successfully.


In [67]:
y_final = df_r['label'].values

In [68]:
joblib.dump(X_final, "../joblib/X_final_features.pkl")
joblib.dump(y_final, "../joblib/y_final_labels.pkl")

print("X_final and y_final saved successfully in ../joblib/")

X_final and y_final saved successfully in ../joblib/


In [69]:
import joblib
import os

pre_df.to_csv("../dataset/preprocessed_dataset.csv", index=False)

print("Preprocessing completed successfully")

Preprocessing completed successfully


In [36]:
y_final = df_r['label'].values

In [37]:
import joblib
import os

joblib.dump(X_final, "../joblib/X_final_features.pkl")
joblib.dump(y_final, "../joblib/y_final_labels.pkl")

print("X_final and y_final saved successfully in ../joblib/")


X_final and y_final saved successfully in ../joblib/
