In [4]:
import pandas as pd
import re
import spacy
import nltk
import numpy as np
import joblib

from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

#### Loading Dataset


In [None]:

# Relative path of the raw reviews CSV.
file_path = "../dataset/raw_dataset.csv"
# Load the raw data into `df_r`, the working frame that the following cells extend column by column.
df_r=pd.read_csv(file_path)

In [6]:
# Keep one row per distinct review text and renumber the rows 0..n-1.
# Later cells use the index labels to select rows of the TF-IDF matrix, so the reset matters.
df_r = df_r.drop_duplicates(subset='review').reset_index(drop=True)

In [7]:
# Class balance check: number of rows per label value.
df_r['label'].value_counts()

In [8]:
# Encode the string labels as integers: 'CG' -> 0, 'OR' -> 1.
# Series.map turns every value that is not a key of the dict into NaN, so running
# this cell a second time on the already-encoded column would erase all labels.
# Only map while the column is still non-numeric, which keeps the cell re-runnable.
if not pd.api.types.is_numeric_dtype(df_r['label']):
    df_r['label'] = df_r['label'].map({'CG': 0, 'OR': 1})

#### Pre-preprocessing (features computed on the raw review text)


In [9]:
def capital_letter_ratio(text):
    """Return the share of alphabetic characters in ``text`` that are uppercase.

    Non-string input, and text without a single alphabetic character, give 0.0.
    """
    if not isinstance(text, str):
        return 0.0

    alpha_total = 0
    upper_total = 0
    for ch in text:
        if ch.isalpha():
            alpha_total += 1
            if ch.isupper():
                upper_total += 1

    # No letters at all: avoid dividing by zero.
    if alpha_total == 0:
        return 0.0
    return upper_total / alpha_total
# Per-review share of uppercase letters, computed on the raw 'review' column.
df_r['capital_ratio'] = df_r['review'].apply(capital_letter_ratio)

In [10]:
def punctuation_ratio(review):
    """Return the number of punctuation characters divided by the length of ``review``.

    A punctuation character is anything that is neither a word character
    (letters, digits, underscore) nor whitespace. Non-string or empty input
    gives 0.0.
    """
    if not (isinstance(review, str) and review):
        return 0.0

    punct_total = sum(1 for _ in re.finditer(r"[^\w\s]", review))
    return punct_total / len(review)
# Per-review punctuation density, computed on the raw 'review' column.
df_r['punctuation_ratio'] = df_r['review'].apply(punctuation_ratio)

In [None]:
# Character count of each raw review.
# Missing reviews are treated as empty text (length 0), matching the similarity
# cell, which also fills missing reviews with "". Without the fillna, str(NaN)
# would be measured as the 3-character string "nan".
df_r['text_length'] = df_r['review'].fillna("").astype(str).str.len()

In [None]:
# repetition score
def repetition_score(review):
    """Return how repetitive the wording of ``review`` is, in [0, 1).

    The score is ``1 - unique_words / total_words`` on the lower-cased text,
    so 0 means every word is distinct and values near 1 mean heavy repetition.
    Non-string input (e.g. a missing value) and text without any word give 0.0,
    in line with the other raw-text feature functions in this notebook.
    """
    # Missing reviews arrive as float NaN, which has no .lower().
    if not isinstance(review, str):
        return 0.0

    words = re.findall(r'\b\w+\b', review.lower())
    if not words:
        return 0.0
    return 1 - (len(set(words)) / len(words))

# Per-review word repetition score, computed on the raw 'review' column.
df_r['repetition_score'] = df_r['review'].apply(repetition_score)

In [None]:
# Step 1: Prepare raw reviews (missing values become empty strings)
reviews = df_r['review'].fillna("").astype(str).tolist()

# Step 2: TF-IDF vectorization (raw text)
tfidf = TfidfVectorizer(
    analyzer='char_wb',   # character n-grams taken inside word boundaries
    ngram_range=(3,5),    # 3- to 5-character n-grams
    max_features=10000,
)

X = tfidf.fit_transform(reviews)

# Step 3: Nearest-neighbour search
# Using cosine distance, sparse matrix compatible
nbrs = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute').fit(X)

# Compute nearest neighbors distances for each review
distances, indices = nbrs.kneighbors(X)

# Step 4: Convert to similarity score
raw_review_similarity_score = 1 - distances

# Take **max similarity among neighbors (excluding self)**
# The query matrix is the fitted matrix, so column 0 is normally the review itself;
# skip it and take the row-wise maximum in one vectorized call, as the
# lemmatized-text similarity cell further down does.
max_sim = raw_review_similarity_score[:, 1:].max(axis=1)

# Step 5: Add to dataframe
df_r['raw_review_similarity'] = max_sim

#### Pre-processing


In [14]:
# contraction expansion
import contractions
def expand_contractions(review):
    """Expand contractions in ``review`` with ``contractions.fix``.

    Missing values (None / NaN) are returned as the empty string.
    """
    return "" if pd.isna(review) else contractions.fix(review)
# Contraction-expanded copy of the raw review; clean_text is derived from this column.
df_r['expanded_text'] = df_r['review'].apply(expand_contractions)

In [None]:
# cleaning text - lowercase, remove URLs, HTML tags, punctuation, extra whitespace
def clean_text(review):
    """Normalise ``review`` to lower-case letters separated by single spaces.

    URLs, HTML tags and every character that is not a-z or whitespace are
    deleted (not replaced by a space), then runs of whitespace are collapsed
    and the ends trimmed. Missing values give the empty string.
    """
    if pd.isna(review):
        return ""

    cleaned = review.lower()

    # Order matters: URLs and tags must go before their punctuation is stripped.
    for pattern in (r'http\S+|www\S+', r'<.*?>', r'[^a-z\s]'):
        cleaned = re.sub(pattern, '', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()
# Cleaned text is built from the contraction-expanded column, not the raw review.
df_r['clean_text'] = df_r['expanded_text'].apply(clean_text)

In [None]:
#adjective ratio
def adjective_ratio(text):
    """Return the fraction of tokens in ``text`` tagged as adjectives.

    Tokens come from ``word_tokenize`` and tags from ``pos_tag``; a token
    counts as an adjective when its tag starts with 'JJ'. Non-string or blank
    input gives 0.
    """
    if not (isinstance(text, str) and text.strip() != ""):
        return 0

    words = word_tokenize(text)
    tagged = pos_tag(words)

    if not words:
        return 0

    adjective_tags = [tag for _, tag in tagged if tag.startswith('JJ')]
    return len(adjective_tags) / len(words)
# word_tokenize and pos_tag need NLTK's tokenizer and tagger data, and nothing
# earlier in the notebook downloads it. Request it here so the cell also works
# on a fresh environment; packages that are already up to date are not fetched again.
for _resource in ('punkt', 'punkt_tab',
                  'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(_resource, quiet=True)

# Share of adjective tokens per review, computed on the cleaned text.
df_r['adjective_ratio'] = df_r['clean_text'].apply(adjective_ratio)

In [None]:
# sentiment score
# SentimentIntensityAnalyzer loads the VADER lexicon when it is constructed and
# raises LookupError if the lexicon is not installed. Fetch it on demand so this
# cell runs on a fresh environment without relying on a later download cell.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon', quiet=True)
    sia = SentimentIntensityAnalyzer()
def sentiment_score(text):
    """Return the VADER compound sentiment of ``text`` from the module-level ``sia``.

    Non-string or blank input gives 0.0.
    """
    if isinstance(text, str) and text.strip() != "":
        # Compound score ranges from -1 (very negative) to +1 (very positive)
        scores = sia.polarity_scores(text)
        return scores['compound']

    return 0.0
# Sentiment is scored on the cleaned (lower-cased, punctuation-free) text.
# NOTE(review): VADER is documented to use capitalisation and punctuation as intensity cues,
# which clean_text removes - confirm that scoring the cleaned text is intended.
df_r['sentiment_score'] = df_r['clean_text'].apply(sentiment_score)

In [None]:
# rating polarity
def rating_polarity(r):
    """Map a rating to a polarity: 1 for ratings >= 4, -1 for ratings <= 2, otherwise 0."""
    if r >= 4:
        return 1
    return -1 if r <= 2 else 0

# Polarity (-1 / 0 / 1) of the rating; consumed by rating_sentiment_mismatch below.
df_r['rating_polarity'] = df_r['rating'].apply(rating_polarity)

In [None]:
# rating-sentiment mismatch
def rating_sentiment_mismatch(row):
    """Return 1 when the rating polarity and the text sentiment disagree, else 0.

    ``row`` must provide 'sentiment_score' and 'rating_polarity'. A mismatch is
    a positive rating (polarity 1) with sentiment below -0.2, or a negative
    rating (polarity -1) with sentiment above 0.2.
    """
    score = row['sentiment_score']
    polarity = row['rating_polarity']

    high_rating_negative_text = polarity == 1 and score < -0.2
    low_rating_positive_text = polarity == -1 and score > 0.2

    return 1 if (high_rating_negative_text or low_rating_positive_text) else 0

# Row-wise apply (axis=1): the function receives each row and reads its
# 'sentiment_score' and 'rating_polarity' values.
df_r['rating_sentiment_mismatch'] = df_r.apply(
    rating_sentiment_mismatch, axis=1
)

In [None]:
# extreme rating
# Flag ratings at either end of the scale: 1 when the rating equals 1 or 5, else 0.
df_r['is_extreme_rating'] = df_r['rating'].map(
    lambda rating: int(rating in (1, 5))
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over the cleaned text; the category-centroid cells below read `X`.
# NOTE: this rebinds `tfidf` and `X`, replacing the character n-gram vectorizer and
# matrix built earlier on the raw reviews.
tfidf = TfidfVectorizer(
    max_features=5000,      # cap on the vocabulary size
    stop_words='english',   # drop scikit-learn's built-in English stop-word list
    min_df=5                # integer: keep terms found in at least 5 documents
)

X = tfidf.fit_transform(df_r['clean_text'])

In [None]:
# category centroids
import numpy as np

# Mean TF-IDF vector of all reviews in each category, stored as a (1, n_terms) array.
category_centroids = {}

# Missing categories are skipped: `== NaN` matches no row, so the original would
# average an empty slice. Rows without a category simply have no centroid.
for cat in df_r['category'].dropna().unique():
    # Rows of X are positional, so select by position rather than by index label;
    # this stays correct even when df_r's index is not exactly 0..n-1.
    row_positions = np.flatnonzero((df_r['category'] == cat).to_numpy())
    category_centroids[cat] = np.asarray(X[row_positions].mean(axis=0))

In [23]:
# category consistency using TF-IDF
from sklearn.metrics.pairwise import cosine_similarity

def category_consistency_tfidf(i, category):
    """Return the cosine similarity between row ``i`` of ``X`` and its category centroid.

    ``X`` and ``category_centroids`` are module-level objects built in the
    preceding cells. A category without a centroid gives 0.0.
    """
    centroid_vec = category_centroids.get(category)
    if centroid_vec is None:
        return 0.0

    # cosine_similarity returns a 1x1 matrix for one row against one centroid.
    similarity = cosine_similarity(X[i], centroid_vec)
    return similarity[0][0]

In [24]:
# enumerate() yields positional row numbers, which are used to index the TF-IDF matrix X
# inside category_consistency_tfidf; one score is produced per row, in row order.
df_r['category_consistency_score'] = [
    category_consistency_tfidf(i, cat)
    for i, cat in enumerate(df_r['category'])
]

In [25]:
#  lemmatization

# nltk resources
# Each package is requested exactly once (the original list repeated 'punkt' and
# 'averaged_perceptron_tagger').
# NOTE: word_tokenize, pos_tag and SentimentIntensityAnalyzer are called in cells
# above this one and depend on these packages.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
):
    nltk.download(resource)

In [34]:
# Load the spaCy pipeline named "en_core_web_sm"; lemmatize_text below reads this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(review):
    """Return ``review`` with every token replaced by its spaCy lemma.

    The text is run through the module-level ``nlp`` pipeline; whitespace
    tokens are dropped and the lemmas are joined with single spaces. Missing
    or empty input gives the empty string.
    """
    if pd.isna(review) or review == "":
        return ""

    return " ".join(tok.lemma_ for tok in nlp(review) if not tok.is_space)
# Lemmatized copy of the cleaned text; input to the final TF-IDF vectorizer below.
df_r['lemmatized_text'] = df_r['clean_text'].apply(lemmatize_text)

In [None]:
# TF-IDF on lemmatized text
# NOTE: rebinds `tfidf` once more; the resulting matrix gets its own name, X_tfidf.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),       # unigrams and bigrams
    max_features=5000,     # cap on the vocabulary size
    # A float min_df is a proportion of documents: terms must occur in at least 5% of reviews.
    # NOTE(review): the clean-text vectorizer above uses the integer min_df=5 - confirm that
    # a 5% threshold (rather than 5 documents) is intended here.
    min_df=0.05,
    max_df=0.9,               # drop terms that occur in more than 90% of reviews
    stop_words='english'
)
X_tfidf = tfidf.fit_transform(df_r['lemmatized_text'])


In [29]:
from sklearn.neighbors import NearestNeighbors

# Brute-force cosine nearest-neighbour index over the lemmatized TF-IDF matrix.
nn = NearestNeighbors(
    n_neighbors=5,      # neighbours returned per query row
    metric='cosine',
    algorithm='brute'
)

nn.fit(X_tfidf)

In [30]:
# Query the fitted index with the same matrix it was fitted on.
distances, indices = nn.kneighbors(X_tfidf)

# distances are cosine distances -> convert to similarity
similarities = 1 - distances

# ignore self-similarity (index 0)
# assumes each row's own entry is returned in column 0 - TODO confirm for rows tied at distance 0
df_r['review_similarity_score'] = similarities[:, 1:].max(axis=1)


In [31]:
from scipy.sparse import hstack
# Select numeric / behavioral features including the new ones
# ('rating_polarity' is computed above but is not part of this list.)
numeric_features = df_r[
    [
        'text_length',
        'capital_ratio',
        'punctuation_ratio',
        'adjective_ratio',
        'sentiment_score',
        'rating_sentiment_mismatch',
        'raw_review_similarity',
        'category_consistency_score',
        'review_similarity_score',
        'repetition_score',        
        'is_extreme_rating'       
    ]
].values  # convert to dense numpy array

# Combine TF-IDF vectors with numeric features
# Columns are appended in the order listed above, after the TF-IDF columns.
# NOTE(review): 'text_length' is a raw character count and is not rescaled here -
# confirm that the downstream model does not need normalised inputs.
X_final = hstack([X_tfidf, numeric_features])


In [32]:
# Inspect the column names accumulated on df_r so far.
df_r.columns

In [35]:
# Columns exported to the preprocessed CSV: the original fields, the intermediate
# text columns and every engineered feature.
pre_df = df_r[
    ['category', 'rating', 'label', 'review', 'capital_ratio',
    'punctuation_ratio', 'text_length', 'repetition_score',
    'raw_review_similarity', 'expanded_text', 'clean_text',
    'adjective_ratio', 'sentiment_score', 'rating_polarity',
    'rating_sentiment_mismatch', 'is_extreme_rating',
    'category_consistency_score', 'lemmatized_text',
    'review_similarity_score']
]

# Save as CSV
# index=False keeps the row index out of the file.
pre_df.to_csv("../dataset/preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as CSV!")

In [None]:
# Target vector: the encoded label column (CG -> 0, OR -> 1) as a NumPy array,
# in the same row order as X_final.
y_final = df_r['label'].values

In [None]:
import joblib
import os

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no effect when it is already there).
os.makedirs("../joblib", exist_ok=True)

# Persist the combined feature matrix and the label vector for the modelling step.
joblib.dump(X_final, "../joblib/X_final_features.pkl")
joblib.dump(y_final, "../joblib/y_final_labels.pkl")

print("X_final and y_final saved successfully in ../joblib/")
