In [13]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Raw-review pipeline: character n-gram TF-IDF features plus a cosine
# nearest-neighbour index, both persisted with joblib.

# Read the raw reviews, keep one row per distinct review text, and rebuild the
# index so row labels run 0..n-1 after de-duplication.
df_r = (
    pd.read_csv("../dataset/raw_dataset.csv")
    .drop_duplicates(subset="review")
    .reset_index(drop=True)
)
print("Raw dataset loaded and duplicates removed.")

# Missing reviews become empty strings so the vectorizer only receives str values.
raw_reviews = df_r["review"].fillna("").astype(str).tolist()

# Character n-grams of length 3 to 5 built inside word boundaries ('char_wb'),
# limited to 5000 features.
raw_tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), max_features=5000)
X_raw = raw_tfidf.fit_transform(raw_reviews)

# Brute-force cosine search with 5 neighbours as the default query size.
# fit() returns the estimator itself, so raw_nn is the fitted model.
raw_nn = NearestNeighbors(n_neighbors=5, metric="cosine", algorithm="brute").fit(X_raw)

# Persist the fitted vectorizer and the neighbour index.
joblib.dump(raw_tfidf, "../joblib/raw_tfidf_vectorizer.pkl")
joblib.dump(raw_nn, "../joblib/raw_nn_model.pkl")

print("RAW TF-IDF and NN model saved successfully")

Raw dataset loaded and duplicates removed.
RAW TF-IDF and NN model saved successfully


In [None]:
# Lemmatized-review pipeline: word-level TF-IDF features plus a cosine
# nearest-neighbour index, both persisted with joblib.

# The preprocessed CSV is read as-is; this cell uses its 'lemmatized_text' column.
df_p = pd.read_csv("../dataset/preprocessed_dataset.csv")
print("Preprocessed dataset loaded successfully.")

# Missing texts become empty strings so the vectorizer only receives str values.
lemm_reviews = df_p["lemmatized_text"].fillna("").astype(str)

# Word unigrams and bigrams with English stop words removed. Terms must appear
# in at least 5% and at most 90% of the documents; at most 5000 features are kept.
lemm_tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.9,
    max_features=5000,
)

# NOTE: despite its name, X_clean holds the features of the *lemmatized* text here.
X_clean = lemm_tfidf.fit_transform(lemm_reviews)

# Brute-force cosine search with 5 neighbours as the default query size.
# fit() returns the estimator itself, so lemm_nn is the fitted model.
lemm_nn = NearestNeighbors(algorithm="brute", metric="cosine", n_neighbors=5).fit(X_clean)

# Persist the fitted vectorizer and the neighbour index.
joblib.dump(lemm_tfidf, "../joblib/lemm_tfidf_vectorizer.pkl")
joblib.dump(lemm_nn, "../joblib/lemm_nn_model.pkl")
print("Lemmatized TF-IDF and NN model saved")

Preprocessed dataset loaded successfully.
Lemmatized TF-IDF and NN model saved


In [16]:
# Category centroids: for every category in df_p, the mean TF-IDF vector of the
# 'clean_text' reviews belonging to that category, saved as a dict with joblib.

# Missing texts become empty strings so the vectorizer only receives str values.
clean_reviews = df_p['clean_text'].fillna("").astype(str)

# Word unigrams and bigrams with English stop words removed. Terms must appear
# in at least 5% and at most 90% of the documents; at most 5000 features are kept.
clean_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=0.05,
    max_df=0.9,
    stop_words='english'
)
# NOTE(review): clean_tfidf is not persisted in this cell; the centroids are only
# comparable to vectors produced by this fitted vectorizer - confirm it is saved
# elsewhere.

X_clean = clean_tfidf.fit_transform(clean_reviews)

# Build centroids: {category value -> ndarray of shape (1, n_features)}
category_centroids = {}

# dropna(): unique() would also return NaN for rows without a category, but
# `== NaN` matches no rows and the mean of an empty selection is undefined.
for cat in df_p['category'].dropna().unique():
    # Row i of X_clean corresponds to the i-th row of df_p *by position*, so
    # select rows by position rather than by df_p's index labels (the two only
    # coincide while df_p has a default 0..n-1 index).
    idx = np.flatnonzero((df_p['category'] == cat).to_numpy())
    # The sparse mean is a 1 x n_features matrix; store it as a plain ndarray.
    category_centroids[cat] = np.asarray(X_clean[idx].mean(axis=0))

# Save centroids
joblib.dump(category_centroids, "../joblib/category_centroids.pkl")

print("Category centroids saved")

Category centroids saved
