In [25]:
!pip install xgboost
!pip install catboost

Collecting xgboost
  Downloading xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
    --------------------------------------- 1.3/72.0 MB 5.6 MB/s eta 0:00:13
   - -------------------------------------- 2.6/72.0 MB 6.0 MB/s eta 0:00:12
   -- ------------------------------------- 3.7/72.0 MB 6.1 MB/s eta 0:00:12
   -- ------------------------------------- 4.7/72.0 MB 5.8 MB/s eta 0:00:12
   --- ------------------------------------ 6.0/72.0 MB 5.8 MB/s eta 0:00:12
   ---- ----------------------------------- 7.3/72.0 MB 5.8 MB/s eta 0:00:12
   ---- ----------------------------------- 8.9/72.0 MB 6.0 MB/s eta 0:00:11
   ----- ---------------------------------- 10.2/72.0 MB 6.0 MB/s eta 0:00:11
   ------ --------------------------------- 11.5/72.0 MB 6.0 MB/s eta 0:00:11
   ------- -------


[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
   ---------------------------------------- 0.5/102.4 MB 3.4 MB/s eta 0:00:31
    --------------------------------------- 1.6/102.4 MB 4.9 MB/s eta 0:00:21
   - -------------------------------------- 2.6/102.4 MB 5.0 MB/s eta 0:00:20
   - -------------------------------------- 3.7/102.4 MB 4.8 MB/s eta 0:00:21
   - -------------------------------------- 4.7/102.4 MB 4.8 MB/s eta 0:00:21
   -- ------------------------------------- 5.8/102.4 MB 4.8 MB/s eta 0:00:21
   -- ------------------------------------- 6.6/102.4 MB 4.8 MB/s eta 0:00:20
   -- ------------------------------------- 7.6/102.4 MB 4.7 MB/s eta 0:00:20
   --- ------------------------------------ 8.7/10


[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, f1_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier



In [19]:
df = pd.read_csv("../data/cleaned_reviews.csv")
df.head()



Unnamed: 0,Review text,Ratings,sentiment,clean_review
0,"Nice product, good quality, but price is now r...",4,1,nice product good quality price rising bad sig...
1,They didn't supplied Yonex Mavis 350. Outside ...,1,0,didnt supplied yonex mavis outside cover yonex...
2,Worst product. Damaged shuttlecocks packed in ...,1,0,worst product damaged shuttlecock packed new b...
3,"Quite O. K. , but nowadays the quality of the...",3,0,quite k nowadays quality cork like year back u...
4,Over pricedJust â?¹620 ..from retailer.I didn'...,1,0,pricedjust retaileri didnt understand wat adva...


In [20]:
X_text = df["clean_review"]
y = df["sentiment"]

In [21]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(X_text)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [27]:
models = {
    "Logistic Regression": LogisticRegression(
        class_weight="balanced",
        max_iter=1000
    ),

    "Naive Bayes": MultinomialNB(),

    "Linear SVM": LinearSVC(
        class_weight="balanced"
    ),

    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ),

    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1
    ),

    "CatBoost": CatBoostClassifier(
        iterations=300,
        depth=6,
        learning_rate=0.1,
        loss_function="Logloss",
        class_weights=[
            y_train.value_counts()[1] / len(y_train),
            y_train.value_counts()[0] / len(y_train)
        ],
        verbose=0
    )
}


In [28]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)

    results.append({
        "Model": name,
        "F1_Negative (0)": report["0"]["f1-score"],
        "F1_Positive (1)": report["1"]["f1-score"],
        "Macro_F1": report["macro avg"]["f1-score"]
    })


In [29]:
results_df = pd.DataFrame(results)
results_df.sort_values(by="Macro_F1", ascending=False)


Unnamed: 0,Model,F1_Negative (0),F1_Positive (1),Macro_F1
2,Linear SVM,0.646526,0.914661,0.780593
0,Logistic Regression,0.643243,0.900901,0.772072
3,Random Forest,0.603448,0.918555,0.761002
5,CatBoost,0.614362,0.890649,0.752505
4,XGBoost,0.60184,0.885358,0.743599
1,Naive Bayes,0.521385,0.919327,0.720356


In [32]:
best_model_name = results_df.iloc[0]["Model"]
best_model = models[best_model_name]


In [33]:

pickle.dump(best_model, open("../model/sentiment_model.pkl", "wb"))
pickle.dump(tfidf, open("../model/tfidf_vectorizer.pkl", "wb"))