In [3]:
!pip -q install xgboost==2.0.3 flask==3.0.3 flask-ngrok==0.0.25 joblib==1.4.2
!pip install -U scikit-learn xgboost flask flask-ngrok joblib nltk




Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting flask
  Downloading flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting joblib
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading flask-3.1.2-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1

In [5]:
!pip install -U xgboost==2.1.0


Collecting xgboost==2.1.0
  Downloading xgboost-2.1.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 3.1.1
    Uninstalling xgboost-3.1.1:
      Successfully uninstalled xgboost-3.1.1
Successfully installed xgboost-2.1.0


In [8]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report

# XGBoost >= 2.0 settings: train on the GPU ('cuda') using the
# histogram-based tree construction algorithm ('hist').
device_type, tree_method = 'cuda', 'hist'

**Load the dataset (NLTK `movie_reviews`)**

In [9]:
import nltk
import pandas as pd

# Make sure the corpus is available locally before reading from it.
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews

# Enumerate every corpus file together with its category ('pos' | 'neg').
file_categories = [
    (fid, cat)
    for cat in movie_reviews.categories()
    for fid in movie_reviews.fileids(cat)
]

# Raw review text and its binary label: 1 for 'pos', 0 for anything else.
docs = [movie_reviews.raw(fid) for fid, _ in file_categories]
labels = [1 if cat == 'pos' else 0 for _, cat in file_categories]

df = pd.DataFrame({'review': docs, 'sentiment': labels})

# A bare `df.sample(...)` in the middle of a cell is evaluated and thrown away;
# only the last expression of a cell is rendered. Display the preview explicitly.
display(df.sample(5, random_state=42))
print(df['sentiment'].value_counts())


sentiment
0    1000
1    1000
Name: count, dtype: int64


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


**Train/Test Split**

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# class proportions of the full frame in both splits; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    stratify=df['sentiment'],
    random_state=42,
    test_size=0.2,
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 1600
Test size: 400


**Using XGBoost (XGBClassifier)**

In [14]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report

# XGBoost >= 2.0 settings: GPU device and histogram-based tree construction.
device_type = 'cuda'
tree_method = 'hist'

# Two-step pipeline: raw review text -> TF-IDF features -> XGBoost binary classifier.
pipe = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            lowercase=True,
            strip_accents='unicode',   # text cleaning
            stop_words='english',
            ngram_range=(1, 2),        # unigrams and bigrams
            max_features=30000,
        ),
    ),
    (
        'clf',
        XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            tree_method=tree_method,
            device=device_type,
            random_state=42,
        ),
    ),
])


**Applying Grid Search (defined parameters)**

In [15]:
# Search space for the pipeline. Keys follow scikit-learn's `<step>__<param>`
# convention, so `tfidf__*` tunes the vectorizer and `clf__*` the classifier.
# 2 * 3 * 3 * 2 * 2 * 2 * 1 = 144 candidate combinations.
param_grid = dict(
    tfidf__max_features=[15000, 30000],
    clf__n_estimators=[200, 400, 600],
    clf__max_depth=[4, 6, 8],
    clf__learning_rate=[0.05, 0.1],
    clf__subsample=[0.8, 1.0],
    clf__colsample_bytree=[0.8, 1.0],
    clf__scale_pos_weight=[1],   # adjust later if class imbalance occurs
)


In [None]:
# Exhaustive search over `param_grid`, scored by F1 with 3-fold cross-validation.
# The classifier in `pipe` trains on a single GPU (device='cuda'). Launching one
# grid-search worker per CPU core (`n_jobs=-1`) makes those workers compete for
# the same device, so candidates are evaluated one at a time and XGBoost does
# the parallel work itself.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    n_jobs=1,       # one fit at a time; avoids several workers sharing one GPU
    cv=3,           # 3-fold cross-validation
    verbose=1
)

grid.fit(X_train, y_train)

print("✅ Best F1 (CV):", grid.best_score_)
print("✅ Best Params:", grid.best_params_)


Fitting 3 folds for each of 144 candidates, totalling 432 fits


**Model Evaluation (Test Set)**

In [None]:
# Score the best pipeline found by the grid search on the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("✅ Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("✅ Test F1:", f1_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred, digits=3),
)


In [12]:
# Keep a handle on the refit best pipeline; it is reused when saving the model.
best_model = grid.best_estimator_

# A fitted GridSearchCV forwards `predict` to its best estimator.
y_pred = grid.predict(X_test)

# Headline metrics, then the per-class precision / recall / F1 breakdown.
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Test F1:", f1_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred, digits=3))


Test Accuracy: 0.53
Test F1: 0.15315315315315314
              precision    recall  f1-score   support

           0      0.516     0.975     0.675       200
           1      0.773     0.085     0.153       200

    accuracy                          0.530       400
   macro avg      0.644     0.530     0.414       400
weighted avg      0.644     0.530     0.414       400



Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




***Save for Flask web app (model saving)***

In [13]:
import os

import joblib

# Name the output location once so the created directory, the dump target and
# the log message can never drift apart.
ARTIFACT_DIR = "artifacts"
MODEL_PATH = f"{ARTIFACT_DIR}/sentiment_xgb_pipeline.joblib"

os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Persist the whole fitted pipeline (TF-IDF step + classifier) as one object.
# NOTE(review): the classifier was configured with device='cuda'; confirm how it
# behaves when this file is loaded on a host without a GPU.
joblib.dump(best_model, MODEL_PATH)
print(f"Model saved at {MODEL_PATH}")


Model saved at artifacts/sentiment_xgb_pipeline.joblib
