# **Imports**

In [None]:
! pip install --upgrade scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBRFClassifier, XGBClassifier
from sklearn.metrics import classification_report

In [None]:
from pprint import pprint

# **Load Data**

In [None]:
dataset = np.load('/content/drive/MyDrive/Colab Notebooks/amazon/dataset.npz')

In [None]:
dataset.files

['train_embeddings', 'train_sentiment', 'test_embeddings', 'test_sentiment']

In [None]:
xtrain, xtest = dataset['train_embeddings'], dataset['test_embeddings']
ytrain, ytest = dataset['train_sentiment'], dataset['test_sentiment']

In [None]:
xtrain.shape

(18000, 40)

# **Dimentionality Reduction**

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(
    n_components=10,whiten=True, 
    svd_solver='auto', tol=0.0, 
    iterated_power='auto', n_oversamples=10, 
    power_iteration_normalizer='auto', 
    random_state=33
)

In [None]:
pca.fit(xtrain, ytrain)

In [None]:
pca.explained_variance_ratio_

array([0.07707273, 0.05935486, 0.05238611, 0.04740816, 0.0362368 ,
       0.03278863, 0.03086394, 0.02876552, 0.02774145, 0.02707267])

In [None]:
pca.transform(xtrain)

array([[ 1.70962668,  0.71973385, -1.5831303 , ..., -1.07751818,
        -0.03520121, -0.44035968],
       [-0.94327462, -1.2895529 ,  0.2986738 , ..., -0.31328363,
        -0.27325362, -0.27367544],
       [ 0.08840278, -0.29230319, -0.17339626, ..., -0.28411999,
        -0.16446469,  0.0535404 ],
       ...,
       [-0.03699793, -0.32522795, -0.23225836, ..., -0.26056812,
         0.17237214,  0.02192863],
       [ 1.67339662,  0.47135099, -1.93060999, ..., -1.96724546,
        -0.3926709 ,  0.22233185],
       [ 0.13611274, -0.47831107, -0.434717  , ..., -1.04006355,
         0.70917487,  0.17752259]])

In [None]:
xtrain_rd, xtest_rd = pca.fit_transform(xtrain), pca.fit_transform(xtest)

In [None]:
xtrain.shape

(18000, 40)

# **Support Vector Machine**

In [None]:
def train_evaluate(model, x, y) -> str:
    model.fit(x, y)
    ypred = model.predict(x)
    report = classification_report(
        ypred, y, output_dict=True
    )
    return model, report

In [None]:
def test_evaluate(model, x, y) -> str:
    ypred = model.predict(x)
    report = classification_report(
        ypred, y,  output_dict=True
    )
    return report

In [None]:
svc = SVC(
    C=2.5, kernel='rbf', degree=3, gamma='scale', 
    coef0=0.0, shrinking=True, probability=True, 
    tol=0.001, cache_size=200, class_weight=None, 
    verbose=False, max_iter=-1, decision_function_shape='ovo', 
    break_ties=False, random_state=33
)
svc

In [None]:
model_svc, report_svc = train_evaluate(
    svc,
    xtrain_rd, ytrain
)

In [None]:
pprint(report_svc)

{'0': {'f1-score': 0.9110895240042947,
       'precision': 0.9917631344612645,
       'recall': 0.8425531914893617,
       'support': 10575},
 '1': {'f1-score': 0.8942278450215925,
       'precision': 0.8153283052351376,
       'recall': 0.99003367003367,
       'support': 7425},
 'accuracy': 0.9033888888888889,
 'macro avg': {'f1-score': 0.9026586845129436,
               'precision': 0.9035457198482011,
               'recall': 0.9162934307615158,
               'support': 18000},
 'weighted avg': {'f1-score': 0.90413408142393,
                  'precision': 0.9189837674054873,
                  'recall': 0.9033888888888889,
                  'support': 18000}}


In [None]:
test_report = test_evaluate(model_svc, xtest_rd, ytest)

In [None]:
pprint(test_report)

{'0': {'f1-score': 0.8159600997506234,
       'precision': 0.8051181102362205,
       'recall': 0.8270980788675429,
       'support': 989},
 '1': {'f1-score': 0.8150375939849623,
       'precision': 0.8262195121951219,
       'recall': 0.8041543026706232,
       'support': 1011},
 'accuracy': 0.8155,
 'macro avg': {'f1-score': 0.8154988468677928,
               'precision': 0.8156688112156711,
               'recall': 0.815626190769083,
               'support': 2000},
 'weighted avg': {'f1-score': 0.8154937730860817,
                  'precision': 0.8157848689264452,
                  'recall': 0.8155,
                  'support': 2000}}


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lgrc = LogisticRegression(
    penalty='l2', dual=False, tol=0.0001, 
    C=2.0, fit_intercept=True, intercept_scaling=1, class_weight=None, 
    random_state=33, solver='lbfgs', max_iter=100, multi_class='auto', 
    verbose=0, warm_start=True, n_jobs=None, l1_ratio=None
)

In [None]:
model_lgrc, report_svc = train_evaluate(
    lgrc,
    xtrain_rd, ytrain
)
pprint(report_svc)

{'0': {'f1-score': 0.9056215604587768,
       'precision': 0.9799643811219947,
       'recall': 0.8417630748637537,
       'support': 10459},
 '1': {'f1-score': 0.8891707434921784,
       'precision': 0.8164374445430346,
       'recall': 0.9761304866728551,
       'support': 7541},
 'accuracy': 0.8980555555555556,
 'macro avg': {'f1-score': 0.8973961519754776,
               'precision': 0.8982009128325146,
               'recall': 0.9089467807683044,
               'support': 18000},
 'weighted avg': {'f1-score': 0.898729582084048,
                  'precision': 0.9114556795252202,
                  'recall': 0.8980555555555556,
                  'support': 18000}}


In [None]:
report_lgrc_test = test_evaluate(
    lgrc,
    xtest_rd, ytest
)
pprint(report_lgrc_test)

{'0': {'f1-score': 0.7594501718213058,
       'precision': 0.6525590551181102,
       'recall': 0.9082191780821918,
       'support': 730},
 '1': {'f1-score': 0.8136645962732918,
       'precision': 0.931910569105691,
       'recall': 0.7220472440944882,
       'support': 1270},
 'accuracy': 0.79,
 'macro avg': {'f1-score': 0.7865573840472988,
               'precision': 0.7922348121119006,
               'recall': 0.81513321108834,
               'support': 2000},
 'weighted avg': {'f1-score': 0.7938763313483169,
                  'precision': 0.829947266500224,
                  'recall': 0.79,
                  'support': 2000}}


# Naive Bayes

In [None]:
gnb = GaussianNB()
gnb

In [None]:
fitted_model, naive_report = train_evaluate(
    gnb, xtrain_rd, ytrain
)

In [None]:
pprint(naive_report)

{'0': {'f1-score': 0.8406452882156208,
       'precision': 0.951246660730187,
       'recall': 0.7530842439196335,
       'support': 11348},
 '1': {'f1-score': 0.7932090885882053,
       'precision': 0.6892191659272404,
       'recall': 0.934155141310884,
       'support': 6652},
 'accuracy': 0.82,
 'macro avg': {'f1-score': 0.816927188401913,
               'precision': 0.8202329133287137,
               'recall': 0.8436196926152587,
               'support': 18000},
 'weighted avg': {'f1-score': 0.823114977108867,
                  'precision': 0.8544129443174535,
                  'recall': 0.82,
                  'support': 18000}}


In [None]:
ypred_nb = fitted_model.predict(xtest_rd)
test_report_nb = classification_report(
    ypred_nb, ytest
)
print(test_report_nb)

              precision    recall  f1-score   support

           0       0.85      0.76      0.80      1144
           1       0.72      0.82      0.77       856

    accuracy                           0.78      2000
   macro avg       0.78      0.79      0.78      2000
weighted avg       0.79      0.78      0.79      2000



# Multi-Layer Perceptron

In [None]:
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 64, 32, 20, 10), activation='relu', solver='adam', alpha=0.0001,
    batch_size='auto', learning_rate='constant', learning_rate_init=0.001, 
    power_t=0.5, max_iter=1000, shuffle=True, random_state=33, tol=0.0001, 
    verbose=True, warm_start=True, momentum=0.9, nesterovs_momentum=True, 
    early_stopping=False, validation_fraction=0.2, beta_1=0.9, beta_2=0.999,
    epsilon=1e-08, n_iter_no_change=20, max_fun=15000
)

In [None]:
mlp_fitted_model, mlp_report = train_evaluate(
    mlp, xtrain_rd, ytrain
)

Iteration 1, loss = 0.44193625
Iteration 2, loss = 0.24410332
Iteration 3, loss = 0.23675202
Iteration 4, loss = 0.22948226
Iteration 5, loss = 0.22594188
Iteration 6, loss = 0.22630213
Iteration 7, loss = 0.22386193
Iteration 8, loss = 0.22239158
Iteration 9, loss = 0.22249199
Iteration 10, loss = 0.22089512
Iteration 11, loss = 0.22205935
Iteration 12, loss = 0.22074852
Iteration 13, loss = 0.22065486
Iteration 14, loss = 0.21890567
Iteration 15, loss = 0.21819066
Iteration 16, loss = 0.21937206
Iteration 17, loss = 0.21888885
Iteration 18, loss = 0.21729610
Iteration 19, loss = 0.21743893
Iteration 20, loss = 0.21673289
Iteration 21, loss = 0.21585416
Iteration 22, loss = 0.21587889
Iteration 23, loss = 0.21557092
Iteration 24, loss = 0.21591150
Iteration 25, loss = 0.21535462
Iteration 26, loss = 0.21579181
Iteration 27, loss = 0.21539807
Iteration 28, loss = 0.21540911
Iteration 29, loss = 0.21417454
Iteration 30, loss = 0.21499189
Iteration 31, loss = 0.21415246
Iteration 32, los

In [None]:
pprint(mlp_report)

{'0': {'f1-score': 0.9210160234942553,
       'precision': 0.994879786286732,
       'recall': 0.8573621103117506,
       'support': 10425},
 '1': {'f1-score': 0.9076005062985957,
       'precision': 0.8350709849157054,
       'recall': 0.9939273927392739,
       'support': 7575},
 'accuracy': 0.9148333333333334,
 'macro avg': {'f1-score': 0.9143082648964255,
               'precision': 0.9149753856012187,
               'recall': 0.9256447515255122,
               'support': 18000},
 'weighted avg': {'f1-score': 0.9153703266744152,
                  'precision': 0.9276269157097583,
                  'recall': 0.9148333333333334,
                  'support': 18000}}


In [None]:
mlp_test_report = test_evaluate(
    mlp_fitted_model, xtest_rd, ytest
)

In [None]:
pprint(mlp_test_report)

{'0': {'f1-score': 0.610146862483311,
       'precision': 0.4498031496062992,
       'recall': 0.9481327800829875,
       'support': 482},
 '1': {'f1-score': 0.7665867306155076,
       'precision': 0.9745934959349594,
       'recall': 0.6317523056653491,
       'support': 1518},
 'accuracy': 0.708,
 'macro avg': {'f1-score': 0.6883667965494094,
               'precision': 0.7121983227706292,
               'recall': 0.7899425428741683,
               'support': 2000},
 'weighted avg': {'f1-score': 0.7288847223956483,
                  'precision': 0.8481190224697522,
                  'recall': 0.708,
                  'support': 2000}}


# XG BOOSt

In [None]:
! pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from xgboost import XGBClassifier

In [None]:
bst = XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, monotone_constraints=None,
              n_estimators=50, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=33)
bst

In [None]:
bst_fitted_model, bst_report = train_evaluate(
    bst, xtrain_rd, ytrain
)

In [None]:
pprint(bst_report)

{'0': {'f1-score': 0.922089351348553,
       'precision': 0.9913178984861977,
       'recall': 0.8618987709280944,
       'support': 10333},
 '1': {'f1-score': 0.9097884073607864,
       'precision': 0.8417258207630879,
       'recall': 0.9898265292813356,
       'support': 7667},
 'accuracy': 0.9163888888888889,
 'macro avg': {'f1-score': 0.9159388793546697,
               'precision': 0.9165218596246427,
               'recall': 0.925862650104715,
               'support': 18000},
 'weighted avg': {'f1-score': 0.9168498325955414,
                  'precision': 0.9275999840471375,
                  'recall': 0.9163888888888889,
                  'support': 18000}}


In [None]:
bst_test_report = test_evaluate(
    bst_fitted_model, xtest_rd, ytest
)

In [None]:
pprint(bst_test_report)

{'0': {'f1-score': 0.6309601567602874,
       'precision': 0.47539370078740156,
       'recall': 0.9378640776699029,
       'support': 515},
 '1': {'f1-score': 0.7711624139327663,
       'precision': 0.967479674796748,
       'recall': 0.641077441077441,
       'support': 1485},
 'accuracy': 0.7175,
 'macro avg': {'f1-score': 0.7010612853465268,
               'precision': 0.7214366877920748,
               'recall': 0.789470759373672,
               'support': 2000},
 'weighted avg': {'f1-score': 0.735060332710853,
                  'precision': 0.8407675364893413,
                  'recall': 0.7175,
                  'support': 2000}}


In [None]:
grid = RandomizedSearchCV(
    bst,
    param_distributions = {
        'n_estimators':[50, 75, 100, 150, 200],
        'max_depth':[3, 4, 5], 
        'learning_rate':[1, 2,3], 
    }, 
    cv=20, 
    scoring=['accuracy', 'f1', 'precision', 'recall'], 
    random_state=33, 
    n_iter=2, 
    refit='accuracy', 
    verbose=1
)
grid.fit(xtrain_rd, ytrain)

Fitting 20 folds for each of 2 candidates, totalling 40 fits


In [None]:
grid.best_estimator_

In [None]:
grid.best_score_

0.9018888888888886

# Store Metrics

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# metrics = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amazon/Metrics.csv')
# metrics

In [None]:
def get_metrics(model, x, y, model_name, type_) :
  global metrics
  ypred = model.predict(x)
  scores = pd.Series(
      [
          model_name, 
          accuracy_score(y_true=y, y_pred=ypred),
          precision_score(y_true=y, y_pred=ypred),
          recall_score(y_true=y, y_pred=ypred),
          f1_score(y_true=y, y_pred=ypred),
          roc_auc_score(y_true=y, y_score=ypred),
          type_
      ], index=metrics.columns
  )
  metrics = metrics.append(
      scores, ignore_index=True
  )

In [None]:
get_metrics(model_svc, xtrain_rd, ytrain, model_name='SVC', type_='train')
get_metrics(model_svc, xtest_rd, ytest, model_name='SVC', type_='test')

In [None]:
get_metrics(bst, xtrain_rd, ytrain, model_name='XGBClassifier', type_='train')
get_metrics(bst, xtest_rd, ytest, model_name='XGBClassifier', type_='test')

In [None]:
get_metrics(fitted_model, xtrain_rd, ytrain, model_name='NaiveBayes', type_='train')
get_metrics(fitted_model, xtest_rd, ytest, model_name='NaiveBayes', type_='test')

In [None]:
get_metrics(mlp_fitted_model, xtrain_rd, ytrain, model_name='MLP', type_='train')
get_metrics(mlp_fitted_model, xtest_rd, ytest, model_name='MLP', type_='test')

In [None]:
get_metrics(grid.best_estimator_, xtrain_rd, ytrain, model_name='XGBClassifier', type_='train')
get_metrics(grid.best_estimator_, xtest_rd, ytest, model_name='XGBClassifier', type_='test')

In [None]:
get_metrics(lgrc, xtrain_rd, ytrain, model_name='LogisticRegression', type_='train')
get_metrics(lgrc, xtest_rd, ytest, model_name='LogisticRegression', type_='test')

In [None]:
metrics

Unnamed: 0,model,accuracy,precision,recall,f1_score,roc_auc_score,type
0,SVC,0.903389,0.990034,0.815328,0.894228,0.903546,train
1,SVC,0.815500,0.804154,0.826220,0.815038,0.815669,test
2,XGBClassifier,0.916389,0.989827,0.841726,0.909788,0.916522,train
3,XGBClassifier,0.717500,0.641077,0.967480,0.771162,0.721437,test
4,NaiveBayes,0.820000,0.934155,0.689219,0.793209,0.820233,train
...,...,...,...,...,...,...,...
67,MLP,0.708000,0.631752,0.974593,0.766587,0.712198,test
68,XGBClassifier,0.916389,0.989827,0.841726,0.909788,0.916522,train
69,XGBClassifier,0.717500,0.641077,0.967480,0.771162,0.721437,test
70,LogisticRegression,0.898056,0.976130,0.816437,0.889171,0.898201,train


In [None]:
metrics.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/amazon/Metrics.csv', index=False
)