# Importing libraries

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import make_column_transformer

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score, roc_curve

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, RandomizedSearchCV

from sklearn.svm import SVR, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier
from sklearn import tree

import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Filter out all warnings
warnings.filterwarnings("ignore")

In [4]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
     ---------------------------------------- 0.0/250.0 kB ? eta -:--:--
     - -------------------------------------- 10.2/250.0 kB ? eta -:--:--
     ---- -------------------------------- 30.7/250.0 kB 330.3 kB/s eta 0:00:01
     ------------- ----------------------- 92.2/250.0 kB 655.4 kB/s eta 0:00:01
     ---------------------------------- --- 225.3/250.0 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 250.0/250.0 kB 1.1 MB/s eta 0:00:00
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


# Reading the corpus

*   Reading training data from [corpus.xlsx](https://docs.google.com/spreadsheets/d/1fYERAxi5YWi0eP7M5txpOr0sfNMuNuKm/edit?usp=drive_web&ouid=105822911867262178394&rtpof=true)
*   Reading small training data from [corpus-small.xlsx](https://docs.google.com/spreadsheets/d/1pcb1Zop1TEp1p9Ato3EyRgt42JgpfiKV/edit?usp=drive_web&ouid=105822911867262178394&rtpof=true)
*   Reading test data from [corpus-test.xlsx](https://docs.google.com/spreadsheets/d/1HHQn69_gzqE7ibNMgqQwseY--7SiatwG/edit?usp=drive_web&ouid=105822911867262178394&rtpof=true)

## Training data

In [5]:
# Load the full training corpus from the local Excel workbook,
# then show its dimensions and the first five rows.
df = pd.read_excel(io='./data/corpus.xlsx')
print(df.shape)
df.head(n=5)

(493829, 7)


Unnamed: 0,reviewText,summary,overall,numberofWords,numberofSentences,reviewLength,summaryLength
0,nearly large enough front rear 2005 dodge ram ...,nearly large enough front rear,1,26,1,150,55
1,2 3 broke first day wearing glad kept spare ba...,glad kept spare back set goggles,1,30,1,158,46
2,bearing groaned whined box ended sourcing delc...,one star,1,10,1,70,8
3,think product good claimed bottom line efficie...,would buy,1,46,1,304,24
4,poor reception,one star,1,2,1,14,8


## Verifying that the training set has no missing values

In [6]:
# Count missing entries per column of the training frame.
df.isna().sum()

reviewText           0
summary              0
overall              0
numberofWords        0
numberofSentences    0
reviewLength         0
summaryLength        0
dtype: int64

In [7]:
# Keep only the rows where the two text columns and the label are all present.
df = df.loc[df[['reviewText', 'summary', 'overall']].notna().all(axis=1)]
print(df.shape)

(493829, 7)


## Reading small training data

In [8]:
# Load the reduced training corpus, then show its dimensions and first five rows.
df_small = pd.read_excel(io='./data/corpus-small.xlsx')
print(df_small.shape)
df_small.head(n=5)

(89234, 7)


Unnamed: 0,reviewText,summary,overall,numberofWords,numberofSentences,reviewLength,summaryLength
0,didnt get use broken received,one star,1,5,1,29,8
1,brand acrylic craft paint worst coverage paint...,brand acrylic craft paint worst coverage paint...,1,20,1,120,81
2,bought threading small seed bead felting much ...,good beading felted project,1,17,1,111,36
3,wanted decorate mug mad hatter tea wish hadnt ...,total dud,1,29,1,181,9
4,caused air bubble painttried different paint a...,bubble,1,13,1,92,7


In [None]:
# Frequency of each value of the `overall` label in the small training set.
df_small.loc[:, 'overall'].value_counts()

## Checking for missing values in the small training set

In [9]:
# Count missing entries per column of the small training frame.
df_small.isna().sum()

reviewText            35
summary              378
overall                0
numberofWords          0
numberofSentences      0
reviewLength           0
summaryLength          0
dtype: int64

In [10]:
# Discard rows of the small set that lack review text, summary, or label.
df_small = df_small.loc[df_small[['reviewText', 'summary', 'overall']].notna().all(axis=1)]
print(df_small.shape)

(88821, 7)


## Reading test data

In [11]:
# Load the test corpus, reading only the two text columns and the label.
df_test = pd.read_excel(
    io='./data/corpus-test.xlsx',
    usecols=['reviewText', 'summary', 'overall'],
)
print(df_test.shape)
df_test.head(n=5)

(23720, 3)


Unnamed: 0,reviewText,summary,overall
0,received didnt check saw need ran went use spr...,bad wont spray,1
1,didnt work didnt get warm buy save money buy n...,save money,1
2,must obeyed made clear like product period lis...,chemical mostlyvery little melon,1
3,name shade clearly pink left pink absolutely c...,color expected,1
4,came inside plastic bag half bottle spilled ev...,totally wasted half product,1


## Verifying that the test set has no missing values

In [12]:
# Count missing entries per column of the test frame.
df_test.isna().sum()

reviewText    0
summary       0
overall       0
dtype: int64

In [13]:
# Discard test rows that lack review text, summary, or label.
df_test = df_test.loc[df_test[['reviewText', 'summary', 'overall']].notna().all(axis=1)]
print(df_test.shape)

(23720, 3)


In [None]:
# print(vectorizer.vocabulary_)

# Preparing Training and Test set

## Main set

In [31]:
# Features are the two text columns; the target is the `overall` column.
X = df.loc[:, ['reviewText', 'summary']]
y = df.loc[:, 'overall']

# The held-out test data is built the same way from the test frame.
X_test = df_test.loc[:, ['reviewText', 'summary']]
y_test = df_test.loc[:, 'overall']

# Report the dimensions of every set, one per line.
print(
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X shape: (493829, 2)
y shape: (493829,)
X_test shape: (23720, 2)
y_test shape: (23720,)


In [None]:
# Hold out 20% of the main set for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Small set

In [11]:
# Features (two text columns) and target (`overall`) for the small training set.
X_train_small = df_small.loc[:, ['reviewText', 'summary']]
y_train_small = df_small.loc[:, 'overall']

# Test features and target, taken from the test frame.
X_test = df_test.loc[:, ['reviewText', 'summary']]
y_test = df_test.loc[:, 'overall']

# No validation split is made here: the whole small set is used for training.

# Report the dimensions of every set, one per line.
print(
    f"X_train shape: {X_train_small.shape}",
    f"y_train shape: {y_train_small.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (88821, 2)
y_train shape: (88821,)
X_test shape: (23720, 2)
y_test shape: (23720,)


In [15]:
# Vectorization for main set:
# each text column gets its own token-count encoding, and the vocabulary is
# learned from the training split only.
vectorizer = CountVectorizer()
transformer = make_column_transformer(
    (vectorizer, 'reviewText'),
    (vectorizer, 'summary'),
)

# Fit on the training split, then encode the test set with the fitted vocabulary.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [12]:
# Vectorization for small set
# The raw text columns are read straight from the cleaned frames instead of
# re-using X_train_small / X_test: another vectorization cell may already have
# replaced X_test with an encoded matrix, which cannot be transformed a second
# time. Reading from the frames also makes this cell safe to re-run.
vectorizer = CountVectorizer()
transformer = make_column_transformer((vectorizer, 'reviewText'), (vectorizer, 'summary'))

X_train_small = transformer.fit_transform(df_small[['reviewText', 'summary']])
X_test = transformer.transform(df_test[['reviewText', 'summary']])

# Exploring different models

In [16]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Every estimator is seeded (same seed as the train/validation split) so that
# repeated runs of the notebook produce the same fitted models and metrics.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

In [17]:
def _weighted_scores(y_true, y_pred):
  """Return (accuracy, f1, precision, recall); the last three are weighted averages."""
  return (
    accuracy_score(y_true = y_true, y_pred = y_pred),
    f1_score(y_true = y_true, y_pred = y_pred, average='weighted'),
    precision_score(y_true = y_true, y_pred = y_pred, average='weighted'),
    recall_score(y_true = y_true, y_pred = y_pred, average='weighted'),
  )


def _print_scores(title, scores):
  """Print a title line followed by the four metrics, formatted to 4 decimals."""
  accuracy, f1, precision, recall = scores
  print(title)
  print(f"- Accuracy: {accuracy:.4f}")
  print(f"- F1 score: {f1:.4f}")
  print(f"- Precision: {precision:.4f}")
  print(f"- Recall: {recall:.4f}")


def explore_models(X_train, y_train, X_test, y_test, model_dict=None):
  """Fit every candidate model and print its training and test metrics.

  Args:
    X_train: Training features, passed to each model's ``fit`` and ``predict``.
    y_train: Training labels.
    X_test: Test features, passed to each fitted model's ``predict``.
    y_test: Test labels.
    model_dict: Optional mapping of display name -> estimator. When omitted,
      the notebook-level ``models`` dictionary is used.

  Returns:
    None. Metrics are printed, and the estimators in the mapping are fitted
    in place.
  """
  if model_dict is None:
    model_dict = models

  # Iterate over (name, estimator) pairs directly instead of indexing into
  # freshly built key/value lists on every pass.
  for name, model in tqdm.tqdm(model_dict.items()):
    model.fit(X_train, y_train)

    # Score the fitted model on the data it was trained on and on the test data.
    train_scores = _weighted_scores(y_train, model.predict(X_train))
    test_scores = _weighted_scores(y_test, model.predict(X_test))

    print('\n')
    print(name)

    _print_scores('Model performence for Training set', train_scores)
    _print_scores('Model performence for Test set', test_scores)

    print('=' * 35)
    print('\n')

Fit the provided models with their default settings on the small training set and evaluate them on the test set.

In [18]:
# Fit on the vectorized small training set and evaluate on the vectorized test set.
explore_models(X_train_small, y_train_small, X_test, y_test)

 33%|███▎      | 1/3 [03:39<07:19, 219.70s/it]



Decision Tree
Model performence for Training set
- Accuracy: 0.9999
- F1 score: 0.9999
- Precision: 0.9999
- Recall: 0.9999
Model performence for Test set
- Accuracy: 0.4161
- F1 score: 0.4601
- Precision: 0.4161
- Recall: 0.5562




 67%|██████▋   | 2/3 [30:40<17:23, 1043.80s/it]



Random Forest
Model performence for Training set
- Accuracy: 0.9999
- F1 score: 0.9999
- Precision: 0.9999
- Recall: 0.9999
Model performence for Test set
- Accuracy: 0.5446
- F1 score: 0.5678
- Precision: 0.5446
- Recall: 0.6073




100%|██████████| 3/3 [35:32<00:00, 710.93s/it]



Gradient Boosting
Model performence for Training set
- Accuracy: 0.5525
- F1 score: 0.5539
- Precision: 0.5525
- Recall: 0.5702
Model performence for Test set
- Accuracy: 0.5468
- F1 score: 0.5783
- Precision: 0.5468
- Recall: 0.6332







# Hyperparameter tuning

## Searching optimal params for small set

In [23]:
# Optional cap on the number of leading rows used from each frame.
# None means "use every row"; set an integer (e.g. 90000) to subsample.
# It must be defined here because the slices below read it.
num_records = None

# Create the feature matrix X
X = df_small[['reviewText', 'summary']]

# Create the target variable y
y = df_small['overall']

# Split the (optionally capped) small set into training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X.iloc[:num_records], y.iloc[:num_records], test_size=0.2, random_state=42
)

# Similarly, use only the first num_records records for testing
X_test = df_test[['reviewText', 'summary']].iloc[:num_records]
y_test = df_test['overall'].iloc[:num_records]

# Print the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (71056, 2)
y_train shape: (71056,)
X_val shape: (17765, 2)
y_val shape: (17765,)
X_test shape: (23720, 2)
y_test shape: (23720,)


In [24]:
# Vectorization for the tuning split (training / validation / test):
# the token-count vocabulary is learned from the training part only and then
# applied unchanged to the validation and test parts.
vectorizer = CountVectorizer()
transformer = make_column_transformer(
    (vectorizer, 'reviewText'),
    (vectorizer, 'summary'),
)

X_train = transformer.fit_transform(X_train)
X_val, X_test = transformer.transform(X_val), transformer.transform(X_test)

In [25]:
# Candidate values for each tuned hyperparameter (every combination is tried).
n_estimators_list = [50, 200, 500]
min_samples_split_list = [15, 20, 25]
min_samples_leaf_list = [5, 10, 15]

hyperparameter_score_list = []
results = []

# The progress bar tracks the outermost loop only.
for n_estimators in tqdm.tqdm(n_estimators_list):
    for min_samples_split in min_samples_split_list:
        for min_samples_leaf in min_samples_leaf_list:
            # Train one model per combination and score it on the validation part.
            gbt_current = GradientBoostingClassifier(
                n_estimators=n_estimators,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
            ).fit(X_train, y_train)
            y_val_hat = gbt_current.predict(X_val)
            acc = accuracy_score(y_val, y_val_hat)

            # One record per combination; the keys become the column names.
            results.append({
                'Accuracy': acc,
                'n_estimators': n_estimators,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
            })

# Collect all records into a table and show it.
results = pd.DataFrame(results)
print(results)

100%|██████████████████████████████████████████████████████████████████████████████| 3/3 [10:03:08<00:00, 12062.91s/it]


    Accuracy  n_estimators  min_samples_split  min_samples_leaf
0   0.515227            50                 15                 5
1   0.513988            50                 15                10
2   0.513763            50                 15                15
3   0.514382            50                 20                 5
4   0.513988            50                 20                10
5   0.513763            50                 20                15
6   0.516127            50                 25                 5
7   0.514382            50                 25                10
8   0.513763            50                 25                15
9   0.549733           200                 15                 5
10  0.551646           200                 15                10
11  0.548776           200                 15                15
12  0.550802           200                 20                 5
13  0.550239           200                 20                10
14  0.548776           200              

In [26]:
# Extract best parameters: read each tuned setting from the row with the
# highest validation accuracy and cast it back to an integer.
n_estimators_optimal, min_samples_split_optimal, min_samples_leaf_optimal = (
    results.loc[results['Accuracy'].idxmax()][column].astype(int)
    for column in ('n_estimators', 'min_samples_split', 'min_samples_leaf')
)

In [27]:
# Show the selected hyperparameters, one per line.
print(
    n_estimators_optimal,
    min_samples_split_optimal,
    min_samples_leaf_optimal,
    sep='\n',
)

500
25
10


# Training on full set with optimal params

In [32]:
# Vectorization for full set
# Features and labels are rebuilt from the full training frame and the test
# frame here. By this point X / y refer to the small tuning set and X_test has
# already been vectorized, so re-using those variables would train on the wrong
# data (or fail). Reading from the frames also makes this cell safe to re-run.
vectorizer = CountVectorizer()
transformer = make_column_transformer((vectorizer, 'reviewText'), (vectorizer, 'summary'))

y = df['overall']
y_test = df_test['overall']

X = transformer.fit_transform(df[['reviewText', 'summary']])
X_test = transformer.transform(df_test[['reviewText', 'summary']])

In [35]:
# Build the final classifier from the hyperparameters selected during tuning.
gbt_optimal = GradientBoostingClassifier(
    n_estimators=n_estimators_optimal,
    min_samples_split=min_samples_split_optimal,
    min_samples_leaf=min_samples_leaf_optimal,
)

# Train it on the vectorized features X with labels y.
gbt_optimal.fit(X=X, y=y)

In [36]:
# Predict labels for the vectorized test set; scoring happens in a later cell.
y_test_hat = gbt_optimal.predict(X=X_test)

In [41]:
# Predict labels for the data the final model was trained on.
y_train_pred = gbt_optimal.predict(X=X)

## Accuracy, precision, recall, f1-score after final training

In [43]:
# Training-set metrics; F1, precision and recall use the weighted average.
train_accuracy = accuracy_score(y, y_train_pred)
train_f1 = f1_score(y, y_train_pred, average='weighted')
train_precision = precision_score(y, y_train_pred, average='weighted')
train_recall = recall_score(y, y_train_pred, average='weighted')

In [44]:
# Test-set metrics; F1, precision and recall use the weighted average.
test_accuracy = accuracy_score(y_test, y_test_hat)
test_f1 = f1_score(y_test, y_test_hat, average='weighted')
test_precision = precision_score(y_test, y_test_hat, average='weighted')
test_recall = recall_score(y_test, y_test_hat, average='weighted')

In [47]:
# Training-set summary, one metric per line, four decimals each.
print(
    'Model performance for Training set:',
    f"Accuracy: {train_accuracy:.4f}",
    f"F1 score: {train_f1:.4f}",
    f"Precision: {train_precision:.4f}",
    f"Recall: {train_recall:.4f}",
    sep='\n',
)

Model performance for Training set:
Accuracy: 0.5810
F1 score: 0.5776
Precision: 0.5784
Recall: 0.5810


In [48]:
# Test-set summary, one metric per line, four decimals each.
print(
    'Model performance for Test set:',
    f"Accuracy: {test_accuracy:.4f}",
    f"F1 score: {test_f1:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall: {test_recall:.4f}",
    sep='\n',
)

Model performance for Test set:
Accuracy: 0.5510
F1 score: 0.5773
Precision: 0.6407
Recall: 0.5510
