In [1]:
## I. IMPORTS & INSTALLS
##	a. GradientBoostingClassifier
#!python -m spacy download en_core_web_md
%pip install catboost
%pip install lightgbm
%pip install xgboost

## 	b. MODULES
import numpy as np
import pandas as pd
import seaborn as sns
import re

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm	import LGBMClassifier
import spacy

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV #, HalvingRandomSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from scipy.stats import randint, uniform

#import num2words
%matplotlib inline


Collecting catboost
  Downloading catboost-1.0.4-cp310-none-win_amd64.whl (73.5 MB)
Collecting graphviz
  Downloading graphviz-0.19.1-py3-none-any.whl (46 kB)
Collecting plotly
  Downloading plotly-5.6.0-py2.py3-none-any.whl (27.7 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly, graphviz, catboost
Successfully installed catboost-1.0.4 graphviz-0.19.1 plotly-5.6.0 tenacity-8.0.1
Note: you may need to restart the kernel to use updated packages.
Collecting lightgbmNote: you may need to restart the kernel to use updated packages.
  Downloading lightgbm-3.3.2-py3-none-win_amd64.whl (1.0 MB)

Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2
Note: you may need to restart the kernel to use updated packages.


  from pandas import MultiIndex, Int64Index


In [2]:

##	c. DATA
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')


In [9]:

## II. EDA
train.head()
# train.shape # >> (4087, 3)
# test.shape # >> (1022, 2)


Unnamed: 0,id,description,ratingCategory
0,1321,sometimes when whisky is batched a few leftove...,1
1,3861,an uncommon exclusive bottling of a year old c...,0
2,655,this release is a port version of amrut s inte...,1
3,555,this year old single cask was aged in a sherry...,1
4,1965,quite herbal on the nose with aromas of dried ...,1


In [10]:

## III. DEFINE TERMS
target = 'ratingCategory'
features = 'description'
X = train[features]
y = train[target]
assert len(X) == len(y)


In [3]:
def clean_doc(text):
	text = text.replace('\\n', ' ') # Remove new line chars, might need to make something like `\\r\n`. 
	text = re.sub('[^a-zA-Z ]', ' ', text) # Remove numbers.
	text = re.sub('[ ]{2,}', ' ', text) # Remove multiple white spaces.
	return text.lower().strip() # Might need `.lstrip().rstrip()`.
print(train['description'][0]) # Before.
train['description'] = train['description'].apply(clean_doc) # Pandas `apply` method goes through each element.
test['description'] = test['description'].apply(clean_doc)
print(train['description'][0]) # After.



Sometimes, when whisky is batched, a few leftover barrels are returned to the warehouse. Canadian Club recently pulled and vatted several of these from the 1970s. Acetone, Granny Smith apples, and fresh-cut white cedar showcase this long age. Complex and spicy, yet reserved, this dram is ripe with strawberries, canned pears, cloves, pepper, and faint flowers, then slightly pulling oak tannins. Distinct, elegant, and remarkably vibrant, this ancient Canadian Club is anything but tired. (Australia only) A$133
sometimes when whisky is batched a few leftover barrels are returned to the warehouse canadian club recently pulled and vatted several of these from the s acetone granny smith apples and fresh cut white cedar showcase this long age complex and spicy yet reserved this dram is ripe with strawberries canned pears cloves pepper and faint flowers then slightly pulling oak tannins distinct elegant and remarkably vibrant this ancient canadian club is anything but tired australia only a


In [11]:
##	b. SPLIT INTO FEATURE MATRIX AND TARGET VECTOR
X_test = test['description']
print(np.sort(np.unique(y))) # >> [0 1 2]
print(X.shape) # >> (4087,)
print(X_test.shape) # >> (1022,)
print(np.histogram(y, bins=[0.5, 1.5, 2.5, 3.5, 4.5])[0]) # >> [2881   65    0    0] # Check origin to find what each unique category these are # They were types not ratings in solution.


[0 1 2]
(4087,)
(1022,)
[2881   65    0    0]


In [17]:

## IV. PIPELINE
## 	a. DEFINE PIPELINE COMPONENTS
max_features = 500  # Speeds training compared to 2000
vect = TfidfVectorizer(stop_words = 'english', max_features = max_features, ngram_range = (1, 2)) # ngrams = (unigrams(one word), bigrams(two consecutive words).
clf = LGBMClassifier(learning_rate = 0.1, max_depth = -5, random_state = 42) # clf = classifier. LGBM = Light Gradient Boosting Classifier. Learning rate is rquired and recommended to be `0.1`.
pipe = Pipeline([('vect', vect), ('clf', clf)]) # vect = vectorizer, and comes first. 'vect' can be any name.

##	b. HYPERPARAMETERS
parameters = { # To specify parameter values `parameters` is a dictionary. Inside is a list of Keys and Values.'vect__max_features': ([500]), # Tried to 2000 without difference so probably good.  This is independent I think.
	'vect__analyzer': (['word']), # or 'characters' or other options.
	'vect__max_df': ([0.6]), # Independent like max features. Tried between 0.5 and 1.0. It means if words are more frequent than this you don't include them. This gets rid of common words.
	'clf__max_depth': ([10]), # These are the classifier parameters. GB algorithms are like RandomForest or Decision Trees, having max depth (of the tree) and...
	'clf__n_estimators': ([500]), # ... which is the number of trees.
	'clf__learning_rate': ([0.1])#, # Unique to GB. These last 3 are interconnected. Others can be trained one at a time.  Read docs to understand how to tune.
	# 'clf__num_leaves': ([50]), # Leaves are at the bottom of the decision tree. 
	# 'clf__min_data_in_leaf': ([30]) # Min data points in each leaf to prevent overfitting.
}


In [18]:

##	c. CROSS VALIDATION
grid_search = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, verbose=2) # Alternatively can use RandomizedSearchCV or HalvingRandomSearchCV. This is just running Cross Validation.  Its not really a Grid Search because theres only one set of parameters for each setting above. And fitting.
grid_search.fit(X, y)
##	d. PRINT RESULTS
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
{'clf__learning_rate': 0.1, 'clf__max_depth': 10, 'clf__n_estimators': 500, 'vect__analyzer': 'word', 'vect__max_df': 0.6}
0.7230259257224265


In [16]:
## V. MAKE PREDICTIONS
pred = grid_search.predict(test['description'])
submission = pd.DataFrame({'id': test['id'], 'ratingCategory':pred}) # Pandas DataFrame create from a dictionary technique.
submission['ratingCategory'] = submission['ratingCategory'].astype('int64')
submission.head() # Create categories and convert to integers.
submission_number = 4
submission.to_csv(f'submission{submission_number}.csv', index = False)