# Ensemble Models
## Hyperparameter Tuning

## Imports

In [None]:
## Basic Imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set()

In [None]:
# NLP processing
# Load the 'en_core_web_sm' spaCy pipeline once at module level; the `lemma`
# helper calls this `nlp` object on every text it receives.
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# sklearn models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Constants

In [None]:
# Location of the sampled reviews CSV
SAMPLE_PATH = '../input/amazon-reviews-2018-electronics/electronics_sample.csv'

# Column name -> dtype mapping handed to pandas when the CSV is read
DTYPES = dict(
    overall=np.int16,
    vote=np.int64,
    verified=bool,
    reviewText=object,
    summary=object,
)

## Functions

In [None]:
# preprocessing text
def lemma(text):
    """Return the lemmas of the alphabetic tokens in ``text``, space-joined.

    The text is run through the module-level ``nlp`` pipeline. Tokens that
    are not purely alphabetic, or whose lemma is the ``'-PRON-'``
    placeholder, are left out of the result.
    """
    kept = []
    for token in nlp(text):
        if not token.is_alpha:
            continue
        if token.lemma_ == '-PRON-':
            continue
        kept.append(token.lemma_)
    return " ".join(kept)

## Load data

In [None]:
## Loading Data
# Read the sample with the declared dtypes, parse the column at position 2 as
# dates, and discard every row that contains a missing value.
df = pd.read_csv(
    SAMPLE_PATH,
    dtype=DTYPES,
    parse_dates=[2],
).dropna()
df.head()

In [None]:
# Summary of the loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Bar chart of how many reviews carry each `overall` value
sns.countplot(data=df, x='overall')
plt.show()

In [None]:
# Getting rid of reviews that spell out the number of stars (they leak the
# label into the text) and of reviews with five words or fewer.
# One case-insensitive pattern replaces five case-sensitive scans, so that
# capitalised mentions such as "Five stars" or "ONE STAR" are dropped too.
# The group is non-capturing to avoid pandas' "match groups" warning.
STAR_PATTERN = r'(?:one|two|three|four|five) star'
mentions_stars = df.reviewText.str.contains(STAR_PATTERN, case=False, regex=True)
long_enough = df.reviewText.str.split().str.len() > 5

mask = ~mentions_stars & long_enough
df = df[mask]
df.info()

## Baseline Model

In [None]:
# Features: the raw review text
X = df.reviewText
# Target: the `overall` column shifted down by one
y = df.overall - 1

In [None]:
# Hold out 20% of the rows for testing, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# TF-IDF features: English stop words removed, unigrams only, min_df=0.01
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=0.01,
    max_df=1.0,
)
# The vocabulary is learned on the training split and reused for the test split
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# `get_feature_names()` was removed in scikit-learn 1.2. The fitted
# `vocabulary_` mapping holds one entry per feature and exists in every
# release, so its length is the vocabulary size.
print('Vocabulary Size: ', len(tfidf.vocabulary_))

In [None]:
%%time
gb = GradientBoostingClassifier(n_estimators=50, max_depth=20)
gb.fit(X_train_tfidf, y_train)
gb.score(X_test_tfidf, y_test)

In [None]:
# Baseline predictions for the test split (used by the report and confusion matrix)
y_test_pred_gb = gb.predict(X_test_tfidf)

In [None]:
# Classification report as a DataFrame, transposed so each label is a row
cl_report_gb = pd.DataFrame(
    classification_report(y_test, y_test_pred_gb, output_dict=True)
).transpose()
display(cl_report_gb)

In [None]:
# Confusion matrix normalised over the true labels, drawn as an annotated heatmap
cm_gb = confusion_matrix(y_test, y_test_pred_gb, normalize='true')
sns.heatmap(
    cm_gb,
    annot=True,
    fmt='.2f',
    cmap='Greens',
    xticklabels=range(1, 6),
    yticklabels=range(1, 6),
)
plt.show()

## Hyperparameter Tuning
## Informed Search - Coarse to Fine 
### Randomized Search to Grid Search

In [None]:
# Creating Pipeline
# Vectorizer and classifier are chained so both can be tuned together.
# The classifier's random_state is pinned so that score differences between
# search candidates come from the hyperparameters rather than from the seed.
steps = [('vectorizer', TfidfVectorizer()),
         ('classifier', GradientBoostingClassifier(random_state=42))]

pipe = Pipeline(steps)

In [None]:
# Search space for the pipeline; each key is `<step name>__<parameter>`.
params = dict(
    # 'vectorizer': [TfidfVectorizer(), CountVectorizer()],
    vectorizer__max_df=[0.8, 0.9, 0.99, 1.0],
    vectorizer__min_df=[0.001, 0.01, 0.0],
    vectorizer__max_features=[1000, 10000, None],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    # 'classifier': [RandomForestClassifier(), GradientBoostingClassifier()],
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[5, 8, 10, 20, 50, 100],
    classifier__max_features=[0.4, 0.6, 0.8, 1.0],
    classifier__subsample=[0.4, 0.6, 0.8, 1.0],
)

In [None]:
# Coarse pass: draw 50 candidates from `params`, each scored with 3-fold CV.
# random_state fixes which candidates are drawn, so the search can be repeated
# and its results compared between runs.
search = RandomizedSearchCV(pipe,
                            param_distributions=params,
                            n_iter=50,
                            cv=3,
                            random_state=42)

In [None]:
# Run the search on the first 2,000 training rows only
search.fit(X_train[:2_000], y_train[:2_000])

In [None]:
# Hyperparameter combination selected by the randomized search
search.best_params_

In [None]:
# Save the per-candidate cross-validation results of the search to CSV
result = pd.DataFrame(search.cv_results_)
result.to_csv('rand_search_result.csv')