<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Balancing-data" data-toc-modified-id="Balancing-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Balancing data</a></span></li><li><span><a href="#Splitting-to-train-and-test" data-toc-modified-id="Splitting-to-train-and-test-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Splitting to train and test</a></span></li><li><span><a href="#Fitting-classifier" data-toc-modified-id="Fitting-classifier-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Fitting classifier</a></span></li><li><span><a href="#Model-Selection" data-toc-modified-id="Model-Selection-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Selection</a></span></li><li><span><a href="#Grid-Search-for-RandomForestClassifier" data-toc-modified-id="Grid-Search-for-RandomForestClassifier-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Grid Search for RandomForestClassifier</a></span></li></ul></div>

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import auc, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import confusion_matrix
import random
from imblearn.under_sampling import NearMiss
from sklearn.svm import LinearSVC
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [None]:
# Load the dataset from a path relative to the notebook's working directory;
# the first CSV column is used as the DataFrame index.
df_encoded = pd.read_csv('data/df_encoded.csv', index_col=0)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df_encoded.head()

In [None]:
# (rows, columns) of the loaded frame.
df_encoded.shape

In [None]:
# Split the frame into the target (`Revenue_enc`) and the feature matrix.
# NOTE(review): 'PageValues' is excluded from the features as well; the reason
# is not stated in this notebook -- confirm it is intentional.
y = df_encoded['Revenue_enc']
X = df_encoded.drop(columns=['Revenue_enc', 'PageValues'])

print(f'Original dataset shape X: {len(X)}, y: {len(y)}')

## Balancing data

In [None]:
# Balance the classes by under-sampling with NearMiss (default settings).
# NOTE(review): resampling is done before the train/test split, so the test
# set is under-sampled too; consider resampling only the training data.
nr = NearMiss()

# `fit_sample` was deprecated in imbalanced-learn 0.4 and later removed;
# `fit_resample` is the supported name for the same operation.
X_res, y_res = nr.fit_resample(X, y)

# Re-wrap the output so the feature names are kept even when the sampler
# returns plain arrays (older imbalanced-learn releases).
X_res = pd.DataFrame(X_res, columns=X.columns)
y_res = pd.Series(y_res)

print(f'Resampled dataset shape X: {len(X_res)}, y: {len(y_res)}')

## Splitting to train and test

In [None]:
# Hold out 20% of the resampled data for testing; the split is shuffled,
# stratified on the target and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    shuffle=True,
    stratify=y_res,
    random_state=41,
)

## Fitting classifier

In [None]:
# Transformation pipeline for numeric columns. Only a scaler is used here;
# other steps (e.g. missing-value imputation) could be added to the list.
numeric_steps = [('scaler', StandardScaler())]

numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# All columns are expected to be numeric already, so only a 'num' branch is
# defined; a 'cat' branch could be added next to it for categorical columns.
# NOTE(review): columns whose dtype is not int64/float64 are not listed here
# and ColumnTransformer drops unlisted columns by default -- confirm none exist.
numeric_dtypes = ['int64', 'float64']
numeric_features = X_res.select_dtypes(include=numeric_dtypes).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ]
)

The next step is to create a pipeline that combines the preprocessor created above with a classifier.

In [None]:
# Full model: preprocessing followed by a random forest.
# NOTE(review): despite its name, `lsvc` holds a RandomForestClassifier, not a
# LinearSVC; the name is kept because later cells refer to it.
# The forest is seeded (same seed as the train/test split) so that fitting it,
# and the grid search that clones this pipeline, give repeatable results.
lsvc = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=41))])

In [None]:
# Fit the preprocessing + random-forest pipeline on the training split.
lsvc.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
# NOTE(review): `y_pred` is not used by any later cell visible in this notebook.
y_pred = lsvc.predict(X_test)

## Model Selection

Using our pipeline for multiple classifiers

In [None]:
# Candidate models, each evaluated behind the same preprocessing step.
# Estimators with a random component are seeded (same seed as the train/test
# split) so the comparison below is repeatable from run to run.
classifiers = [
    KNeighborsClassifier(3),
    LinearSVC(random_state=41),
    DecisionTreeClassifier(random_state=41),
    RandomForestClassifier(random_state=41),
    ]

for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    print(classifier)
    # Pipeline.score delegates to the final estimator's score method,
    # which is mean accuracy for these classifiers.
    print(f"model score: {pipe.score(X_test, y_test):.3f}")

## Grid Search for RandomForestClassifier

In [None]:
# List the parameter names RandomForestClassifier accepts; the grid below
# addresses them through the pipeline with a 'classifier__' prefix.
RandomForestClassifier().get_params().keys()

In [None]:
# Hyper-parameter grid for the random forest inside the `lsvc` pipeline.
# Keys follow the '<step name>__<parameter>' convention so that GridSearchCV
# routes each value to the 'classifier' step.
param_grid = {
    'classifier__n_estimators': [200, 500],
    # 'auto' was removed from this list: for RandomForestClassifier it meant
    # the same as 'sqrt' (so every combination was fitted twice) and it is
    # rejected by scikit-learn >= 1.3.
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth': [4, 5, 6, 7, 8],
    'classifier__criterion': ['gini', 'entropy'],
}

# GridSearchCV is already imported in the notebook's import cell, so it is
# not re-imported here.
CV = GridSearchCV(lsvc, param_grid, n_jobs=1)

CV.fit(X_train, y_train)
print(CV.best_params_)
print(CV.best_score_)