<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Balancing-data" data-toc-modified-id="Balancing-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Balancing data</a></span></li><li><span><a href="#Splitting-to-train-and-test" data-toc-modified-id="Splitting-to-train-and-test-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Splitting to train and test</a></span></li><li><span><a href="#Fitting-classifier" data-toc-modified-id="Fitting-classifier-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Fitting classifier</a></span></li><li><span><a href="#Model-Selection" data-toc-modified-id="Model-Selection-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Selection</a></span></li><li><span><a href="#Grid-Search-for-RandomForestClassifier" data-toc-modified-id="Grid-Search-for-RandomForestClassifier-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Grid Search for RandomForestClassifier</a></span></li></ul></div>

In [11]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import auc, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import confusion_matrix
import random
from imblearn.under_sampling import NearMiss
from sklearn.svm import LinearSVC
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [100]:
df_encoded = pd.read_csv('data/df_encoded.csv', index_col=0)

In [80]:
df_encoded.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,x0_4.0TrafficType,x0_5.0TrafficType,x0_6.0TrafficType,x0_8.0TrafficType,x0_9.0TrafficType,x0_10.0TrafficType,x0_11.0TrafficType,x0_13.0TrafficType,x0_20.0TrafficType,x0_99.0TrafficType
0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,2.0,64.0,0.0,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,64.0,0.2,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,2.666667,0.05,0.14,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
df_encoded.shape

(12330, 59)

In [8]:
df_encoded.dtypes

Administrative             float64
Administrative_Duration    float64
Informational              float64
Informational_Duration     float64
ProductRelated             float64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
AnyPage                    float64
Visitor_isReturning        float64
Revenue_enc                  int64
Weekend_enc                  int64
x0_AugMonth                float64
x0_DecMonth                float64
x0_FebMonth                float64
x0_JulMonth                float64
x0_JuneMonth               float64
x0_MarMonth                float64
x0_MayMonth                float64
x0_NovMonth                float64
x0_OctMonth                float64
x0_SepMonth                float64
x0_1.0OperatingSystems     float64
x0_2.0OperatingSystems     float64
x0_3.0OperatingSystems     float64
x0_4.0OperatingSystems     float64
x0_99.0OperatingSyst

In [101]:
X = df_encoded.drop(columns=['Revenue_enc', 'PageValues'])
y = df_encoded['Revenue_enc']

print(f'Original dataset shape X: {len(X)}, y: {len(y)}')

Original dataset shape X: 12330, y: 12330


## Balancing data

In [109]:
nr = NearMiss()
X_res, y_res = nr.fit_sample(X, y)

X_res=pd.DataFrame(X_res, columns=X.columns) 
y_res=pd.Series(y_res) 

print(f'Resampled dataset shape X: {len(X_res)}, y: {len(y_res)}')

Resampled dataset shape X: 3816, y: 3816


## Splitting to train and test

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.2, shuffle=True, stratify= y_res, random_state = 41)

## Fitting classifier

In [112]:
# Pipeline for transfromations, here I only add scaler, could be also missing value imputation

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [135]:
# In our case all the columns are already numeric, but here we could have 'num' and 'cat'

numeric_features = X_res.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

The next step is to create a pipeline that combines the preprocessor created above with a classifier.

In [129]:
lsvc = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [130]:
lsvc.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_D...
                 RandomForestClassifier(boo

In [131]:
y_pred = lsvc.predict(X_test)

## Model Selection

Using our pipeline for multiple classifiers

In [140]:
classifiers = [
    KNeighborsClassifier(3),
    LinearSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    ]

for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
model score: 0.758
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
model score: 0.929
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
model score: 0.927
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_fe

## Grid Search for RandomForestClassifier

In [124]:
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [133]:
param_grid = { 
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [4,5,6,7,8],
    'classifier__criterion' :['gini', 'entropy']}

from sklearn.model_selection import GridSearchCV
CV = GridSearchCV(lsvc, param_grid, n_jobs= 1)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)    
print(CV.best_score_)

{'classifier__criterion': 'entropy', 'classifier__max_depth': 8, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 500}
0.9393840104849279
