# Imports

## Libraries

In [None]:
import random #can delete thsi when have sosas csv

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import auc, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV

In [None]:
# Default size (in inches) for every figure created after this cell
plt.rc("figure", figsize=(10, 10))

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

## Functions

In [None]:
#### ?
# check if can have log loss for training and if it's ok to do it like this for train and test data

In [None]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    """Fit ``model`` and summarise its scores on the training and test data.

    The 'training' row holds the mean 5-fold stratified cross-validation score
    of each metric on ``(X_train, y_train)``. The 'test' row holds the scores of
    the model fitted on the whole training set and evaluated once on the
    held-out ``(X_test, y_test)``.

    Parameters
    ----------
    model : estimator
        Classifier with ``fit`` and ``predict``. ``predict_proba`` (preferred)
        or ``decision_function`` is used for ROC AUC; log loss needs
        ``predict_proba``.
    X_train, X_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Binary targets of the two splits (the 'precision', 'recall', 'f1' and
        'roc_auc' scorers used here are binary scorers).
    name : str
        Label written to the 'model' column.

    Returns
    -------
    pandas.DataFrame
        Two rows ('training', 'test') with the columns data, model, accuracy,
        precision, recall, f1score, rocauc and logloss. Entries that cannot be
        computed are the string 'N/A' (log loss of the training row, and
        probability-based test metrics of models lacking the needed method).
    """
    # Seeded: with shuffle=True an unseeded splitter draws new folds on every
    # cross_val_score call, so each metric would be scored on different folds.
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=41)

    # The caller's model ends up fitted on the training split; cross_val_score
    # below works on clones and does not disturb it.
    model.fit(X_train, y_train)

    # report column -> scikit-learn scorer name
    cv_scorers = {'accuracy'  : 'accuracy',
                  'precision' : 'precision',
                  'recall'    : 'recall',
                  'f1score'   : 'f1',
                  'rocauc'    : 'roc_auc'}
    train_scores = {column: np.mean(cross_val_score(model, X_train, y_train,
                                                    cv=strat_k_fold, scoring=scorer))
                    for column, scorer in cv_scorers.items()}

    df_model_train = pd.DataFrame({'data'         : ['training'],
                                   'model'        : [name],
                                   'accuracy'     : [train_scores['accuracy']],
                                   'precision'    : [train_scores['precision']],
                                   'recall'       : [train_scores['recall']],
                                   'f1score'      : [train_scores['f1score']],
                                   'rocauc'       : [train_scores['rocauc']],
                                   'logloss'      : ['N/A']})

    # Test metrics: evaluate the model that was fitted on the training split.
    # Cross-validating on (X_test, y_test) would refit clones on test folds and
    # never measure the trained model.
    y_pred = model.predict(X_test)
    accuracy     = accuracy_score(y_test, y_pred)
    precision    = precision_score(y_test, y_pred)
    recall       = recall_score(y_test, y_pred)
    f1score      = f1_score(y_test, y_pred)

    # ROC AUC and log loss need continuous scores, not hard class labels.
    if hasattr(model, 'predict_proba'):
        proba    = model.predict_proba(X_test)
        rocauc   = roc_auc_score(y_test, proba[:, 1])   # column 1 = second (positive) class
        logloss  = log_loss(y_test, proba, labels=model.classes_)
    elif hasattr(model, 'decision_function'):
        # e.g. LinearSVC: a ranking score exists but no probabilities
        rocauc   = roc_auc_score(y_test, model.decision_function(X_test))
        logloss  = 'N/A'
    else:
        rocauc   = 'N/A'
        logloss  = 'N/A'

    # TODO: add the time taken, to be used for comparison later
    df_model_test = pd.DataFrame({'data'         : ['test'],
                                  'model'        : [name],
                                  'accuracy'     : [accuracy],
                                  'precision'    : [precision],
                                  'recall'       : [recall],
                                  'f1score'      : [f1score],
                                  'rocauc'       : [rocauc],
                                  'logloss'      : [logloss]})

    df_model = pd.concat([df_model_train, df_model_test])

    return df_model

## Data

In [None]:
# Importing clean dataframe (with NaNs filled)

#df = pd.read_csv('data/xx.csv')

In [None]:
###### TODO: delete this step once the dataframe from sosa is available

# The real dataframe is not available yet, so the old one is loaded instead

df = pd.read_csv('data/clean_tf.csv')

# DataFrame.ffill() replaces the deprecated fillna(method='ffill')
df = df.ffill()
df = df.drop('Unnamed: 0', axis=1)

# Column of uniform random numbers in [0, 1), one per row
# (presumably a noise baseline for the feature-importance plots -- TODO confirm).
# A dedicated seeded generator keeps the column identical between runs.
rng = random.Random(41)
rand = [rng.random() for _ in range(len(df))]
df['random'] = rand

In [None]:
# Preview the first rows to check that the data was imported correctly

df.head()

In [None]:
# Check the dtype of every column (shows which columns still need encoding)

df.dtypes

# Prepare Training Data

##TODO:
Write a function that works with different train/test splits, different scaling methods and different balancing methods.
Output: a descriptive dataframe.

## Data Encoding

We encode the categorical variables so that they can be used in the machine learning models.


### Label Encoder

Using Label Encoder for the columns Revenue and Weekend because these columns have only two categories.

In [None]:
# A single LabelEncoder instance is shared; fit_transform refits it on every column
labelencoder = LabelEncoder()

# Two-category columns to encode
cols_label_enc = ['Revenue', 'Weekend']

# Store each encoded column next to its source as '<column>_enc'
for col in cols_label_enc:
    encoded_name = col + '_enc'
    df[encoded_name] = labelencoder.fit_transform(df[col])

### OneHotEncoder

We are using OneHotEncoder for Month, OperatingSystems, Region, TrafficType and Browser columns because they have more than two categories.

In [None]:
appended_dfs = []

columns = ['Month', 'OperatingSystems', 'Region', 'Browser', 'TrafficType']

for col in columns:
    # Creating instance of one-hot-encoder
    enc = OneHotEncoder(handle_unknown='ignore')
    # Fitting on the bare values (no column name), so the generated feature
    # names keep the generic 'x0_<category>' form on every scikit-learn version
    encoded = enc.fit_transform(df[[col]].to_numpy())
    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); support both
    if hasattr(enc, 'get_feature_names_out'):
        feature_names = enc.get_feature_names_out()
    else:
        feature_names = enc.get_feature_names()
    # Column names are '<feature name><source column>', e.g. 'x0_<category>Month';
    # the index is set explicitly so the join below aligns row by row
    enc_df = pd.DataFrame(encoded.toarray(),
                          index=df.index,
                          columns=[f'{feature}{col}' for feature in feature_names])
    # Appending all new dfs to one list
    appended_dfs.append(enc_df)
# Merging with the main df
df_encoded = df.join(appended_dfs)
df_encoded.head()

Deleting the columns that were encoded:

In [None]:
# Original categorical columns to remove now that encoded versions exist.
# NOTE(review): VisitorType is dropped although no encoded replacement is
# created in the visible cells -- confirm this is intended.
cols_drop = [
    'Revenue', 'Weekend',
    'Month', 'OperatingSystems', 'Region', 'Browser', 'TrafficType',
    'VisitorType',
]
df_encoded = df_encoded.drop(columns=cols_drop)

In [None]:
# Preview the encoded dataframe to check that the encoding worked as expected

df_encoded.head()

In [None]:
# Check the dtype of every column that remains after encoding
df_encoded.dtypes

### Exporting encoded data to csv

In [None]:
# Save the encoded dataframe to disk.
# NOTE(review): index=True also writes the row index as the first column;
# presumably it comes back as 'Unnamed: 0' when the file is read -- confirm
# whether the index is needed in the export.
df_encoded.to_csv("data/df_encoded.csv", index=True)

## Data Split

Splitting the data into: <br> X - independent variables <br> y - dependent variable (outcome) <br><br>PageValues is removed from the independent variables because it depends too strongly on the outcome, so it cannot be used in the prediction.

In [None]:
# Features: every encoded column except the target and PageValues
X = df_encoded.drop(columns=['Revenue_enc', 'PageValues'])
# Target: the label-encoded Revenue column
y = df_encoded['Revenue_enc']

print(f'Original dataset shape X: {len(X)}, y: {len(y)}')
print(f'Original split between True and False:\n{y.value_counts()}')

# The target is passed by keyword: newer seaborn releases reserve the first
# positional argument for `data`, so a bare positional vector no longer means x
sns.countplot(x=y)
plt.show()

## Data Balancing (Undersampling)

As identified in the EDA in the previous step, our data is imbalanced. We will use undersampling, which is the most convenient method for our dataset.

### NearMiss

In [None]:
#### Explain why NearMiss

In [None]:
# NOTE(review): the undersampling is applied to the whole dataset before the
# train/test split below, so the test split is rebalanced as well; consider
# resampling only the training portion.
nr = NearMiss()
# fit_resample() is the current name; the fit_sample() alias was removed from
# newer imbalanced-learn releases
X, y = nr.fit_resample(X, y)

print(f'Resampled dataset shape X: {len(X)}, Y: {len(y)}')
print(f'Resampled split between True and False:\n{pd.Series(y).value_counts()}')

# The target is passed by keyword: newer seaborn releases reserve the first
# positional argument for `data`
sns.countplot(x=y)
plt.show()

## Selecting test and train data

In [None]:
# 70 % train / 30 % test, shuffled and stratified on the target, with a fixed
# seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=41,
)

We split the data into training and test sets, where X holds the feature values and y the target column.

- With shuffle, we set whether or not to shuffle the data before splitting (default: True).
- With stratify, we split the data stratifying by the labels (default: None).

# Scaling

Scaling allows us to standardize the numerical values of our dataset: each feature is centered on its mean and scaled to unit variance.

## StandardScaler

In [None]:
# Learn the scaling statistics from the training split only ...
std_scale = StandardScaler().fit(X_train)

# ... and apply the same transformation to both splits
X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)

# Modeling

## K-Nearest Neighbors

In [None]:
# Description

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-NN classifier with a fixed number of neighbours
neighbors = 3
knn = KNeighborsClassifier(n_neighbors=neighbors).fit(X_train, y_train)

# Hard class predictions for both splits (y_pred_train is not used in this cell)
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

print(f'Confussion Matrix for test data:\n{confusion_matrix(y_test, y_pred_test)}')
print(f'\nClassification report for test data:\n{classification_report(y_test, y_pred_test)}')

# Summary table of the scores on training and test data
knn_results = baseline_report(knn, X_train, X_test, y_train, y_test, 'KNeighborsClassifier')
knn_results

In [None]:
# Explain results

##### Hyperparameter Tuning

## Logistic Regression

In [None]:
#description

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default hyperparameters
logreg = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for both splits (y_pred_train is not used in this cell)
y_pred_train = logreg.predict(X_train)
y_pred_test = logreg.predict(X_test)

print(f'Confussion Matrix for test data:\n{confusion_matrix(y_test, y_pred_test)}')
print(f'\nClassification report for test data:\n{classification_report(y_test, y_pred_test)}')

# Summary table of the scores on training and test data
logreg_results = baseline_report(logreg, X_train, X_test, y_train, y_test, 'LogisticRegression')
logreg_results

In [None]:
# Explain results

##### Hyperparameter Tuning

## Decision Tree

In [None]:
# Description

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters. The seed (same value as
# the train/test split) makes the fitted tree, and therefore the scores and
# feature importances, reproducible between runs.
dectree = DecisionTreeClassifier(random_state=41)

dectree.fit(X_train, y_train)

# Hard class predictions for both splits (y_pred_train is not used in this cell)
y_pred_test = dectree.predict(X_test)
y_pred_train = dectree.predict(X_train)

print(f'Confussion Matrix for test data:\n{confusion_matrix(y_test, y_pred_test)}')
print(f'\nClassification report for test data:\n{classification_report(y_test, y_pred_test)}')

# Summary table of the scores on training and test data
dectree_results = baseline_report(dectree, X_train, X_test, y_train, y_test, 'DecisionTreeClassifier')
dectree_results

### Feature importance

In [None]:
# NOTE(review): plot_feature_importances is defined further down, in the
# Random Forest section; on a fresh kernel that cell has to run before this one.
# Consider moving the definition to the Functions section.

# The labels must be the columns the model was trained on: df_encoded still
# contains the target and PageValues, which were removed when X was built, and
# passing it unchanged shifts the labels against the importances.
plot_feature_importances(dectree, df_encoded.drop(columns=['Revenue_enc', 'PageValues']))
plt.savefig('feature_importance_dectree')

In [None]:
# Explain results

##### Hyperparameter Tuning

## Random Forest

In [None]:
# Description

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters. The seed (same value as
# the train/test split) makes the forest, and therefore the scores and feature
# importances, reproducible between runs.
rfor = RandomForestClassifier(random_state=41)
rfor.fit(X_train, y_train)

# Hard class predictions for both splits (y_pred_train is not used in this cell)
y_pred_test = rfor.predict(X_test)
y_pred_train = rfor.predict(X_train)

print(f'Confussion Matrix for test data:\n{confusion_matrix(y_test, y_pred_test)}')
print(f'\nClassification report for test data:\n{classification_report(y_test, y_pred_test)}')

# Summary table of the scores on training and test data
rfor_results = baseline_report(rfor, X_train, X_test, y_train, y_test, 'RandomForestClassifier')
rfor_results

### Feature importance

We're defining a function to visualize the most important features defined by Random Forest:

In [None]:
###### ??

# Should I move this to functions?

In [None]:
def plot_feature_importances(model, df):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    Bars are ordered by ascending importance (least important at the bottom) on
    a new 15x15 inch figure. The model is left untouched.

    Parameters
    ----------
    model : fitted estimator
        Must expose a ``feature_importances_`` array.
    df : pandas.DataFrame
        Supplies the bar labels: column ``i`` labels importance ``i``. Pass
        exactly the feature columns the model was trained on, in training
        order; any extra column (e.g. the target) shifts the labels.
    """
    # Work on a copy: sorting the model's own array in place would scramble the
    # importances stored on estimators that keep them as a plain attribute.
    importances = np.array(model.feature_importances_)
    order = importances.argsort()
    names = df.columns[order]
    n_features = len(names)

    plt.figure(figsize=(15,15))
    plt.barh(range(n_features), importances[order], align='center')
    plt.yticks(np.arange(n_features), names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)

In [None]:
# The labels must be the columns the model was trained on: df_encoded still
# contains the target and PageValues, which were removed when X was built, and
# passing it unchanged shifts the labels against the importances.
plot_feature_importances(rfor, df_encoded.drop(columns=['Revenue_enc', 'PageValues']))
plt.savefig('feature_importance_rfor')

In [None]:
# Explain results

# can choose to split where we see elbow or after random

#strange that anypage has such a low performance, while each page alone have higher.
#check if it's done ok, if it's not redundant info
#check what we put under browser 99 when did transformation, why other has higher importance??

#explanation of alberto, to change
"""
Similarly to the single decision tree, the random forest also gives a lot of importance to the “Glucose” feature, but it also chooses “BMI” to be the 2nd most informative feature overall. The randomness in building the random forest forces the algorithm to consider many possible explanations, the result being that the random forest captures a much broader picture of the data than a single tree."""

### Hyperparameter Tuning

In [None]:
# List the hyperparameters of a default RandomForestClassifier (candidates for tuning)
RandomForestClassifier().get_params()

In [None]:
# Print the GridSearchCV documentation (development aid; the output is long)
help(GridSearchCV)

In [None]:
# TODO: check which other options the grid has, and other approaches to hyperparameter tuning
# TODO: decide whether to pass all parameters to the grid or only some

# 'auto' is left out of max_features: for classifiers it was an alias of 'sqrt'
# (so it only duplicated work) and newer scikit-learn releases reject it.
param_grid = {'n_estimators': [200, 500],
              'max_features': ['sqrt', 'log2'],
              'max_depth' : [4,5,6,7,8],
              'criterion' :['gini', 'entropy']}

# Seeded forest, so the selected parameters do not change from run to run
grid = GridSearchCV(RandomForestClassifier(random_state=41), param_grid, n_jobs= 1)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
# Evaluate the model with best parameters

# NOTE(review): the label is the same 'RandomForestClassifier' used for the
# untuned forest, so the two reports cannot be told apart by the 'model' column.
baseline_report(grid.best_estimator_, X_train, X_test, y_train, y_test, 'RandomForestClassifier')

In [None]:
# The same cross validation done manually, to compare with the report function

# Cross validation of the grid search's best_estimator, passing StratifiedKFold
# explicitly. Note that this cross-validates on the test split: the estimator is
# cloned and refitted on the test folds.

# Seeded so the folds, and therefore the score, are reproducible between runs
strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=41)
initial_score = cross_val_score(grid.best_estimator_, X_test, y_test, cv=strat_k_fold, scoring='f1').mean()
# The metric being scored is F1, not accuracy
print("Mean cross-validated F1 : {} ".format(initial_score))

In [None]:
# Multiple metrics in one cross-validation run; should match the report function,
# check the difference in settings

from sklearn.model_selection import cross_validate

initial_score = cross_validate(
    grid.best_estimator_,
    X_test,
    y_test,
    cv=strat_k_fold,
    scoring=('f1', 'accuracy'),
)
# cross_validate stores the per-fold scores under 'test_<metric>'
for metric in ('f1', 'accuracy'):
    print(initial_score['test_' + metric].mean())

In [None]:
# info about scores: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

## Linear Support Vector Classification

In [None]:
# Description

In [None]:
from sklearn.svm import LinearSVC

# Baseline linear SVM with default hyperparameters
linearsvc = LinearSVC().fit(X_train, y_train)

# Hard class predictions for both splits (y_pred_train is not used in this cell)
y_pred_test = linearsvc.predict(X_test)
y_pred_train = linearsvc.predict(X_train)

print(f'Confussion Matrix for test data:\n{confusion_matrix(y_test, y_pred_test)}')
print(f'\nClassification report for test data:\n{classification_report(y_test, y_pred_test)}')

# Summary table of the scores on training and test data
linearsvc_results = baseline_report(linearsvc, X_train, X_test, y_train, y_test, 'LinearSVC')
linearsvc_results

In [None]:
# Visualizing the confusion matrix of the most recent test predictions

# fmt='d' prints the cell counts as plain integers; the default annotation
# format would show counts of 100 or more in scientific notation
sns.heatmap(confusion_matrix(y_test, y_pred_test), annot=True, fmt='d')
plt.show()

In [None]:
# Explain results

##### Hyperparameter Tuning

## Gaussian Naive Bayes

In [None]:
# Description

In [None]:
from sklearn.naive_bayes import GaussianNB

# Baseline Gaussian naive Bayes with default hyperparameters
naive_b = GaussianNB().fit(X_train, y_train)

# Hard class predictions for both splits (y_pred_train is not used in this cell)
y_pred_test = naive_b.predict(X_test)
y_pred_train = naive_b.predict(X_train)

print(f'Confussion Matrix for test data:\n{confusion_matrix(y_test, y_pred_test)}')
print(f'\nClassification report for test data:\n{classification_report(y_test, y_pred_test)}')

# Summary table of the scores on training and test data
naive_b_results = baseline_report(naive_b, X_train, X_test, y_train, y_test, 'GaussianNB')
naive_b_results

In [None]:
# Explain results

##### Hyperparameter Tuning