#  **Prediction PART**

**At this stage, we developed three ML models to predict whether a visitor to the e-commerce page will make a purchase or not. 
These models can then help to find the right consumers - those who intend to purchase - by analyzing the data of potential consumers in real time, so that direct marketing strategies can be pushed to those people.**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.metrics import classification_report
# Load the online shoppers intention CSV from the read-only input directory into a DataFrame.
df = pd.read_csv("../input/online-shoppers-intention/online_shoppers_intention.csv")

**Preprocessing**
> We will first transform the categorical variables into numeric variables, then split our data into train and test parts. Then we will plot the distribution of the two classes in the df


In [None]:
def preprocessing(df, columns):
    """Clean the raw data and encode its text categories as integers.

    Rows containing any missing value are dropped, the visitor-type labels
    are mapped to 1-3 and the month abbreviations to 1-12. The replacement
    is applied to the whole frame, so any cell holding one of these exact
    strings is recoded, whatever column it sits in.

    The input frame is never modified: every step works on a new object, so
    the caller keeps its raw data and only the returned frame is cleaned.

    Args:
        df: Raw DataFrame to clean.
        columns: Column labels to keep, in that order. An empty sequence
            (or None) keeps every column.

    Returns:
        A new cleaned DataFrame, restricted to ``columns`` when provided.
    """
    visitor_labels = ['Returning_Visitor', 'New_Visitor', 'Other']
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    # Missing rows are removed before the column selection, so a gap in a
    # column that is later discarded still removes the row.
    cleaned = df.dropna()
    cleaned = cleaned.replace(visitor_labels, [1, 2, 3])
    cleaned = cleaned.replace(month_labels, list(range(1, 13)))

    # Explicit emptiness test: works for lists, tuples, pandas Index and None.
    if columns is not None and len(columns) > 0:
        cleaned = cleaned[columns]
    return cleaned

def Split(df):
    """Split the data into standardized train and test sets.

    The 'Revenue' column is the target; every other column is a feature.
    20% of the rows are held out for testing with a fixed random seed.
    The features are standardized with a scaler fitted on the training rows
    only and then applied to both parts, so no statistic of the test set
    influences training.

    Args:
        df: Preprocessed DataFrame containing a 'Revenue' column and
            feature columns that StandardScaler can process.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)``. The two feature sets
        are DataFrames that keep the original column names and row index;
        the two targets are the matching 'Revenue' Series.
    """
    X = df.drop('Revenue', axis=1)
    y = df['Revenue']
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=0.2, random_state=1234)

    # Fit on the training rows only, then reuse the same mean/scale for the
    # test rows.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train),
                           columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test),
                          columns=X_test.columns, index=X_test.index)
    return X_train, X_test, Y_train, Y_test
 
# Clean the full dataset (an empty column list keeps every column), split it,
# then chart how many rows fall in each Revenue class.
df = preprocessing(df, [])
X_train, X_test, Y_train, Y_test = Split(df)
revenue_counts = df['Revenue'].value_counts()
revenue_counts.plot.bar()


> We observe that the data are imbalanced. There are many more 'Revenue = False' observations than 'Revenue = True'. We will therefore have to take this into account in our models.

# > **BASELINE LOGISTIC REGRESSION**

> * The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).
> * `C`: float, default=1.0. Inverse of regularization strength; must be a positive float --> smaller values specify stronger regularization.

In [None]:
# Baseline: logistic regression with class weights balanced to offset the
# imbalance between the two Revenue classes.
model1 = LogisticRegression(class_weight='balanced')
model1.fit(X_train, Y_train)
y_pred = model1.predict(X_test)

# Score the held-out predictions, then display the estimator's settings.
baseline_accuracy = metrics.accuracy_score(Y_test, y_pred)
baseline_recall = metrics.recall_score(Y_test, y_pred)
print("\nAccuracy\t{}".format(round(baseline_accuracy, 3)))
print("\nRecall\t{}".format(round(baseline_recall, 3)))
model1.get_params()


Find Best parameters using Grid Search

In [None]:
# Candidate values for the inverse regularization strength C.
parametres = {"C": [0.001, 0.008, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.1]}

# 10-fold cross-validated search on the training data, ranked by accuracy.
model1_gs = GridSearchCV(estimator=model1, param_grid=parametres,
                         scoring='accuracy', cv=10)
model1_gs.fit(X_train, Y_train)

best_cv_accuracy = round(model1_gs.best_score_, 3)
print(model1_gs.best_params_, "\nAcc: {}".format(best_cv_accuracy))

In [None]:
# Tabulate the cross-validation results and plot mean accuracy against C.
df_search = pd.DataFrame.from_dict(model1_gs.cv_results_)
tested_c = df_search['param_C']
mean_cv_accuracy = df_search['mean_test_score']

plt.xlabel('C')
plt.ylabel('Acc')
_ = plt.plot(tested_c, mean_cv_accuracy, 'x')

accuracy_summary = "\nAcc: {}".format(round(model1_gs.best_score_, 3))
print('Best_parameter', model1_gs.best_params_, accuracy_summary)

Find Best threshold

In [None]:
# Refit the logistic regression with the C value selected by the grid search.
best_c = model1_gs.best_params_.get("C")
model1 = LogisticRegression(class_weight='balanced', C=best_c)
model1.fit(X_train, Y_train)

# Class probabilities for the test rows, separated by the true label.
y_test_pred_prob = model1.predict_proba(X_test)
positive_rows = np.where(Y_test == 1)[0]
negative_rows = np.where(Y_test == 0)[0]
y_test_pred_prob_pos = y_test_pred_prob[positive_rows]
y_test_pred_prob_neg = y_test_pred_prob[negative_rows]

def representation_seuil(x_1, x_0, n_bins=11, title='This figure represents in blue the probabilities assigned by the model to data that are 1, and in red the probabilities assigned to the data that are 0', label_1='Clase 1', 
                          label_0='Clase 0', density=0):
    """Overlay the histograms of two sets of values on the current figure.

    Args:
        x_1: Values drawn in blue and labelled ``label_1``.
        x_0: Values drawn in red and labelled ``label_0``.
        n_bins: Bin specification passed to ``plt.hist`` for both series.
        title: Title of the figure.
        label_1: Legend entry for ``x_1``.
        label_0: Legend entry for ``x_0``.
        density: Forwarded to ``plt.hist``; a truthy value normalizes the
            histograms.
    """
    series = ((x_1, label_1, 'blue'), (x_0, label_0, 'red'))
    for values, legend_label, colour in series:
        # Half-transparent bars keep both distributions visible where they overlap.
        plt.hist(values, bins=n_bins, density=density, alpha=0.5,
                 label=legend_label, color=colour)
    plt.title(title)
    plt.legend(loc='best')
    
# Compare the column-1 probabilities given to actual positives and actual negatives.
scores_true_positive_class = y_test_pred_prob_pos[:, 1]
scores_true_negative_class = y_test_pred_prob_neg[:, 1]
representation_seuil(scores_true_positive_class, scores_true_negative_class,
                     n_bins=21, density=0)


> Sensitivity (recall) is the percentage of true positives among all the actual positive values. It is an important metric in our case, since we need to find which visitors have the intention to buy (Revenue=True).

In [None]:
# Turn the column-1 probabilities into 0/1 labels at the chosen cut-off.
THRESHOLD = 0.5
y_THRESHOLD = (y_test_pred_prob[:, 1] > THRESHOLD).astype(int)

# Compute every metric first, then report them in the usual order.
confusion = metrics.confusion_matrix(Y_test, y_THRESHOLD)
accuracy = round(metrics.accuracy_score(Y_test, y_THRESHOLD), 3)
sensitivity = round(metrics.recall_score(Y_test, y_THRESHOLD), 3)
precision = round(metrics.precision_score(Y_test, y_THRESHOLD), 3)

print("Matriz de confusión\n", confusion)
print("\nAccuracy\t{}".format(accuracy))
print("Sensitivity\t{}".format(sensitivity))
print("Precision\t{}".format(precision))

In [None]:
# Same evaluation with a stricter cut-off on the column-1 probabilities.
THRESHOLD = 0.6
above_cutoff = y_test_pred_prob[:, 1] > THRESHOLD
y_THRESHOLD = above_cutoff.astype(int)

confusion = metrics.confusion_matrix(Y_test, y_THRESHOLD)
accuracy = round(metrics.accuracy_score(Y_test, y_THRESHOLD), 3)
sensitivity = round(metrics.recall_score(Y_test, y_THRESHOLD), 3)
precision = round(metrics.precision_score(Y_test, y_THRESHOLD), 3)

print("Matrice de confusion\n", confusion)
print("\nAccuracy\t{}".format(accuracy))
print("Sensitivity\t{}".format(sensitivity))
print("Precision\t{}".format(precision))

In [None]:
# Intermediate cut-off between the two values tried above.
THRESHOLD = 0.55
y_THRESHOLD = (y_test_pred_prob[:, 1] > THRESHOLD).astype(int)

confusion = metrics.confusion_matrix(Y_test, y_THRESHOLD)
print("Matrice de confusion\n", confusion)

accuracy = metrics.accuracy_score(Y_test, y_THRESHOLD)
print("\nAccuracy\t{}".format(round(accuracy, 3)))

sensitivity = metrics.recall_score(Y_test, y_THRESHOLD)
print("Sensitivity\t{}".format(round(sensitivity, 3)))

precision = metrics.precision_score(Y_test, y_THRESHOLD)
print("Precision\t{}".format(round(precision, 3)))

> We will keep the threshold of 0.55, which gives a good trade-off between accuracy and recall

**Features importance**
> We want to observe which variables contribute the most to the model

In [None]:
# One coefficient per training feature of the fitted logistic regression.
importance = model1.coef_[0]

# Take the names from the training matrix itself. df also holds the 'Revenue'
# target, so indexing df.columns by position only lines up with the
# coefficients while the target is the last column and df still matches the
# data the model was trained on.
feature_names = list(X_train.columns)

important_features = []
# summarize feature importance
for name, v in zip(feature_names, importance):
    print('Feature:', name, 'Score:', v)
    # Keep the features whose coefficient magnitude exceeds 0.04.
    if abs(v) > 0.04:
        important_features.append(name)

# plot feature importance
plt.bar(range(len(importance)), importance)
plt.show()

important_features

 > We keep the features whose score is > 0.04 or < -0.04 to see if our model's performance will improve.

In [None]:
# Reload the raw data and rebuild the train/test sets using only the
# selected features plus the 'Revenue' target.
df = pd.read_csv("../input/online-shoppers-intention/online_shoppers_intention.csv")
columns = [*important_features, 'Revenue']
df_less_features = preprocessing(df, columns)
X_train, X_test, Y_train, Y_test = Split(df_less_features)

In [None]:
# Repeat the search over C on the current training set.
parametres = {"C": [0.001, 0.008, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.1]}
model1 = LogisticRegression(class_weight='balanced')

# 10-fold cross-validation ranked by accuracy.
model1_gs = GridSearchCV(estimator=model1, param_grid=parametres,
                         scoring='accuracy', cv=10)
model1_gs.fit(X_train, Y_train)

best_cv_accuracy = round(model1_gs.best_score_, 3)
print(model1_gs.best_params_, "\nAcc: {}".format(best_cv_accuracy))


In [None]:
# Refit with the best C found by the search above.
selected_c = model1_gs.best_params_.get("C")
model1 = LogisticRegression(class_weight='balanced', C=selected_c)
model1.fit(X_train, Y_train)

# Test-set class probabilities, grouped by the true label.
y_test_pred_prob = model1.predict_proba(X_test)
rows_labelled_1 = np.where(Y_test == 1)[0]
rows_labelled_0 = np.where(Y_test == 0)[0]
y_test_pred_prob_pos = y_test_pred_prob[rows_labelled_1]
y_test_pred_prob_neg = y_test_pred_prob[rows_labelled_0]

def representation_seuil(x_1, x_0, n_bins=11, title='', label_1='Classe 1', 
                          label_0='Classe 0', density=0):
    """Draw two overlapping histograms on the current figure.

    Args:
        x_1: Values plotted in blue under the legend entry ``label_1``.
        x_0: Values plotted in red under the legend entry ``label_0``.
        n_bins: Bin specification handed to ``plt.hist`` for both series.
        title: Figure title (empty by default).
        label_1: Legend text for ``x_1``.
        label_0: Legend text for ``x_0``.
        density: Forwarded to ``plt.hist``; a truthy value normalizes the
            histograms.
    """
    # Options shared by both histograms.
    shared_options = dict(bins=n_bins, density=density, alpha=0.5)
    plt.hist(x_1, label=label_1, color='blue', **shared_options)
    plt.hist(x_0, label=label_0, color='red', **shared_options)
    plt.title(title)
    plt.legend(loc='best')
    
# Histogram of the column-1 probabilities for true positives vs. true negatives.
scores_for_label_1 = y_test_pred_prob_pos[:, 1]
scores_for_label_0 = y_test_pred_prob_neg[:, 1]
representation_seuil(scores_for_label_1, scores_for_label_0, n_bins=21, density=0)



In [None]:
# Label a row 1 when its column-1 probability exceeds the cut-off.
THRESHOLD = 0.6
y_threshold = (y_test_pred_prob[:, 1] > THRESHOLD).astype(int)

confusion = metrics.confusion_matrix(Y_test, y_threshold)
accuracy = round(metrics.accuracy_score(Y_test, y_threshold), 3)
sensitivity = round(metrics.recall_score(Y_test, y_threshold), 3)
precision = round(metrics.precision_score(Y_test, y_threshold), 3)

print("Matriz de confusion\n", confusion)
print("\nAccuracy\t{}".format(accuracy))
print("Sensitivity\t{}".format(sensitivity))
print("Precision\t{}".format(precision))

In [None]:
# Same report at a slightly lower cut-off.
THRESHOLD = 0.55
exceeds_cutoff = y_test_pred_prob[:, 1] > THRESHOLD
y_threshold = exceeds_cutoff.astype(int)

confusion = metrics.confusion_matrix(Y_test, y_threshold)
accuracy = round(metrics.accuracy_score(Y_test, y_threshold), 3)
sensitivity = round(metrics.recall_score(Y_test, y_threshold), 3)
precision = round(metrics.precision_score(Y_test, y_threshold), 3)

print("Matriz de confusion\n", confusion)
print("\nAccuracy\t{}".format(accuracy))
print("Sensitivity\t{}".format(sensitivity))
print("Precision\t{}".format(precision))

In [None]:
# Report at the 0.5 cut-off.
THRESHOLD = 0.5
y_threshold = (y_test_pred_prob[:, 1] > THRESHOLD).astype(int)

confusion = metrics.confusion_matrix(Y_test, y_threshold)
print("Matriz de confusion\n", confusion)

accuracy = metrics.accuracy_score(Y_test, y_threshold)
print("\nAccuracy\t{}".format(round(accuracy, 3)))

sensitivity = metrics.recall_score(Y_test, y_threshold)
print("Sensitivity\t{}".format(round(sensitivity, 3)))

precision = metrics.precision_score(Y_test, y_threshold)
print("Precision\t{}".format(round(precision, 3)))

**Results**
* We will keep the threshold of 0.5, which gives us the best trade-off between accuracy and recall.
* We can observe that when we rerun the model with only the most important features, we get better results. The other variables therefore added noise to the previous model. 
* Our best logistic regression model thus achieves 0.89 accuracy and 0.70 recall.

# > > MODEL 2 RANDOM FOREST

In [None]:
# Untuned random forest used as the reference point for the grid search.
baseline_forest_settings = {'n_estimators': 30, 'max_depth': 10, 'random_state': 101}
model2 = RandomForestClassifier(**baseline_forest_settings)
model2.fit(X_train, Y_train)

# Evaluate on the held-out rows: per-class report, then the estimator's own score.
pred = model2.predict(X_test)
print('Results Random Forest with no optimization')
print(classification_report(Y_test, pred))
print(model2.score(X_test, Y_test))

Optimization of the Random Forest Classifier using GridSearch

In [None]:
 param_grid = {
    'n_estimators' : [60,100],
    'max_depth' : [10,15],
    'min_samples_leaf' : [2,4],
    'min_samples_split': [2,4]
}

gridsearch = GridSearchCV(estimator=model2,param_grid=param_grid,verbose = 1)
gridsearch.fit(X_train,Y_train)
gridsearch.best_params_


In [None]:
# Rebuild the forest with the hyper-parameters the grid search selected,
# rather than hand-copied values that can drift from the search
# (a hard-coded min_samples_leaf=3 is not even one of the searched candidates).
model2 = RandomForestClassifier(random_state=101, **gridsearch.best_params_)
model2.fit(X_train, Y_train)
pred = model2.predict(X_test)
print(classification_report(Y_test, pred))

from sklearn.metrics import accuracy_score
Acc = accuracy_score(Y_test, pred)
print('Accuracy', Acc)

**Results**
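* Our best random forest model thus achieves 0.91 accuracy and 0.62 recall.
* Logistic regression is therefore the better model for predicting shopping intentions: its accuracy is slightly lower (0.89 vs 0.91), but its recall, the metric that matters most here, is higher (0.70 vs 0.62).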