In [24]:
# Run this code to make Jupyter print every
# printable statement and not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# To visualize the data
import matplotlib.pyplot as plt

# Generic libraries
import seaborn as sns
import pandas as pd
import numpy as np

# Regression models
import sklearn
import scipy
from scipy.stats import t
import sklearn
from sklearn.model_selection import train_test_split #split the data into training and test
from sklearn.linear_model import LinearRegression #linear regression
from sklearn.preprocessing import PolynomialFeatures #for polynomial regression
from sklearn.metrics import r2_score, mean_squared_error

# Polynomial regression

Import the datasets (for this step we need only the training set)

In [75]:
online_shoppers = pd.read_csv("data/online_shoppers_intention.csv")
training_set = pd.read_csv("data/training_set_online_shoppers_intention.csv")
test_set = pd.read_csv("data/test_set_online_shoppers_intention.csv")

We set all the categorical types as category $\rightarrow$ non so se sia davvero utile o no, magari lo teniamo solo per la heatmap, CONTROLLARE!!!
Non so se sia utile per il fatto che poi tutte le categoriche diventano dummy variables, quindi in realtà 

In [76]:
online_shoppers['Month']=online_shoppers['Month'].astype('category')
online_shoppers['OperatingSystems']=online_shoppers['OperatingSystems'].astype('category')
online_shoppers['Browser']=online_shoppers['Browser'].astype('category')
online_shoppers['Region']=online_shoppers['Region'].astype('category')
online_shoppers['TrafficType']=online_shoppers['TrafficType'].astype('category')
online_shoppers['VisitorType']=online_shoppers['VisitorType'].astype('category')
online_shoppers['Weekend']=online_shoppers['Weekend'].astype('category')

Here we group the categories with the lowest number of elements into 'others'. Since these categories doesn't have a significant number of elements we don't expect them to be significant. Giving too many importance to them may lead to overfitting problems (di questa cosa non sono sicura ed è anche scritta in un inglese che fa pena)

In [77]:
# VisitorType -> others removed
training_set = training_set[training_set['VisitorType']!='Other']

#VisitorType -> 1 = returning, 0 = new
training_set['VisitorType'] = np.where(training_set['VisitorType']=='Returning_Visitor',1,0)

# Weekend
training_set['Weekend'] = np.where(training_set['Weekend']=='False',1,0)

# Split categorical variables
months = pd.get_dummies(training_set.Month, prefix='Month')
regions = pd.get_dummies(training_set.Region, prefix='Region')

def cut_levels(x, threshold, new_value):
    x = x.copy()
    value_counts = x.value_counts()
    labels = value_counts.index[value_counts < threshold]
    x[np.in1d(x, labels)] = new_value
    return x

training_set['Browser'] = cut_levels(training_set['Browser'],100,'Others')
training_set['TrafficType'] = cut_levels(training_set['TrafficType'],100,'Others')
training_set['OperatingSystems'] = cut_levels(training_set['OperatingSystems'],100,'Others')

browser = pd.get_dummies(training_set.Browser, prefix='Browser')
traffic_type = pd.get_dummies(training_set.TrafficType, prefix='TrafficType')
operating_systems = pd.get_dummies(training_set.OperatingSystems, prefix='OperatingSystems')

Here we replace the categorical features with the dummies variables obtaining binary features

In [78]:
training_set = training_set.drop(['Unnamed: 0','Month','Region','Browser','TrafficType','OperatingSystems','VisitorType'], axis=1).join([months,regions,browser,traffic_type,operating_systems])
training_set.shape

(9182, 56)

In [79]:
# split the training set to separate the rows with missing values
mask = training_set['ExitRates'].isna()
training_set_missing = training_set[mask]
#training_set_missing.head()
training_set_no_missing = training_set[mask==False]
#training_set_no_missing.head()

In [80]:
X_all_features = list(np.delete(training_set.columns.values, [np.where(training_set.columns.values=='ExitRates'),np.where(training_set.columns.values=='Revenue')], axis=None))
X = training_set_no_missing[X_all_features].to_numpy()
y = training_set_no_missing['ExitRates'].to_numpy()

Now we split the training data to obtain a train and a test set to train the regression model and to compute the performance scores.

In [81]:
#splitting data
test_size = 0.3
test_seed = 40
# Split X and y into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=test_seed)
print("X_train.shape ", X_train.shape)
print("X_test.shape ", X_test.shape)

X_train.shape  (4492, 54)
X_test.shape  (1926, 54)


In [82]:
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(X, y,
                                                                test_size=test_size, 
                                                                random_state=test_seed)

Now we fit different polynomial regression models

In [90]:
def polynomial_regression(deg):
    # transform each feature in polynomial feature
    poly = PolynomialFeatures(degree=deg, include_bias=False)
    poly = poly.fit(X_train_small)

    # Applies the transformation
    train_poly = poly.transform(X_train_small)
    test_poly = poly.transform(X_test_small)

    print("X_train_small.shape", X_train_small.shape, " X_poly_train.shape", train_poly.shape)
    print("X_test_small.shape", X_test_small.shape, " X_poly_test.shape", test_poly.shape)

    model = LinearRegression(fit_intercept=True)
    model = model.fit(train_poly, y_train_small)
    y_predict = model.predict(test_poly)
    return model, y_predict

In [92]:
models = []
y_predict = []
for i in range(1,3):
    mod, pred = polynomial_regression(i)
    models.append(mod)
    y_predict.append(pred)

X_train_small.shape (4492, 54)  X_poly_train.shape (4492, 54)
X_test_small.shape (1926, 54)  X_poly_test.shape (1926, 54)
X_train_small.shape (4492, 54)  X_poly_train.shape (4492, 1539)
X_test_small.shape (1926, 54)  X_poly_test.shape (1926, 1539)


Model evaluations: forse queste metriche non hanno senso per confrontare i nostri modelli visto che hanno un numero di variabili differenti, ma sono comunque utili per capire se i nostri modelli sono buoni o no. Esempio R2 negativo indica che il nostro modello non va bene.

In [94]:
for prediction in y_predict:
    print("R2 score ",r2_score(y_test_small, prediction))
    print("MSE score ",mean_squared_error(y_test_small, prediction))

R2 score  0.8742184860581035
MSE score  0.0003103481929149338
R2 score  0.7963455429548729
MSE score  0.0005024887262227065
