# Packages

In [108]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeClassifier, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.compose import ColumnTransformer 


# DATASET

## LOAD TRAIN

In [109]:
train =  pd.read_csv('MLUnige2023_subscriptions_train.csv', index_col='Id', na_values='na', header=0)

train = train.astype({'job':'category', 'marital':'category', 'education':'category', 'device':'category', 'outcome_old':'category', 'X1':'bool', 'X2':'bool', 'X3':'bool', 'X4':'float64', 'subscription':'category', 'day':'category', 'month':'category'})

train.day = pd.Categorical(train.day, ordered=True)

train.month = pd.Categorical(train.month, ordered=True)

train


Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,outcome_old,X1,X2,X3,X4,subscription
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,28,freelance,married,grad_school,smartphone,4,2,26.80,3,4,196,failure,False,False,True,0.072803,1
1,48,industrial_worker,married,university,smartphone,30,4,13.05,1,1,79,success,False,False,False,0.075454,1
2,27,teacher,married,university,smartphone,14,7,8.10,3,0,-1,,False,True,True,0.068110,0
3,44,unemployed,divorced,university,smartphone,13,5,7.10,2,1,369,other,False,False,True,0.091942,0
4,29,manager,single,grad_school,smartphone,26,4,15.90,2,2,143,success,False,False,False,0.085922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8947,54,industrial_worker,married,university,smartphone,16,7,7.30,1,0,-1,,True,False,False,0.072803,0
8948,43,industrial_worker,married,university,smartphone,4,2,37.75,2,0,-1,,False,False,True,0.081456,1
8949,27,manager,single,grad_school,,5,6,29.00,3,0,-1,,False,False,True,0.079186,1
8950,77,retired,divorced,grad_school,smartphone,14,4,7.55,1,0,-1,,False,False,False,0.115102,1


In [110]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8952 entries, 0 to 8951
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   age               8952 non-null   int64   
 1   job               8891 non-null   category
 2   marital           8952 non-null   category
 3   education         8574 non-null   category
 4   device            6902 non-null   category
 5   day               8952 non-null   category
 6   month             8952 non-null   category
 7   time_spent        8952 non-null   float64 
 8   banner_views      8952 non-null   int64   
 9   banner_views_old  8952 non-null   int64   
 10  days_elapsed_old  8952 non-null   int64   
 11  outcome_old       2148 non-null   category
 12  X1                8952 non-null   bool    
 13  X2                8952 non-null   bool    
 14  X3                8952 non-null   bool    
 15  X4                8952 non-null   float64 
 16  subscription      8952 n

## MISSING TRAIN

In [111]:
missing_values = train.isnull().mean()

print(missing_values)

age                 0.000000
job                 0.006814
marital             0.000000
education           0.042225
device              0.228999
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.000000
outcome_old         0.760054
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
subscription        0.000000
dtype: float64


## LOAD TEST

In [112]:
test = pd.read_csv('MLUnige2023_subscriptions_test.csv', index_col='Id', na_values='na', header=0)

test = test.astype({'job':'category', 'marital':'category', 'education':'category', 'device':'category', 'outcome_old':'category', 'X1':'bool', 'X2':'bool', 'X3':'bool', 'X4':'float64', 'day':'category', 'month':'category'})

test.day = pd.Categorical(test.day, ordered=True)

test.month = pd.Categorical(test.month, ordered=True)

test

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,outcome_old,X1,X2,X3,X4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,53,retired,married,university,,17,6,5.25,1,0,-1,,False,False,True,0.084570
1,61,manager,married,grad_school,smartphone,20,4,9.00,1,0,-1,,False,False,False,0.075227
2,51,industrial_worker,married,university,,4,7,9.65,1,0,-1,,False,False,False,0.075781
3,34,manager,married,grad_school,,28,5,13.45,2,0,-1,,False,False,True,0.070043
4,30,manager,married,grad_school,smartphone,4,5,16.15,2,0,-1,,False,False,False,0.171618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3832,47,industrial_worker,married,,,5,6,19.20,2,0,-1,,False,False,True,0.104216
3833,30,teacher,single,university,smartphone,3,3,3.65,1,0,-1,,True,False,False,0.077552
3834,55,teacher,married,university,,26,6,5.15,5,0,-1,,False,False,True,0.129156
3835,46,teacher,divorced,university,smartphone,13,2,5.55,1,0,-1,,False,False,False,0.093067


In [113]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3837 entries, 0 to 3836
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   age               3837 non-null   int64   
 1   job               3816 non-null   category
 2   marital           3837 non-null   category
 3   education         3677 non-null   category
 4   device            2936 non-null   category
 5   day               3837 non-null   category
 6   month             3837 non-null   category
 7   time_spent        3837 non-null   float64 
 8   banner_views      3837 non-null   int64   
 9   banner_views_old  3837 non-null   int64   
 10  days_elapsed_old  3837 non-null   int64   
 11  outcome_old       900 non-null    category
 12  X1                3837 non-null   bool    
 13  X2                3837 non-null   bool    
 14  X3                3837 non-null   bool    
 15  X4                3837 non-null   float64 
dtypes: bool(3), category(7),

## MISSING TEST

In [114]:
missing_values = test.isnull().mean()

print(missing_values)

age                 0.000000
job                 0.005473
marital             0.000000
education           0.041699
device              0.234819
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.000000
outcome_old         0.765442
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
dtype: float64


## NA VALUES TRAIN

In [115]:
train['days_elapsed_old'] = train['days_elapsed_old'].replace(-1, np.nan)
train

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,outcome_old,X1,X2,X3,X4,subscription
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,28,freelance,married,grad_school,smartphone,4,2,26.80,3,4,196.0,failure,False,False,True,0.072803,1
1,48,industrial_worker,married,university,smartphone,30,4,13.05,1,1,79.0,success,False,False,False,0.075454,1
2,27,teacher,married,university,smartphone,14,7,8.10,3,0,,,False,True,True,0.068110,0
3,44,unemployed,divorced,university,smartphone,13,5,7.10,2,1,369.0,other,False,False,True,0.091942,0
4,29,manager,single,grad_school,smartphone,26,4,15.90,2,2,143.0,success,False,False,False,0.085922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8947,54,industrial_worker,married,university,smartphone,16,7,7.30,1,0,,,True,False,False,0.072803,0
8948,43,industrial_worker,married,university,smartphone,4,2,37.75,2,0,,,False,False,True,0.081456,1
8949,27,manager,single,grad_school,,5,6,29.00,3,0,,,False,False,True,0.079186,1
8950,77,retired,divorced,grad_school,smartphone,14,4,7.55,1,0,,,False,False,False,0.115102,1


In [116]:
train['X1'] = train['X1'].astype(int)
train['X2'] = train['X2'].astype(int)
train['X3'] = train['X3'].astype(int)

train = train.astype({'job':'category', 'marital':'category', 'education':'category', 'device':'category', 'X1':'category', 'X2':'category', 'X3':'category', 'X4':'float64', 'subscription':'category', 'day':'category', 'month':'category'})

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8952 entries, 0 to 8951
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   age               8952 non-null   int64   
 1   job               8891 non-null   category
 2   marital           8952 non-null   category
 3   education         8574 non-null   category
 4   device            6902 non-null   category
 5   day               8952 non-null   category
 6   month             8952 non-null   category
 7   time_spent        8952 non-null   float64 
 8   banner_views      8952 non-null   int64   
 9   banner_views_old  8952 non-null   int64   
 10  days_elapsed_old  2149 non-null   float64 
 11  outcome_old       2148 non-null   category
 12  X1                8952 non-null   category
 13  X2                8952 non-null   category
 14  X3                8952 non-null   category
 15  X4                8952 non-null   float64 
 16  subscription      8952 n

In [117]:
train

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,outcome_old,X1,X2,X3,X4,subscription
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,28,freelance,married,grad_school,smartphone,4,2,26.80,3,4,196.0,failure,0,0,1,0.072803,1
1,48,industrial_worker,married,university,smartphone,30,4,13.05,1,1,79.0,success,0,0,0,0.075454,1
2,27,teacher,married,university,smartphone,14,7,8.10,3,0,,,0,1,1,0.068110,0
3,44,unemployed,divorced,university,smartphone,13,5,7.10,2,1,369.0,other,0,0,1,0.091942,0
4,29,manager,single,grad_school,smartphone,26,4,15.90,2,2,143.0,success,0,0,0,0.085922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8947,54,industrial_worker,married,university,smartphone,16,7,7.30,1,0,,,1,0,0,0.072803,0
8948,43,industrial_worker,married,university,smartphone,4,2,37.75,2,0,,,0,0,1,0.081456,1
8949,27,manager,single,grad_school,,5,6,29.00,3,0,,,0,0,1,0.079186,1
8950,77,retired,divorced,grad_school,smartphone,14,4,7.55,1,0,,,0,0,0,0.115102,1


## MISSING TRAIN

In [118]:
missing_values = train.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.006814
marital             0.000000
education           0.042225
device              0.228999
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.759942
outcome_old         0.760054
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
subscription        0.000000
dtype: float64


## DROP DAY_ELAPSED_OLD AND OUTCOME_OLD TRAIN

In [119]:
train = train.drop(columns=['days_elapsed_old', 'outcome_old'])

## MISSING TRAIN

In [120]:
missing_values = train.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.006814
marital             0.000000
education           0.042225
device              0.228999
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
subscription        0.000000
dtype: float64


## NA VALUES TEST

In [121]:
test['days_elapsed_old'] = test['days_elapsed_old'].replace(-1, np.nan)

In [123]:
test['X1'] = test['X1'].astype(int)
test['X2'] = test['X2'].astype(int)
test['X3'] = test['X3'].astype(int)

test = test.astype({'job':'category', 'marital':'category', 'education':'category', 'device':'category', 'X1':'category', 'X2':'category', 'X3':'category', 'X4':'float64', 'day':'category', 'month':'category'})

test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3837 entries, 0 to 3836
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   age               3837 non-null   int64   
 1   job               3816 non-null   category
 2   marital           3837 non-null   category
 3   education         3677 non-null   category
 4   device            2936 non-null   category
 5   day               3837 non-null   category
 6   month             3837 non-null   category
 7   time_spent        3837 non-null   float64 
 8   banner_views      3837 non-null   int64   
 9   banner_views_old  3837 non-null   int64   
 10  days_elapsed_old  902 non-null    float64 
 11  outcome_old       900 non-null    category
 12  X1                3837 non-null   category
 13  X2                3837 non-null   category
 14  X3                3837 non-null   category
 15  X4                3837 non-null   float64 
dtypes: category(10), float64

In [124]:
test

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,outcome_old,X1,X2,X3,X4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,53,retired,married,university,,17,6,5.25,1,0,,,0,0,1,0.084570
1,61,manager,married,grad_school,smartphone,20,4,9.00,1,0,,,0,0,0,0.075227
2,51,industrial_worker,married,university,,4,7,9.65,1,0,,,0,0,0,0.075781
3,34,manager,married,grad_school,,28,5,13.45,2,0,,,0,0,1,0.070043
4,30,manager,married,grad_school,smartphone,4,5,16.15,2,0,,,0,0,0,0.171618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3832,47,industrial_worker,married,,,5,6,19.20,2,0,,,0,0,1,0.104216
3833,30,teacher,single,university,smartphone,3,3,3.65,1,0,,,1,0,0,0.077552
3834,55,teacher,married,university,,26,6,5.15,5,0,,,0,0,1,0.129156
3835,46,teacher,divorced,university,smartphone,13,2,5.55,1,0,,,0,0,0,0.093067


## MISSING TEST

In [125]:
missing_values = test.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.005473
marital             0.000000
education           0.041699
device              0.234819
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.764921
outcome_old         0.765442
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
dtype: float64


## DROP DAYS_ELAPSED_OLD AND OUTCOME_OLD TEST


In [126]:
test = test.drop(columns=['days_elapsed_old', 'outcome_old'])

## MISSING TEST

In [127]:
missing_values = test.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.005473
marital             0.000000
education           0.041699
device              0.234819
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
dtype: float64


## CATEGORICAL VS NUMERICAL COLUMNS

In [128]:
categorical_columns = train.select_dtypes(include=["category","bool", "object"])
categorical_columns_names = list(categorical_columns.columns)
numerical_columns = train.select_dtypes(exclude=["category","bool", "object"])
numerical_columns_names = list(numerical_columns)

print("Numerical:",numerical_columns_names, "\nCategory:",categorical_columns_names)

Numerical: ['age', 'time_spent', 'banner_views', 'banner_views_old', 'X4'] 
Category: ['job', 'marital', 'education', 'device', 'day', 'month', 'X1', 'X2', 'X3', 'subscription']


## REMOVING NA

In [129]:
train = train.dropna()
test = test.dropna()

In [130]:
train

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,X1,X2,X3,X4,subscription
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,28,freelance,married,grad_school,smartphone,4,2,26.80,3,4,0,0,1,0.072803,1
1,48,industrial_worker,married,university,smartphone,30,4,13.05,1,1,0,0,0,0.075454,1
2,27,teacher,married,university,smartphone,14,7,8.10,3,0,0,1,1,0.068110,0
3,44,unemployed,divorced,university,smartphone,13,5,7.10,2,1,0,0,1,0.091942,0
4,29,manager,single,grad_school,smartphone,26,4,15.90,2,2,0,0,0,0.085922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8946,32,unemployed,single,university,smartphone,2,8,23.80,1,2,0,0,0,0.148194,0
8947,54,industrial_worker,married,university,smartphone,16,7,7.30,1,0,1,0,0,0.072803,0
8948,43,industrial_worker,married,university,smartphone,4,2,37.75,2,0,0,0,1,0.081456,1
8950,77,retired,divorced,grad_school,smartphone,14,4,7.55,1,0,0,0,0,0.115102,1


In [131]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6607 entries, 0 to 8951
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   age               6607 non-null   int64   
 1   job               6607 non-null   category
 2   marital           6607 non-null   category
 3   education         6607 non-null   category
 4   device            6607 non-null   category
 5   day               6607 non-null   category
 6   month             6607 non-null   category
 7   time_spent        6607 non-null   float64 
 8   banner_views      6607 non-null   int64   
 9   banner_views_old  6607 non-null   int64   
 10  X1                6607 non-null   category
 11  X2                6607 non-null   category
 12  X3                6607 non-null   category
 13  X4                6607 non-null   float64 
 14  subscription      6607 non-null   category
dtypes: category(10), float64(2), int64(3)
memory usage: 377.1 KB


In [132]:
missing_values = train.isnull().mean()
print(missing_values)

age                 0.0
job                 0.0
marital             0.0
education           0.0
device              0.0
day                 0.0
month               0.0
time_spent          0.0
banner_views        0.0
banner_views_old    0.0
X1                  0.0
X2                  0.0
X3                  0.0
X4                  0.0
subscription        0.0
dtype: float64


In [133]:
test

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,X1,X2,X3,X4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,61,manager,married,grad_school,smartphone,20,4,9.00,1,0,0,0,0,0.075227
4,30,manager,married,grad_school,smartphone,4,5,16.15,2,0,0,0,0,0.171618
5,25,technology,single,grad_school,desktop,9,6,10.55,7,0,0,0,0,0.105760
6,32,freelance,married,university,smartphone,9,3,10.35,2,0,0,0,0,0.076517
7,41,unemployed,married,grad_school,smartphone,30,10,18.15,3,0,0,0,0,0.072803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3830,36,technology,single,university,smartphone,22,8,14.95,3,0,0,0,0,0.078832
3831,46,technology,married,grad_school,smartphone,20,11,26.05,1,1,0,0,0,0.072803
3833,30,teacher,single,university,smartphone,3,3,3.65,1,0,1,0,0,0.077552
3835,46,teacher,divorced,university,smartphone,13,2,5.55,1,0,0,0,0,0.093067


In [134]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2823 entries, 1 to 3836
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   age               2823 non-null   int64   
 1   job               2823 non-null   category
 2   marital           2823 non-null   category
 3   education         2823 non-null   category
 4   device            2823 non-null   category
 5   day               2823 non-null   category
 6   month             2823 non-null   category
 7   time_spent        2823 non-null   float64 
 8   banner_views      2823 non-null   int64   
 9   banner_views_old  2823 non-null   int64   
 10  X1                2823 non-null   category
 11  X2                2823 non-null   category
 12  X3                2823 non-null   category
 13  X4                2823 non-null   float64 
dtypes: category(9), float64(2), int64(3)
memory usage: 159.9 KB


In [135]:
missing_values = test.isnull().mean()
print(missing_values)

age                 0.0
job                 0.0
marital             0.0
education           0.0
device              0.0
day                 0.0
month               0.0
time_spent          0.0
banner_views        0.0
banner_views_old    0.0
X1                  0.0
X2                  0.0
X3                  0.0
X4                  0.0
dtype: float64


# SCALING

In [69]:

transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(), [0, 1])])
# transform training data
train_X = transformer.fit_transform(train_X)

Unnamed: 0,age,time_spent,banner_views,banner_views_old,X4
0,-1.099928,0.572138,0.203390,1.276111,-0.450964
1,0.555193,-0.254284,-0.571357,0.006597,-0.368693
2,-1.182684,-0.551796,0.203390,-0.416574,-0.596628
3,0.224169,-0.611899,-0.183983,0.006597,0.142962
4,-1.017172,-0.082989,-0.183983,0.429768,-0.043837
...,...,...,...,...,...
6602,-0.768904,0.391828,-0.571357,0.429768,1.888677
6603,1.051730,-0.599878,-0.571357,-0.416574,-0.450964
6604,0.141413,1.230271,-0.183983,-0.416574,-0.182457
6605,2.955119,-0.584853,-0.571357,-0.416574,0.861703


In [70]:
train.select_dtypes(include=['category','bool'])

Unnamed: 0_level_0,job,marital,education,device,day,month,X1,X2,X3,subscription
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,freelance,married,grad_school,smartphone,4,2,False,False,True,1
1,industrial_worker,married,university,smartphone,30,4,False,False,False,1
2,teacher,married,university,smartphone,14,7,False,True,True,0
3,unemployed,divorced,university,smartphone,13,5,False,False,True,0
4,manager,single,grad_school,smartphone,26,4,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...
8946,unemployed,single,university,smartphone,2,8,False,False,False,0
8947,industrial_worker,married,university,smartphone,16,7,True,False,False,0
8948,industrial_worker,married,university,smartphone,4,2,False,False,True,1
8950,retired,divorced,grad_school,smartphone,14,4,False,False,False,1


In [57]:
missing_values = train_scaled.isnull().mean()
print(missing_values)

age                 0.209311
time_spent          0.209311
banner_views        0.209311
banner_views_old    0.209311
X4                  0.209311
job                 0.209311
marital             0.209311
education           0.209311
device              0.209311
day                 0.209311
month               0.209311
X1                  0.209311
X2                  0.209311
X3                  0.209311
subscription        0.209311
dtype: float64


## HOT-ENCODING

In [195]:
train_hot = pd.get_dummies(train, columns=['job','marital','education','device','day','month'])

In [196]:
train_hot

Unnamed: 0_level_0,age,time_spent,banner_views,banner_views_old,X1,X2,X3,X4,subscription,job_entrepreneur,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.109109,0.577404,0.190902,1.245632,False,False,True,-0.492444,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0.548473,-0.252950,-0.560334,0.009500,False,False,False,-0.401299,1,0,...,0,1,0,0,0,0,0,0,0,0
2,-1.191988,-0.551877,0.190902,-0.402545,False,True,True,-0.653820,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.216956,-0.612266,-0.184716,0.009500,False,False,True,0.165546,0,0,...,0,0,1,0,0,0,0,0,0,0
4,-1.026230,-0.080840,-0.184716,0.421544,False,False,False,-0.041402,1,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8946,,,,,False,False,False,,0,0,...,0,0,0,0,0,1,0,0,0,0
8947,,,,,True,False,False,,0,0,...,0,0,0,0,1,0,0,0,0,0
8948,,,,,False,False,True,,1,0,...,0,0,0,0,0,0,0,0,0,0
8950,,,,,False,False,False,,1,0,...,0,1,0,0,0,0,0,0,0,0


## DROPPING SUBSCRIPTION FOR PREDICTIONS

In [142]:
train_no_subscription = train.drop('subscription', axis=1)
train_only_subscription = train['subscription']

# TRAIN, TEST AND VALIDATION

In [143]:
valid_size = 0.4
train_size = 1-valid_size

# Split the data into training and test sets
X_train, X_valid, Y_train, Y_valid = train_test_split(train_no_subscription, train_only_subscription, test_size=valid_size)

In [144]:
Cond1 = X_train.shape[0] == Y_train.shape[0]

Cond2 = X_valid.shape[0] == Y_valid.shape[0]

print("\nSame Size ?", "-> ","For Training:",Cond1,", For Validation:", Cond2)

PropTV = Y_valid.shape[0]/Y_train.shape[0]*100

print("\nProportions between Training and Validation:", round(PropTV,2),"%")


Same Size ? ->  For Training: True , For Validation: True

Proportions between Training and Validation: 66.71 %


# PREDICTIVE MODELS

## DECISION TREES

In [145]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,Y_train)

#Predict the response for test dataset
TREES_PRED = clf.predict(X_valid)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(Y_valid, TREES_PRED))

ValueError: Cannot cast object dtype to float32