# Packages

In [67]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeClassifier, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# DATASET

## LOAD TRAIN

In [97]:
train =  pd.read_csv('MLUnige2023_subscriptions_train.csv', index_col='Id', na_values='na', header=0)

train = train.astype({'job':'category', 'marital':'category', 'education':'category', 'device':'category', 'outcome_old':'category', 'X1':'bool', 'X2':'bool', 'X3':'bool', 'X4':'float64', 'subscription':'category', 'day':'category', 'month':'category'})

train.day = pd.Categorical(train.day, ordered=True)

train.month = pd.Categorical(train.month, ordered=True)


## MISSING TRAIN

In [98]:
missing_values = train.isnull().mean()

print(missing_values)

age                 0.000000
job                 0.006814
marital             0.000000
education           0.042225
device              0.228999
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.000000
outcome_old         0.760054
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
subscription        0.000000
dtype: float64


## LOAD TEST

In [99]:
test = pd.read_csv('MLUnige2023_subscriptions_test.csv', index_col='Id', na_values='na', header=0)

test = test.astype({'job':'category', 'marital':'category', 'education':'category', 'device':'category', 'outcome_old':'category', 'X1':'bool', 'X2':'bool', 'X3':'bool', 'X4':'float64', 'day':'category', 'month':'category'})

test.day = pd.Categorical(test.day, ordered=True)

test.month = pd.Categorical(test.month, ordered=True)

test

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,outcome_old,X1,X2,X3,X4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,53,retired,married,university,,17,6,5.25,1,0,-1,,False,False,True,0.084570
1,61,manager,married,grad_school,smartphone,20,4,9.00,1,0,-1,,False,False,False,0.075227
2,51,industrial_worker,married,university,,4,7,9.65,1,0,-1,,False,False,False,0.075781
3,34,manager,married,grad_school,,28,5,13.45,2,0,-1,,False,False,True,0.070043
4,30,manager,married,grad_school,smartphone,4,5,16.15,2,0,-1,,False,False,False,0.171618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3832,47,industrial_worker,married,,,5,6,19.20,2,0,-1,,False,False,True,0.104216
3833,30,teacher,single,university,smartphone,3,3,3.65,1,0,-1,,True,False,False,0.077552
3834,55,teacher,married,university,,26,6,5.15,5,0,-1,,False,False,True,0.129156
3835,46,teacher,divorced,university,smartphone,13,2,5.55,1,0,-1,,False,False,False,0.093067


## MISSING TEST

In [100]:
missing_values = test.isnull().mean()

print(missing_values)

age                 0.000000
job                 0.005473
marital             0.000000
education           0.041699
device              0.234819
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.000000
outcome_old         0.765442
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
dtype: float64


## NA VALUES TRAIN

In [101]:
train['days_elapsed_old'] = train['days_elapsed_old'].replace(-1, np.nan)
train

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,outcome_old,X1,X2,X3,X4,subscription
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,28,freelance,married,grad_school,smartphone,4,2,26.80,3,4,196.0,failure,False,False,True,0.072803,1
1,48,industrial_worker,married,university,smartphone,30,4,13.05,1,1,79.0,success,False,False,False,0.075454,1
2,27,teacher,married,university,smartphone,14,7,8.10,3,0,,,False,True,True,0.068110,0
3,44,unemployed,divorced,university,smartphone,13,5,7.10,2,1,369.0,other,False,False,True,0.091942,0
4,29,manager,single,grad_school,smartphone,26,4,15.90,2,2,143.0,success,False,False,False,0.085922,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8947,54,industrial_worker,married,university,smartphone,16,7,7.30,1,0,,,True,False,False,0.072803,0
8948,43,industrial_worker,married,university,smartphone,4,2,37.75,2,0,,,False,False,True,0.081456,1
8949,27,manager,single,grad_school,,5,6,29.00,3,0,,,False,False,True,0.079186,1
8950,77,retired,divorced,grad_school,smartphone,14,4,7.55,1,0,,,False,False,False,0.115102,1


## MISSING TRAIN

In [102]:
missing_values = train.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.006814
marital             0.000000
education           0.042225
device              0.228999
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.759942
outcome_old         0.760054
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
subscription        0.000000
dtype: float64


## DROP DAY_ELAPSED_OLD AND OUTCOME_OLD TRAIN

In [103]:
train = train.drop(columns=['days_elapsed_old', 'outcome_old'])

## MISSING TRAIN

In [108]:
missing_values = train.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.006814
marital             0.000000
education           0.042225
device              0.228999
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
subscription        0.000000
dtype: float64


## NA VALUES TEST

In [104]:
test['days_elapsed_old'] = test['days_elapsed_old'].replace(-1, np.nan)

## MISSING TEST

In [105]:
missing_values = test.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.005473
marital             0.000000
education           0.041699
device              0.234819
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
days_elapsed_old    0.764921
outcome_old         0.765442
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
dtype: float64


## DROP DAYS_ELAPSED_OLD AND OUTCOME_OLD TEST


In [106]:
test = test.drop(columns=['days_elapsed_old', 'outcome_old'])

## MISSING TEST

In [87]:
missing_values = test.isnull().mean()
print(missing_values)

age                 0.000000
job                 0.005473
marital             0.000000
education           0.041699
device              0.234819
day                 0.000000
month               0.000000
time_spent          0.000000
banner_views        0.000000
banner_views_old    0.000000
X1                  0.000000
X2                  0.000000
X3                  0.000000
X4                  0.000000
dtype: float64


## CATEGORICAL VS NUMERICAL COLUMNS

In [107]:
categorical_columns = train.select_dtypes(include=["category","bool", "object"])
categorical_columns_names = list(categorical_columns.columns)
numerical_columns = train.select_dtypes(exclude=["category","bool", "object"])
numerical_columns_names = list(numerical_columns)

## DROPPING SUBSCRIPTION FOR PREDICTIONS

In [109]:
train_no_subscription = train.drop('subscription', axis=1)
train_only_subscription = train['subscription']

# TRAIN, TEST AND VALIDATION

In [120]:
valid_size = 0.4
train_size = 1-valid_size

# Split the data into training and test sets
X_train, X_valid, Y_train, Y_valid = train_test_split(train_no_subscription, train_only_subscription, test_size=valid_size)

In [125]:
print(type(X_train),type())

Unnamed: 0_level_0,age,job,marital,education,device,day,month,time_spent,banner_views,banner_views_old,X1,X2,X3,X4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8222,45,unemployed,married,grad_school,smartphone,20,11,12.70,1,0,True,False,True,0.105115
7890,24,student,single,university,smartphone,26,5,6.15,1,1,False,False,False,0.289588
3299,27,industrial_worker,single,high_school,smartphone,26,10,34.55,2,0,False,False,True,0.074910
3291,32,manager,single,grad_school,desktop,3,12,1.35,1,0,False,False,False,0.073566
7649,26,technology,single,university,,27,5,18.05,2,0,False,False,True,0.075173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,52,technology,married,university,smartphone,18,7,6.25,1,0,True,False,False,0.076281
446,35,freelance,married,grad_school,smartphone,29,1,5.50,1,0,True,False,True,0.089327
3216,27,student,single,grad_school,smartphone,21,10,40.75,1,4,False,False,False,0.083099
4651,57,housekeeper,married,grad_school,smartphone,28,10,9.85,1,1,False,False,False,0.093921


# SCALING

In [93]:
scaler = StandardScaler()
scaler.fit(train[numerical_columns_names])
train[numerical_columns_names] = pd.DataFrame(scaler.transform(train[numerical_columns_names]))
valid[numerical_columns_names] = pd.DataFrame(scaler.transform(valid[numerical_columns_names]))
test[numerical_columns_names] = pd.DataFrame(scaler.transform(test[numerical_columns_names]))

## HOT ENCODING


# PREDICTIVE MODELS

## KNN

In [94]:
# create a KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# fit the classifier to the training data
knn.fit(train, y_train)

# make predictions on the testing data
y_pred = knn.predict(train)

# calculate the accuracy of the model
accuracy = accuracy_score(y_train, y_pred)
print("Accuracy:", accuracy)

ValueError: Cannot cast object dtype to float64