### Import required libraries

In [15]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

### Import datasets

In [16]:
# Load the training split and the validation split; the validation file is
# kept under the name `test` for the rest of the notebook.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("phone_train.csv", "phone_validation.csv")
)

In [17]:
# Print the column index, non-null count and dtype of every training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 99 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tariff.plan          10000 non-null  int64  
 1   payment.method       10000 non-null  object 
 2   sex                  10000 non-null  object 
 3   age                  10000 non-null  float64
 4   activation.zone      10000 non-null  int64  
 5   activation.channel   10000 non-null  int64  
 6   vas1                 10000 non-null  object 
 7   vas2                 10000 non-null  object 
 8   q01.out.ch.peak      10000 non-null  int64  
 9   q01.out.dur.peak     10000 non-null  int64  
 10  q01.out.val.peak     10000 non-null  float64
 11  q01.out.ch.offpeak   10000 non-null  int64  
 12  q01.out.dur.offpeak  10000 non-null  int64  
 13  q01.out.val.offpeak  10000 non-null  float64
 14  q01.in.ch.tot        10000 non-null  int64  
 15  q01.in.dur.tot       10000 non-null  

In [18]:
# Boolean mask with the same shape as the training frame: True marks a
# missing value. The rendered frame is truncated by the notebook, so
# aggregate it (e.g. is_null.sum()) to get per-column counts.
is_null = pd.isna(train)
is_null

Unnamed: 0,tariff.plan,payment.method,sex,age,activation.zone,activation.channel,vas1,vas2,q01.out.ch.peak,q01.out.dur.peak,...,q09.out.dur.peak,q09.out.val.peak,q09.out.ch.offpeak,q09.out.dur.offpeak,q09.out.val.offpeak,q09.in.ch.tot,q09.in.dur.tot,q09.ch.sms,q09.ch.cc,y
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# Descriptive statistics of the training frame (pandas' default column selection).
summary_stats = train.describe()

# The same statistics for the activation.zone column alone, printed as text.
activation_zone_summary = train.loc[:, 'activation.zone'].describe()
print(activation_zone_summary)

count    10000.000000
mean         2.071600
std          0.980495
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          4.000000
Name: activation.zone, dtype: float64


In [20]:
# Count, mean, spread and quartiles of the activation.channel column.
activation_channel_summary = train.loc[:, 'activation.channel'].describe()
print(activation_channel_summary)

count    10000.000000
mean         5.601000
std          1.585576
min          2.000000
25%          5.000000
50%          5.000000
75%          5.000000
max          9.000000
Name: activation.channel, dtype: float64


In [21]:
# dtype of every column in the training frame
column_types = train.dtypes

# Columns stored with dtype 'object' are treated as the categorical variables.
categorical_features = [
    column for column, dtype in column_types.items() if dtype == 'object'
]

# Disabled: also treating the integer-coded activation.zone as categorical.
#categorical_numeric_features = ["activation.zone"]
#categorical_features += categorical_numeric_features

# Print each categorical column name followed by the frequency of its levels.
for feature in categorical_features:
    print(feature, train[feature].value_counts(), sep="\n")

payment.method
payment.method
Carta di Credito           4779
Domiciliazione Bancaria    3635
Bollettino Postale         1586
Name: count, dtype: int64
sex
sex
B    5266
M    3704
F    1030
Name: count, dtype: int64
vas1
vas1
N    7509
Y    2491
Name: count, dtype: int64
vas2
vas2
N    9362
Y     638
Name: count, dtype: int64


In [22]:
def rm_feature(X, omitted_features):
    """Return a copy of ``X`` without the columns named in ``omitted_features``.

    ``X`` itself is left untouched; a missing column name raises ``KeyError``
    (the default behaviour of ``DataFrame.drop``).
    """
    return X.drop(columns=omitted_features)

In [23]:
def one_hot_encoding(X, columns=None):
    """One-hot encode the categorical columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to encode. It is not modified.
    columns : list of str, optional
        Columns to replace by indicator columns. Defaults to the
        notebook-level ``categorical_features`` list.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns of ``X`` in their original order, followed by
        one 0/1 integer column per category level (named ``<column>_<level>``).
        Every column label appears exactly once.

    Notes
    -----
    The levels are taken from the frame being encoded, so encoding two frames
    separately can yield different dummy columns when a level is missing from
    one of them; align them (e.g. ``reindex(columns=..., fill_value=0)``)
    before fitting a model.
    """
    if columns is None:
        columns = categorical_features

    # Open design question: activation.zone and activation.channel are integer
    # codes and are left numeric here. Per the original note, the test set has
    # rows with activation.zone = 0 while the training minimum is 1, so
    # encoding them separately would give the two sets different columns.

    # get_dummies already returns the untouched columns together with the new
    # indicator columns. Concatenating that result with X.drop(columns=...)
    # would duplicate every non-categorical column (X['y'] would then be a
    # two-column DataFrame instead of a Series).
    return pd.get_dummies(X, columns=columns, dtype=int)

In [24]:
# Encode the training frame and the validation ("test") frame with the same helper.
X_train = one_hot_encoding(train)
X_test = one_hot_encoding(test)

# Target: the 'y' column taken from the encoded training frame.
y = X_train['y']