In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_columns = None
pd.options.display.max_rows = 150

In [3]:
data_path_test = ("test.csv")
data_path_train = ("train.csv")

In [4]:
df_test=pd.read_csv(data_path_test)
df_train=pd.read_csv(data_path_train)
arr_df = [df_test,df_train]

## Visualizzo informazioni sul dataset

In [5]:
df_test.isnull().sum()

Unnamed: 0        0
No-show           0
PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
dtype: int64

In [6]:
df_test.isna().sum()

Unnamed: 0        0
No-show           0
PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
dtype: int64

In [7]:
for col in df_test.columns:
    print(col,df_test[col].value_counts(normalize=True),"\n")

Unnamed: 0 2047     0.000045
66309    0.000045
39431    0.000045
29435    0.000045
19196    0.000045
           ...   
85379    0.000045
55093    0.000045
60378    0.000045
74300    0.000045
2049     0.000045
Name: Unnamed: 0, Length: 22106, dtype: float64 

No-show No     0.801954
Yes    0.198046
Name: No-show, dtype: float64 

PatientId 6.264199e+12    0.000859
9.963767e+10    0.000769
8.221459e+14    0.000724
8.722785e+11    0.000724
2.688613e+13    0.000633
                  ...   
4.233367e+11    0.000045
6.464137e+12    0.000045
4.477584e+14    0.000045
3.197248e+13    0.000045
7.314338e+13    0.000045
Name: PatientId, Length: 18734, dtype: float64 

AppointmentID 5638143    0.000045
5636933    0.000045
5678083    0.000045
5708598    0.000045
5641015    0.000045
             ...   
5692849    0.000045
5715835    0.000045
5623219    0.000045
5672310    0.000045
5670913    0.000045
Name: AppointmentID, Length: 22106, dtype: float64 

Gender F    0.655297
M    0.344703
Name: Gender,

In [8]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22106 entries, 0 to 22105
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      22106 non-null  int64  
 1   No-show         22106 non-null  object 
 2   PatientId       22106 non-null  float64
 3   AppointmentID   22106 non-null  int64  
 4   Gender          22106 non-null  object 
 5   ScheduledDay    22106 non-null  object 
 6   AppointmentDay  22106 non-null  object 
 7   Age             22106 non-null  int64  
 8   Neighbourhood   22106 non-null  object 
 9   Scholarship     22106 non-null  int64  
 10  Hipertension    22106 non-null  int64  
 11  Diabetes        22106 non-null  int64  
 12  Alcoholism      22106 non-null  int64  
 13  Handcap         22106 non-null  int64  
 14  SMS_received    22106 non-null  int64  
dtypes: float64(1), int64(9), object(5)
memory usage: 2.5+ MB


## Eseguo il drop delle colonne non significative

In [9]:
for df in arr_df:
    df.drop("Unnamed: 0",axis=1,inplace=True)
    df.drop("AppointmentID",axis=1,inplace=True)
    

### Manipolazione Dataset

In [10]:
def manipulate(df):
    
    #Trasformo qui la colonna No-show in binario perche mi serve per aggregare
    df['No-show'] = df["No-show"].apply(lambda x: 0 if x == "No" else 1)
    
    
    #Trasformo le stringhe in date senza time
    df["AppointmentDay"]=pd.to_datetime(df["AppointmentDay"]).dt.date
    df["ScheduledDay"]=pd.to_datetime(df["ScheduledDay"]).dt.date
    
    #Aggiungo feature Interval (differenza tra AppointmentDay e ScheduledDay)
    df["Interval"] = (df["AppointmentDay"]- df["ScheduledDay"])
    
    #Elimino le righe con prenotazione successiva alla data dell'esame... (Interval<0)
    df.drop(df[df["Interval"]  < pd.Timedelta(0, unit='d')].index, inplace=True)
    
    #Numero di appuntamenti presenti del paziente
    df['SommaAppuntamenti'] = df.sort_values(by = ['PatientId','ScheduledDay']).groupby(['PatientId']).cumcount()
    
    #Trovo il giorno della settimana della prenotazione e dell'appuntamento
    df['ScheduledWeekday'] = df.apply(lambda x: x.ScheduledDay.isoweekday(), axis = 1)
    df['AppointmentWeekday'] = df.apply(lambda x: x.AppointmentDay.isoweekday(), axis = 1)
    
    #Se ha un handicap o meno
    df['HasHandicap'] = df['Handcap'].apply(lambda x: 0 if x == 0 else 1)
    
    #Se ha una malattia tra ipertensione, diabete e alcolismo
    df['isWeak'] = df.apply(lambda x: ((x.Hipertension == 1 )| x.Diabetes == 1 | x.Alcoholism == 1)*1, axis = 1)
   
    return df

In [11]:
for df in arr_df:
    df = manipulate(df)

In [12]:
for col in df_train.columns:
    print(col,df_train[col].value_counts(normalize=True),"\n")

No-show 0    0.79639
1    0.20361
Name: No-show, dtype: float64 

PatientId 8.221459e+14    0.000829
9.963767e+10    0.000754
2.688613e+13    0.000709
3.353478e+13    0.000633
6.684488e+13    0.000618
                  ...   
9.761626e+12    0.000015
8.696716e+12    0.000015
2.715915e+13    0.000015
5.972713e+11    0.000015
5.868530e+13    0.000015
Name: PatientId, Length: 44387, dtype: float64 

Gender F    0.647837
M    0.352163
Name: Gender, dtype: float64 

ScheduledDay 2016-05-03    0.038183
2016-05-02    0.037625
2016-05-05    0.037338
2016-05-16    0.037202
2016-05-10    0.037006
2016-05-09    0.035106
2016-04-29    0.033342
2016-05-04    0.032769
2016-05-11    0.032482
2016-05-18    0.031381
2016-05-24    0.031351
2016-05-30    0.031246
2016-05-17    0.031185
2016-05-06    0.030763
2016-05-12    0.030160
2016-05-19    0.029044
2016-05-13    0.027913
2016-05-20    0.027521
2016-05-25    0.027054
2016-05-31    0.025877
2016-06-02    0.025380
2016-06-01    0.025169
2016-04-28    0

### Labeling delle variabili categoriche

In [13]:
categorical_features = df_train.select_dtypes(include=["object","timedelta64","float64"]).columns
excluded_features = ["ScheduledDay","AppointmentDay"]
categorical_features

Index(['PatientId', 'Gender', 'ScheduledDay', 'AppointmentDay',
       'Neighbourhood', 'Interval'],
      dtype='object')

In [14]:
label_df_train=df_train.copy()
label_df_test=df_test.copy()

arr_label_df = [label_df_train,label_df_test]

label_encoder=LabelEncoder()
for label_df in arr_label_df:
    for col in categorical_features:
        if col not in excluded_features:
            label_df[col]=label_encoder.fit_transform(label_df[col])
    # Tratto le date come ordinal, così da avere correlazione tra test e train
    label_df["ScheduledDay"] = label_df["ScheduledDay"].apply(lambda x : x.toordinal())
    label_df["AppointmentDay"] = label_df["AppointmentDay"].apply(lambda x : x.toordinal())


In [15]:
label_df_train

Unnamed: 0,No-show,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Interval,SommaAppuntamenti,ScheduledWeekday,AppointmentWeekday,HasHandicap,isWeak
0,1,8655,0,736069,736102,29,66,0,0,0,0,0,0,33,0,5,3,0,0
1,0,5944,0,736121,736121,5,41,0,0,0,0,0,0,0,0,1,1,0,0
2,0,37617,1,736074,736094,64,39,0,0,0,0,0,1,20,0,3,2,0,0
3,0,40559,0,736103,736103,66,55,0,1,0,0,0,0,0,0,4,4,0,1
4,0,8209,1,736114,736114,72,59,0,0,0,0,0,0,0,3,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66310,0,15786,0,736103,736117,11,70,0,0,0,0,0,1,14,3,4,4,0,0
66311,0,30175,1,736074,736095,50,78,0,1,1,0,0,1,21,0,3,3,0,1
66312,0,40128,0,736116,736116,42,1,0,0,0,0,0,0,0,2,3,3,0,0
66313,1,10730,1,736067,736089,13,5,0,0,0,0,0,1,22,0,3,4,0,0


In [16]:
label_df_test

Unnamed: 0,No-show,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Interval,SommaAppuntamenti,ScheduledWeekday,AppointmentWeekday,HasHandicap,isWeak
0,0,874,1,736121,736123,9,35,0,0,0,0,0,0,2,0,1,3,0,0
1,0,8724,1,736109,736114,50,1,0,0,1,0,0,1,5,2,3,1,0,1
2,0,807,0,736095,736095,69,38,0,0,0,0,0,0,0,0,3,3,0,0
3,0,13131,0,736096,736100,22,23,0,0,0,0,0,0,4,0,4,1,0,0
4,0,14844,0,736101,736101,1,56,0,0,0,0,0,0,0,0,2,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22101,0,16563,0,736086,736102,9,18,0,0,0,0,0,0,16,0,1,3,0,0
22102,0,18174,0,736097,736104,6,14,0,0,0,0,0,0,7,0,5,5,0,0
22103,0,15387,0,736080,736088,50,38,0,0,0,0,0,1,8,0,2,3,0,0
22104,0,13404,1,736103,736103,15,38,0,0,0,0,0,0,0,1,4,4,0,0


In [17]:
target="No-show"

ground_truth_train=label_df_train[target]
other_features_train=label_df_train.drop(target, axis=1)

ground_truth_test=label_df_test[target]
other_features_test=label_df_test.drop(target, axis=1)

### Addestramento, predizione e accuracy

In [18]:
model = LogisticRegression()
model.fit(other_features_train,ground_truth_train)
val_pred=model.predict(other_features_test)

print("Using Logistic Regression", accuracy_score(ground_truth_test,val_pred.round())*100, "%")


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Using Logistic Regression 80.19904998869035 %
