# Data preprocessing and separation of final test dataset

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime, date

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Load the patient table and report its dimensions before any column is touched.
patients = pd.read_csv('../myCSV/modified_patients.csv')
print('Total shape : ',patients.shape)
# NOTE(review): the source column is named COUNTY and its values look like counties
# (e.g. "Hampden County"); the rename to COUNTRY is kept because later cells use it.
patients = patients.rename(columns={'COUNTY': 'COUNTRY'})
patients.head()

Total shape :  (1171, 15)


Unnamed: 0,Id,BIRTHDATE,SSN,FIRST,LAST,RACE,ETHNICITY,GENDER,BIRTHPLACE,CITY,STATE,COUNTRY,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,non_adherence
0,1d604da9-9a81-4ba9-80c2-de3375d59b40,1989-05-25,999-76-6866,José Eduardo181,Gómez206,white,hispanic,M,Marigot Saint Andrew Parish DM,Chicopee,Massachusetts,Hampden County,271227.08,1334.88,1.0
1,034e9e3b-2def-4559-bb2a-7850888ae060,1983-11-14,999-73-5361,Milo271,Feil794,white,nonhispanic,M,Danvers Massachusetts US,Somerville,Massachusetts,Middlesex County,793946.01,3204.49,0.0
2,10339b10-3cd1-4ac3-ac13-ec26728cb592,1992-06-02,999-27-3385,Jayson808,Fadel536,white,nonhispanic,M,Springfield Massachusetts US,Chicopee,Massachusetts,Hampden County,574111.9,2606.4,0.0
3,8d4c4326-e9de-4f45-9a4c-f8c36bff89ae,1978-05-27,999-85-4926,Mariana775,Rutherford999,white,nonhispanic,F,Yarmouth Massachusetts US,Lowell,Massachusetts,Middlesex County,935630.3,8756.19,0.0
4,f5dcd418-09fe-4a2f-baa0-3da800bd8c3a,1996-10-18,999-60-7372,Gregorio366,Auer97,white,nonhispanic,M,Patras Achaea GR,Boston,Massachusetts,Suffolk County,598763.07,3772.2,0.0


Extracting Age using BIRTHDATE 

In [3]:
def age(born, today=None):
    """Return the age in completed years for a birth date string.

    Parameters
    ----------
    born : str
        Birth date formatted as ``YYYY-MM-DD``.
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to the current
        date; pass a fixed date to get the same ages on every run.

    Returns
    -------
    int
        Number of completed years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` does not match the ``%Y-%m-%d`` format.
    """
    born = datetime.strptime(born, "%Y-%m-%d").date()
    if today is None:
        today = date.today()
    # The tuple comparison is True (counts as 1) while this year's birthday is still ahead.
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [4]:
# Derive each patient's age in completed years from the BIRTHDATE string.
patients['Age'] = patients['BIRTHDATE'].map(age)
patients.loc[:, ['BIRTHDATE', 'Age']].head()

Unnamed: 0,BIRTHDATE,Age
0,1989-05-25,32
1,1983-11-14,38
2,1992-06-02,29
3,1978-05-27,43
4,1996-10-18,25


In [5]:
# BIRTHDATE is no longer needed once Age exists; then count missing values per column.
patients.drop(columns=['BIRTHDATE'], inplace=True)
patients.isna().sum()

Id                      0
SSN                     0
FIRST                   0
LAST                    0
RACE                    0
ETHNICITY               0
GENDER                  0
BIRTHPLACE              0
CITY                    0
STATE                   0
COUNTRY                 0
HEALTHCARE_EXPENSES     0
HEALTHCARE_COVERAGE     0
non_adherence          19
Age                     0
dtype: int64

In [None]:
# Persist the frame with the Age column added and BIRTHDATE removed.
# NOTE(review): this writes to the same path the notebook reads from. After it runs,
# a Restart & Run All loads a file without BIRTHDATE and the Age cell fails -
# consider writing to a separate output file.
patients.to_csv('../myCSV/modified_patients.csv',index = False)

#### Rows in which the non_adherence value is NULL are treated as the final testing dataset. They are separated at the start, and all further work uses df, which has no NULL values in non_adherence.

In [6]:
# Rows without a label form the final hold-out set; labelled rows are used for modelling.
# .copy() gives each frame its own data, so the in-place edits made to them later never
# act on a slice of `patients` (the chained-assignment warning is switched off in the
# imports cell, which would otherwise hide such a problem).
final_testing_dataset = patients[patients['non_adherence'].isnull()].copy()
df = patients[patients['non_adherence'].notnull()].copy()
# The label column is float because of the missing values; store it as integers.
df['non_adherence'] = df['non_adherence'].astype(np.int64)
final_testing_dataset.shape

(19, 15)

In [None]:
# Total number of duplicate rows: counts rows that repeat an earlier row in every
# column (duplicated() marks all occurrences except the first).
df.duplicated().sum()

In [None]:
# Compare the number of distinct patient ids with the number of rows.
unique_patient_count = df['Id'].nunique()
record_count = len(df)
print('Number of unique patients = ',unique_patient_count)
print('Number of records = ',record_count)

Since the number of unique patients equals the number of records, each row contains the data of a different patient.

In [None]:
# seperating target column from main dataframe
# xtrain = df.drop('non-adherence',axis = 1)
# labels = df['non-adherence']
# xtrain.shape

# from sklearn.model_selection import train_test_split
# X_train, X, y_train, y = train_test_split(xtrain,labels,test_size=0.1,train_size=0.9)
# X_test, X_valid, y_test, y_valid = train_test_split(X,y,test_size = 0.5,train_size =0.5)

# print(X_train.shape), print(y_train.shape)
# print(X_valid.shape), print(y_valid.shape)
# print(X_test.shape), print(y_test.shape)

In [None]:
# (rows, columns) of the labelled frame that is split into train / validate / test below.
df.shape

In [None]:
#### Train test validation ratio = 90% : 5% : 5%

#### Separating the training data set for performing Exploratory Data Analysis

In [None]:
# Shuffle once with a fixed seed, then cut at 90% and 95% to get the
# train / validate / test partitions (90% : 5% : 5%).
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.9*len(df))
validate_end = int(.95*len(df))
# .copy() so columns can be added to a partition without touching `shuffled`.
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()
train.head()

In [None]:
# Shapes are printed in the order train, test, validate.
for partition in (train, test, validate):
    print(partition.shape)

# Exploratory Data Analysis

In [None]:
# Print the number of distinct values in each column of the training split.
for col, unique_count in train.nunique().items():
    print(col ,' : ' ,unique_count)

In [None]:
# Count of adherent / non-adherent patients per RACE in the training split.
sns.set_theme(style="whitegrid", rc={'figure.figsize':(6,6)})
sns.countplot(x='RACE', hue='non_adherence', data=train, palette="rainbow")
# Author's reading of the plot:
# - the highest non_adherence count is observed for the white race;
# - patients recorded as white far outnumber every other race.
# This motivates how RACE is handled later: white is encoded as 1, everything else as 0.

In [None]:
# Mean non_adherence per GENDER on the training split.
sns.set_theme(style="whitegrid")
sns.barplot(data=train, x='GENDER', y='non_adherence', color="salmon", saturation=.8)
# Author's reading of the plot: females are slightly more non-adherent than males.

In [None]:
# Count of adherent / non-adherent patients per COUNTRY value, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(6, 6))
sns.set_theme(style="whitegrid")
sns.countplot(y='COUNTRY', hue='non_adherence', data=train, palette="Accent", ax=ax)
# Author's reading of the plot: most patients come from Middlesex.

In [None]:
def Mapping_delay(x):
    """Map each value of ``x`` to an age-bucket label.

    Values of 60 or more map to "(>= 60)", values from 25 up to (not including)
    60 map to "(25-60)", and everything else maps to "(< 25)".

    Returns a list of labels in the same order as the input.
    """
    def bucket_label(value):
        # Guard clauses checked from the highest bucket downwards.
        if value >= 60:
            return "(>= 60)"
        if value >= 25:
            return "(25-60)"
        return "(< 25)"

    return [bucket_label(value) for value in list(x)]

# Bucket the ages for this plot only; `train` keeps its original columns.
train_with_age_group = train.assign(Age_Group=Mapping_delay(train['Age']))

sns.set_theme(style="whitegrid", rc={'figure.figsize':(6,6)})
sns.countplot(x='Age_Group', hue='non_adherence', data=train_with_age_group, palette="PuRd");
# Author's reading of the plot: among people aged 60 or more, the major portion is
# non-adherent, i.e. senior citizens tend to be non-adherent.

In [None]:
# HEALTHCARE_EXPENSES of each training patient, grouped by the non_adherence label.
sns.set_theme(style="whitegrid")
sns.catplot(data=train, y="HEALTHCARE_EXPENSES", x="non_adherence", palette="Set1")
# Author's reading of the plot: non-adherent patients have higher expenses,
# so cost appears to play a major role in determining adherence.

#### Determining Outliers 

In [None]:
# Box plot of HEALTHCARE_EXPENSES over all labelled rows, used to look for outliers.
plt.figure(figsize=(10,6))
sns.boxplot(data=df, x="HEALTHCARE_EXPENSES")
# Author's reading of the plot: no notable outliers.

In [None]:
# Box plot of HEALTHCARE_COVERAGE on the training split; the x-axis is limited to 0-22000.
sns.set(style="ticks")
plt.figure(figsize=(10,6))
sns.boxplot(data=train, x="HEALTHCARE_COVERAGE")
plt.xlim(0, 22000)
plt.grid()
plt.show()

In [None]:
# Remove labelled rows whose HEALTHCARE_COVERAGE is above 22000, treating them as outliers.
coverage_outlier_index = df.index[df['HEALTHCARE_COVERAGE'] > 22000]
df.drop(index=coverage_outlier_index, inplace=True)

# Handling Categorical DataTypes

In [None]:
# List the columns still present before the identifier columns are removed.
print(df.columns)

#### First dropping columns that would not play any role in training the model <br> Hence dropping columns -> Id, SSN, FIRST, LAST (BIRTHPLACE and COUNTRY are kept for now and frequency-encoded later)

In [None]:
# Identifier and name columns carry no signal for the model.
df.drop(columns=['Id', 'SSN', 'FIRST', 'LAST'], inplace=True)

In [None]:
# For every remaining column show its number of distinct values and its dtype.
for col in df.columns:
    column = df[col]
    print(col ,' : ' ,column.nunique() , ' , Dtype : ', column.dtypes)

In [None]:
# STATE holds a single value throughout (Massachusetts), so it cannot help the model
# tell patients apart and is removed.
df.drop(columns='STATE', inplace=True)

In [None]:
# GENDER has only two values (M and F): one-hot encode it with get_dummies and keep a
# single indicator (drop_first), placed in front of the remaining columns.
gender_encoded = pd.get_dummies(df['GENDER'], prefix='GENDER', drop_first=True)
df = pd.concat([gender_encoded, df.drop(columns='GENDER')], axis=1)

In [None]:
# ETHNICITY also has only two values: same one-hot treatment, keeping a single indicator.
ETHNICITY_encoded = pd.get_dummies(df['ETHNICITY'], prefix='ETHNICITY', drop_first=True)
df = pd.concat([ETHNICITY_encoded, df.drop(columns='ETHNICITY')], axis=1)

In [None]:
# White patients far outnumber every other race, so RACE is reduced to two
# categories: white (1) and everything else (0).
#
# Only the RACE_white indicator is kept. Selecting it by name, instead of dropping a
# hard-coded list of the other indicators, keeps the result correct when a listed
# category is absent from the data (the drop would raise KeyError) or when an
# unlisted category is present (its indicator would be left in the frame).
RACE_encoded = pd.get_dummies(data=df['RACE'], prefix='RACE')[['RACE_white']]
df.drop('RACE',axis = 1,inplace=True)
df = pd.concat([RACE_encoded,df],axis = 1)
df.head()

#### Frequency Encoding on CITY, COUNTRY, BIRTHPLACE

In [None]:
# Frequency encoding: replace each CITY by the share of rows in df that have it.
fe = df['CITY'].value_counts() / len(df)
df['CITY_Encoded'] = df['CITY'].map(fe)

In [None]:
# Frequency encoding: replace each COUNTRY value by the share of rows in df that have it.
fe = df['COUNTRY'].value_counts() / len(df)
df['COUNTRY_Encoded'] = df['COUNTRY'].map(fe)

In [None]:
# Frequency encoding: replace each BIRTHPLACE by the share of rows in df that have it.
fe = df['BIRTHPLACE'].value_counts() / len(df)
df['BIRTHPLACE_Encoded'] = df['BIRTHPLACE'].map(fe)

In [None]:
# The raw text columns are redundant now that their *_Encoded counterparts exist.
df.drop(columns=['CITY', 'COUNTRY', 'BIRTHPLACE'], inplace=True)
df.head()

# Feature Selection

In [None]:
# Re-create the 90% : 5% : 5% partitions from the fully encoded frame, using the same
# seed as the earlier split.
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.9*len(df))
validate_end = int(.95*len(df))
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()
print(train.shape)
print(test.shape)
print(validate.shape)

In [None]:
# Separate every partition into its feature matrix (X_*) and its label vector (y_*).
X_train, y_train = train.drop(columns='non_adherence'), train['non_adherence']
X_test, y_test = test.drop(columns='non_adherence'), test['non_adherence']
X_validate, y_validate = validate.drop(columns='non_adherence'), validate['non_adherence']

X_train.head()

In [None]:
# X_train.drop(['RACE_native','RACE_other'],inplace = True, axis = 1)
# X_test.drop(['RACE_native','RACE_other'],inplace = True, axis = 1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a small seeded forest and rank the features by its feature_importances_,
# highest first.
model = RandomForestClassifier(n_estimators = 20 ,random_state = 10)
model.fit(X_train, y_train)
importances = pd.DataFrame(
    {'Attribute': X_train.columns, 'Importance': model.feature_importances_}
).sort_values(by='Importance', ascending=False)

In [None]:
# Bar chart of the importances computed by the random forest above.
sns.set(rc={'figure.figsize':(6,5)})
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#7FFFD4')
# The values come from RandomForestClassifier.feature_importances_, not from model
# coefficients, so the title names them accordingly.
plt.title('Feature importances obtained from random forest', size=20)
plt.xticks(rotation='vertical')
plt.show()

### From the above observation, we can see that there is no column that doesn't play a role in training.

# Selecting correct Model

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with default settings, scored by accuracy on the validation split.
model_logistic = LogisticRegression().fit(X_train, y_train)
y_predict = model_logistic.predict(X_validate)
print(accuracy_score(y_true=y_validate, y_pred=y_predict))

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes, scored by accuracy on the validation split.
Naive_Bayes = GaussianNB().fit(X_train, y_train)
y_predict = Naive_Bayes.predict(X_validate)
print(accuracy_score(y_true=y_validate, y_pred=y_predict))

#### K-Nearest Neighbor Algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier, scored by accuracy on the validation split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_predict = knn.predict(X_validate)
print(accuracy_score(y_true=y_validate, y_pred=y_predict))

#### SVM

In [None]:
from sklearn import svm

# Support vector classifier with default settings, scored by accuracy on the validation split.
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)
# Predict with the SVM itself: predicting with `knn` here would print the KNN score
# under the SVM heading.
y_predict = svm_clf.predict(X_validate)
print(accuracy_score(y_validate, y_predict))

#### Decision Tree

In [None]:
from sklearn import tree

# Depth-2 decision tree, scored by accuracy on the validation split.
dtc = tree.DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
y_predict = dtc.predict(X_validate)
print(accuracy_score(y_true=y_validate, y_pred=y_predict))

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Depth-2 random forest, scored by accuracy on the validation split.
# random_state is fixed so the validation score, the model comparison and the final
# predictions made with `rf` are the same on every run of the notebook.
rf = RandomForestClassifier(max_depth=2, random_state=42)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_validate)
print(accuracy_score(y_validate, y_predict))

### From the above accuracy scores, we can conclude that the random forest classifier performs the best


# Preparing and Prediction of final_testing_dataset

In [None]:
# final_testing_dataset was built from the rows whose non_adherence is NaN,
# so every value in that column is missing here.
final_testing_dataset.head(3)

In [None]:
# Remove the identifier columns, the constant STATE column and the empty label column,
# mirroring the columns removed from the training frame.
unused_columns = ['Id', 'SSN', 'FIRST', 'LAST', 'STATE', 'non_adherence']
final_testing_dataset.drop(columns=unused_columns, inplace=True)

#### Handling Categorical DataTypes

In [None]:
#### One Hot encoding on GENDER, ETHNICITY, RACE
# The training features contain exactly one indicator per variable: RACE_white,
# ETHNICITY_nonhispanic and GENDER_M. Build those indicators directly. Calling
# get_dummies on this small hold-out set only creates columns for the categories
# that happen to occur in it, so its columns can differ from the training columns.
final_testing_dataset = final_testing_dataset.assign(
    RACE_white=(final_testing_dataset['RACE'] == 'white').astype(int),
    ETHNICITY_nonhispanic=(final_testing_dataset['ETHNICITY'] == 'nonhispanic').astype(int),
    GENDER_M=(final_testing_dataset['GENDER'] == 'M').astype(int),
).drop(['RACE', 'ETHNICITY', 'GENDER'], axis = 1)

#### Frequency Encoding on CITY, COUNTRY, BIRTHPLACE
# Use the frequencies of the labelled rows the models were trained on (label present,
# HEALTHCARE_COVERAGE not above 22000). Frequencies recomputed on the hold-out rows
# alone would be on a different scale from the values seen during fitting.
labelled_rows = patients[patients['non_adherence'].notnull()
                         & ~(patients['HEALTHCARE_COVERAGE'] > 22000)]
for col in ['CITY', 'COUNTRY', 'BIRTHPLACE']:
    fe = labelled_rows.groupby(col).size()/len(labelled_rows)
    # A category that never occurs among the labelled rows gets frequency 0.
    final_testing_dataset[col + '_Encoded'] = final_testing_dataset[col].map(fe).fillna(0.0)

final_testing_dataset.drop(['CITY','COUNTRY','BIRTHPLACE'],axis = 1,inplace=True)
# Same columns, in the same order, as the matrix the models were fitted on.
final_testing_dataset = final_testing_dataset[list(X_train.columns)]
final_testing_dataset.head()

#### Performance of X_test data set

In [None]:
# Accuracy of the random forest on the held-back test split.
y_predict = rf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(test_accuracy)

#### Predicting values on final_testing_dataset

In [None]:
# Predict the missing labels with the random forest.
# The prediction column is excluded from the model input so that re-running this
# cell does not feed an earlier prediction back in as an extra feature.
final_testing_dataset['Predicted_non_adherence'] = rf.predict(
    final_testing_dataset.drop(columns='Predicted_non_adherence', errors='ignore')
)

In [None]:
# Predicted values: the model's non_adherence prediction for each hold-out row.
final_testing_dataset['Predicted_non_adherence']

#### Dumping Model as .pickle

In [None]:
# import pickle
# pickle.dump(rf, open('../Trained Model/AdherenceModel', 'wb'))

#### User Example

In [None]:
#Taking an example 
# Name : mayank chittora
# Age : 21
# Race : Asian
# ethnicity: nonhispanic
# gender : Male
# HEALTHCARE_EXPENSES : 200000
# HEALTHCARE_COVERAGE : 10000
#CITY :0.052632
#COUNTRY : 0.052632
#BIRTHPLACE : 0.105263

In [None]:
# One-row feature frame for the example user described above, with the columns in the
# order the models were fitted on.
df_user = pd.DataFrame({
    # The example user's race is Asian, i.e. not white, so the indicator is 0.
    "RACE_white": [0],
    "ETHNICITY_nonhispanic": [1],
    "GENDER_M": [1],
    "HEALTHCARE_EXPENSES": [200000],
    "HEALTHCARE_COVERAGE": [10000],
    "Age": [21],
    "CITY_Encoded": [0.052632],
    "COUNTRY_Encoded": [0.052632],
    "BIRTHPLACE_Encoded": [0.105263],
})
df_user.head()

In [None]:
# Predict with the random forest, the model selected above and used for the hold-out
# predictions, rather than the decision tree.
rf.predict(df_user)