# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

In [2]:
data = pd.read_csv("/kaggle/input/dementia-patient-health-and-prescriptions-dataset/dementia_patients_health_data.csv")
data.head(5)

Unnamed: 0,Diabetic,AlcoholLevel,HeartRate,BloodOxygenLevel,BodyTemperature,Weight,MRI_Delay,Prescription,Dosage in mg,Age,...,Smoking_Status,APOE_ε4,Physical_Activity,Depression_Status,Cognitive_Test_Scores,Medication_History,Nutrition_Diet,Sleep_Quality,Chronic_Health_Conditions,Dementia
0,1,0.084974,98,96.230743,36.224852,57.563978,36.421028,,,60,...,Current Smoker,Negative,Sedentary,No,10,No,Low-Carb Diet,Poor,Diabetes,0
1,0,0.016973,78,93.032122,36.183874,56.832335,31.157633,Galantamine,12.0,61,...,Former Smoker,Positive,Moderate Activity,No,1,Yes,Low-Carb Diet,Poor,Heart Disease,1
2,0,0.009,89,93.566504,37.326321,59.759066,37.640435,,,69,...,Former Smoker,Negative,Moderate Activity,No,8,No,Mediterranean Diet,Poor,Heart Disease,0
3,0,0.086437,60,93.90651,37.03062,58.266471,50.673992,Donepezil,23.0,78,...,Never Smoked,Negative,Mild Activity,Yes,5,Yes,Balanced Diet,Poor,Hypertension,1
4,1,0.150747,67,97.508994,36.062121,67.705027,27.810601,Memantine,20.0,77,...,Never Smoked,Positive,Mild Activity,No,0,Yes,Low-Carb Diet,Good,Diabetes,1


# Some Data Exploration

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Diabetic                   1000 non-null   int64  
 1   AlcoholLevel               1000 non-null   float64
 2   HeartRate                  1000 non-null   int64  
 3   BloodOxygenLevel           1000 non-null   float64
 4   BodyTemperature            1000 non-null   float64
 5   Weight                     1000 non-null   float64
 6   MRI_Delay                  1000 non-null   float64
 7   Prescription               485 non-null    object 
 8   Dosage in mg               485 non-null    float64
 9   Age                        1000 non-null   int64  
 10  Education_Level            1000 non-null   object 
 11  Dominant_Hand              1000 non-null   object 
 12  Gender                     1000 non-null   object 
 13  Family_History             1000 non-null   object

Let's check for Dementia status of people who are taking prescriptions.

In [4]:
not_nan_dosage = [ not np.isnan(dosage) for dosage in data['Dosage in mg'] ]
np.unique(data['Dementia'][not_nan_dosage]), len(data['Dementia'][not_nan_dosage])

(array([1]), 485)

In [5]:
not_nan_prescription = [ prescription == prescription for prescription in data['Prescription'] ]
np.unique(data['Dementia'][not_nan_prescription]) , len(data['Dementia'][not_nan_prescription])

(array([1]), 485)

In [6]:
len( data[ data['Dementia'] == 1])

485

Thus, Presence of a prescription and dosage is strong indicator of dementia +ve.

# Data Cleaning

In [7]:
categorical_variables = [ key for key in data.keys() if data.dtypes[key] == 'object' ]
numerical_variables = [ key for key in data.keys() if (data.dtypes[key] == 'int64' or data.dtypes[key] == 'float64') and key != 'Dementia' ]

def data_cleaning(df):
    #removing NaNs
    for variable in ['Prescription','Chronic_Health_Conditions']:
        df[variable] = df[variable].apply(lambda x: x if x == x else 'NA')
    df['Dosage in mg'] = df['Dosage in mg'].fillna(0)
    
    #categorical variables
    for variable in categorical_variables:
        df[variable] = df[variable].astype('category')
    
    # Since, prescription is correlated strongly with Dementia, replacing it with 1 and 0.
    df['Prescription'] = [0 if _ == 'NA' else 1 for _ in df['Prescription'] ]
    
    #encoding some categorical variables
    #data['Dominant_hand'] = [0 if _ == 'Left' else 1 for _ in df['Dominant_hand'] ]
    
    #scaling_numerical_features
    scaler = StandardScaler()
    df[numerical_variables] = scaler.fit_transform(df[numerical_variables])
    return df

In [8]:
data = data_cleaning(data)
categorical_variables = [ key for key in data.keys() if data.dtypes[key] == 'category' ]

In [9]:
categorical_variables

['Education_Level',
 'Dominant_Hand',
 'Gender',
 'Family_History',
 'Smoking_Status',
 'APOE_ε4',
 'Physical_Activity',
 'Depression_Status',
 'Medication_History',
 'Nutrition_Diet',
 'Sleep_Quality',
 'Chronic_Health_Conditions']

# Little more Data Analysis

In [10]:
for variable in categorical_variables:
    categories = data[variable].unique()
    print("\n",variable)
    for category in categories:
        print(category,'No. of dementia patients:\t',len( data.loc[ data[variable].eq(category) & data['Dementia'].eq(1) ]),'out of',len( data.loc[ data[variable].eq(category)]))


 Education_Level
Primary School No. of dementia patients:	 181 out of 389
Secondary School No. of dementia patients:	 155 out of 304
No School No. of dementia patients:	 98 out of 155
Diploma/Degree No. of dementia patients:	 51 out of 152

 Dominant_Hand
Left No. of dementia patients:	 251 out of 519
Right No. of dementia patients:	 234 out of 481

 Gender
Female No. of dementia patients:	 244 out of 504
Male No. of dementia patients:	 241 out of 496

 Family_History
No No. of dementia patients:	 255 out of 480
Yes No. of dementia patients:	 230 out of 520

 Smoking_Status
Current Smoker No. of dementia patients:	 0 out of 90
Former Smoker No. of dementia patients:	 252 out of 458
Never Smoked No. of dementia patients:	 233 out of 452

 APOE_ε4
Negative No. of dementia patients:	 50 out of 306
Positive No. of dementia patients:	 435 out of 694

 Physical_Activity
Sedentary No. of dementia patients:	 158 out of 331
Moderate Activity No. of dementia patients:	 158 out of 318
Mild Activ

In these categorical variables, we can see that in most categories across variables, roughly 50% people suffer from dementia. Also, for APOE_ε4 Positive people prevalence is very high and reverse for APOE_ε4 negative people. Not a single smoker is dementia patient! (I absolutely do not encourage smoking in any way though). However, we won't be making any changes to the dataset based on these observations and leave it to the decision tree in XGBClassifier to figure it out.

In [11]:
data.corr(numeric_only=True)

Unnamed: 0,Diabetic,AlcoholLevel,HeartRate,BloodOxygenLevel,BodyTemperature,Weight,MRI_Delay,Prescription,Dosage in mg,Age,Cognitive_Test_Scores,Dementia
Diabetic,1.0,-0.025616,-0.007188,-0.016575,-0.053851,-0.007501,-0.020378,0.044815,0.04921,0.000923,-0.050019,0.044815
AlcoholLevel,-0.025616,1.0,-0.046341,-0.005562,0.030854,0.034553,0.00416,-0.00372,0.0083,-0.003081,-0.018716,-0.00372
HeartRate,-0.007188,-0.046341,1.0,0.022283,0.015585,-0.00173,0.000169,0.012276,-0.020054,0.011665,0.005493,0.012276
BloodOxygenLevel,-0.016575,-0.005562,0.022283,1.0,-0.007265,-0.041081,-0.00576,-0.071454,-0.062321,0.029565,0.070218,-0.071454
BodyTemperature,-0.053851,0.030854,0.015585,-0.007265,1.0,-0.01775,0.015953,0.033928,0.045258,0.001586,-0.009791,0.033928
Weight,-0.007501,0.034553,-0.00173,-0.041081,-0.01775,1.0,-0.02289,-0.049563,-0.029248,0.011739,0.035821,-0.049563
MRI_Delay,-0.020378,0.00416,0.000169,-0.00576,0.015953,-0.02289,1.0,0.031251,-0.035924,-0.002093,-0.052227,0.031251
Prescription,0.044815,-0.00372,0.012276,-0.071454,0.033928,-0.049563,0.031251,1.0,0.713842,-0.062154,-0.843247,1.0
Dosage in mg,0.04921,0.0083,-0.020054,-0.062321,0.045258,-0.029248,-0.035924,0.713842,1.0,-0.050452,-0.619073,0.713842
Age,0.000923,-0.003081,0.011665,0.029565,0.001586,0.011739,-0.002093,-0.062154,-0.050452,1.0,0.06264,-0.062154


Low correlation of AlcoholLevel, HeartRate, BodyTemperature with Dementia. So these can be safely dropped.

# Model 1

In [12]:
def data_preprocessing1(df):
    df = df.drop( ['AlcoholLevel', 'HeartRate', 'BodyTemperature' ],axis = 1 ) # dropping because of very low correlation
    return df['Dementia'], df.drop('Dementia',axis=1)
y, X = data_preprocessing1(data)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.125, random_state = 42)

In [13]:
xgb_clf = XGBClassifier(n_estimators=350, enable_categorical=True, 
                        colsample_bytree= 0.75, max_depth= 7, early_stopping_round=3, n_jobs=-1, gamma=0.1, verbose=3)

In [14]:
xgb_clf.fit(X_train, y_train)

Parameters: { "early_stopping_round", "verbose" } are not used.



In [15]:
classification_report(y_test,xgb_clf.predict(X_test),output_dict=True)

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 61},
 '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64},
 'accuracy': 1.0,
 'macro avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 125},
 'weighted avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 125}}

# Caution

**Note:-  This 100% accuracy is probably due to strong correlation of Prescription and Dosage attributes with Positive patients. This is probably because they are already diagnosed for Dementia and taking medicines for that, so this makes the task slightly meaningless as well. To try and see whether prediction is possible without Prescription and Dosage attributes, we drop these two attributes and train the model again.**

# Model 2

In [16]:
def data_preprocessing2(df):
    df = df.drop( ['AlcoholLevel', 'HeartRate', 'BodyTemperature', 'Dosage in mg', 'Prescription'],axis = 1 )
    return df['Dementia'], df.drop('Dementia',axis=1)
y, X = data_preprocessing2(data)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.125, random_state = 42)

In [17]:
xgb_clf2 = XGBClassifier(n_estimators=350, enable_categorical=True, 
                        colsample_bytree= 0.75, max_depth= 7, early_stopping_round=3, n_jobs=-1, gamma=0.1, verbose=3)

In [18]:
xgb_clf2.fit(X_train, y_train)

Parameters: { "early_stopping_round", "verbose" } are not used.



In [19]:
classification_report(y_test,xgb_clf2.predict(X_test),output_dict=True)

{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 61},
 '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 64},
 'accuracy': 1.0,
 'macro avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 125},
 'weighted avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 125}}

**It stil predicts as accurately when trained without those attributes.**