In [18]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from ISLP.models import ModelSpec as MS
import warnings 
warnings.filterwarnings('ignore') # mute warning messages
from sklearn.metrics import confusion_matrix, accuracy_score


In [5]:
df = pd.read_csv('asthma_disease_data_processed.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Gender,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,...,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Ethnicity_0,Ethnicity_1,Ethnicity_2,Ethnicity_3,Diagnosis
0,0,63,0,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,...,0,1,0,0,1,0.0,1.0,0.0,0.0,0
1,1,26,1,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,...,0,0,1,1,1,0.0,0.0,1.0,0.0,0
2,2,57,0,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,...,1,1,0,1,1,0.0,0.0,1.0,0.0,0
3,3,40,1,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,...,0,1,1,1,0,0.0,0.0,1.0,0.0,0
4,4,61,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,...,1,1,0,0,1,1.0,0.0,0.0,0.0,0
5,5,21,0,0,21.812975,0,0.470044,1.759118,9.549262,1.711446,...,0,1,0,0,1,0.0,0.0,1.0,0.0,0
6,6,45,1,1,30.245954,1,9.371784,7.030507,5.746128,7.664306,...,1,1,0,0,0,0.0,1.0,0.0,0.0,0
7,7,26,0,1,26.048416,1,8.344096,1.626484,6.431179,6.939046,...,0,0,1,1,1,1.0,0.0,0.0,0.0,0
8,8,49,1,2,32.676204,0,2.690256,3.920034,5.843645,3.180421,...,1,1,1,0,0,0.0,1.0,0.0,0.0,0
9,9,45,1,1,29.910298,0,2.89572,2.6077,7.234908,1.711722,...,0,0,0,1,1,0.0,1.0,0.0,0.0,0


## Is our sample biased? Data Visualization: 

In [6]:
fig = px.histogram(df, x='Age', nbins = 10, height=600,width=1000, title = 'Age Distribution')
fig.update_layout(bargap=0.1)
fig.show()

In [7]:
fig = px.histogram(df, x='Gender', nbins = 2, height=600,width=1000, title = 'Gender Distribution')
fig.update_layout(bargap=0.5)
fig.show()

In [8]:
fig = px.histogram(df, x='EducationLevel', nbins = 4, height=600,width=1000, title = 'Education Distribution')
fig.update_layout(bargap=0.3)
fig.show()

#### Conclusion: It seems like the Age and Gender are evenly distributed and the Education fits the general population's education

## Model Fitting: KNN

In [9]:
X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [47]:
# Model fit and predict (Neighbours = 5)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
train = knn.predict(X_train)
y_pred = knn.predict(X_test)
[accuracy_score(y_train, train),accuracy_score(y_test, y_pred)]


[0.9613173026659697, 0.9498956158663883]

In [48]:
# Model fit and predict (Neighbours = 3)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
train = knn.predict(X_train)
y_pred = knn.predict(X_test)
[accuracy_score(y_train, train),accuracy_score(y_test, y_pred)]

[0.9681129116570831, 0.954070981210856]

In [49]:
# Model fit and predict (Neighbours = 1)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
train = knn.predict(X_train)
y_pred = knn.predict(X_test)
[accuracy_score(y_train, train),accuracy_score(y_test, y_pred)]

[1.0, 0.9478079331941545]

In [50]:
# Model fit and predict (Neighbours = 10)
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
train = knn.predict(X_train)
y_pred = knn.predict(X_test)
[accuracy_score(y_train, train),accuracy_score(y_test, y_pred)]

[0.9560899111343439, 0.9519832985386222]

### Conclusion: Using the KNN model with n_neighbours = 3 yield the highest accuracy_score, so we should use the KNN model with n_neighbours = 3