# Notebook Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time as t
from matplotlib import cm
from sklearn.preprocessing import StandardScaler

In [4]:
warnings.filterwarnings('ignore')

# Get The Data

In [5]:
dfx=pd.read_csv('Diabetes_XTrain.csv')

In [6]:
dfx

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,7,168,88,42,321,38.2,0.787,40
1,8,110,76,0,0,27.8,0.237,58
2,7,147,76,0,0,39.4,0.257,43
3,2,100,66,20,90,32.9,0.867,28
4,4,129,86,20,270,35.1,0.231,23
...,...,...,...,...,...,...,...,...
571,2,111,60,0,0,26.2,0.343,23
572,7,187,68,39,304,37.7,0.254,41
573,2,122,60,18,106,29.8,0.717,22
574,4,154,72,29,126,31.3,0.338,37


In [7]:
dfy=pd.read_csv('Diabetes_YTrain.csv')

In [8]:
data=dfx.join(dfy)

In [9]:
# Checking for none values
# Zero represents Diabetic and 1 represent Non- Diabetic
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               576 non-null    int64  
 1   Glucose                   576 non-null    int64  
 2   BloodPressure             576 non-null    int64  
 3   SkinThickness             576 non-null    int64  
 4   Insulin                   576 non-null    int64  
 5   BMI                       576 non-null    float64
 6   DiabetesPedigreeFunction  576 non-null    float64
 7   Age                       576 non-null    int64  
 8   Outcome                   576 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 40.6 KB


In [10]:
frequency=data.Outcome.value_counts()
frequency

0    375
1    201
Name: Outcome, dtype: int64

In [11]:
dfx

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,7,168,88,42,321,38.2,0.787,40
1,8,110,76,0,0,27.8,0.237,58
2,7,147,76,0,0,39.4,0.257,43
3,2,100,66,20,90,32.9,0.867,28
4,4,129,86,20,270,35.1,0.231,23
...,...,...,...,...,...,...,...,...
571,2,111,60,0,0,26.2,0.343,23
572,7,187,68,39,304,37.7,0.254,41
573,2,122,60,18,106,29.8,0.717,22
574,4,154,72,29,126,31.3,0.338,37


In [12]:
dfx[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = dfx[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)


In [13]:
for i in dfx.columns:
    index_val=dfx[dfx[i].isnull()==True][i].index
    dfx[i].loc[index_val]=np.mean(dfx[i])

In [14]:
dfx.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [15]:
scalar=StandardScaler()
dfx=scalar.fit_transform(dfx)

### Bar Plot For Number of Diabetic and Non-Diabetic Patients

In [None]:
plt.figure(figsize=(7,5))
plt.bar(x=frequency.index,height=frequency,width=0.4,tick_label=['Diabetic','Non-Diabetic'],ec='darkred',
        color='pink')
plt.xlabel('Categories',fontsize=14)
plt.ylabel('Frequency',fontsize=14)
plt.title('Counts of Diabetic and Non-Diabetic',fontsize=16)
plt.show()

### Visualising Different Age Groups

In [None]:
plt.figure(figsize=(10,6))
plt.hist(data.Age,bins=15,ec='black',color='cyan')
plt.xlabel('Different Age Groups',fontsize=14)
plt.ylabel('Frequency',fontsize=14)
plt.title('Number of Patient or Non-Patient in Different Age Groups',fontsize=16)
plt.xticks(ticks=np.arange(20,90,10))
plt.show()

In [None]:
non_diabetic_age_groups=data[data.Outcome==1].Age
diabetic_age_groups=data[data.Outcome==0].Age

In [None]:
plt.figure(figsize=(10,6))
plt.hist(non_diabetic_age_groups,bins=10,ec='black',color='cyan',align='left',label='Non-Diabetic')
plt.hist(diabetic_age_groups,bins=10,ec='black',color='navy',alpha=0.3,align='left',label='Diabetic')
plt.xlabel('Different Age Groups',fontsize=14)
plt.ylabel('Frequency',fontsize=14)
plt.title('Number of Patient in Different Age Groups',fontsize=16)
plt.xticks(ticks=np.arange(20,90,10))
plt.legend()
plt.show()
# Conclusion
# Most of the Diabetic patients are below the age of 45.

### Checking The Relationship Between Different Features

In [None]:
start=t.time()
plt.figure(figsize=(10,6))
sns.pairplot(data,kind='reg',hue='Outcome')
plt.show()
end=t.time()
print(end-start)

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(data.corr(),annot=True,annot_kws={'size':11},cmap='ocean',fmt='0.2f')
plt.show()

### Visualising The Scatter Plot of  Insulin and SkinThickness

In [None]:
sns.jointplot(x='SkinThickness',y='Insulin',data=data,joint_kws={'color':'darkgreen'})
plt.show()

In [None]:
data.describe()

In [None]:
sns.kdeplot(data.Insulin)
plt.show()
print(data.Insulin.skew())

In [None]:
skew_list=[]
for col in data.columns:
    item=np.around(data[col].skew(),3)
    skew_list.append(item)

In [None]:
skew_data=pd.DataFrame(skew_list,index=data.columns,columns=['Skew value'])

In [None]:
skew_data

## Creating Numpy Arrays From DataFrame

In [None]:
dfx=dfx.values

In [None]:
dfx.shape

## Trying to Rescale The Data

In [16]:
dfy=dfy.values.reshape((-1,))

## Creating Euclidean Distance Function 

In [17]:
def distance(x1,x2):    # Here x1 and x2 are vectors
    return np.sqrt(sum((x1-x2)**2))

## Creating KNN Model Function

In [143]:
def knn(x_vals,y_vals,query_data,k=24):
    vals=[]
    for i in range(x_vals.shape[0]):
        d=distance(query_data,x_vals[i])
        vals.append((d,y_vals[i]))
    vals=sorted(vals)
    vals=vals[:k]
    vals=np.array(vals)
    vals=np.unique(vals[:,1],return_counts=True)
    index=np.argmax(vals[1])
    prediction=vals[0][index]
    return int(prediction)

In [144]:
## Testing of the knn algorithm using unknown data point.

In [145]:
def classify_diabetic(test_data):
    result=[]
    for i in range(test_data.shape[0]):
        value=knn(dfx,dfy,test_data[i])
        result.append(value)
        
    return np.array(result)

In [146]:
# result=classify_diabetic(dfx[556:])
# (result==dfy[556:]).sum()

## Getting the Test Data

In [147]:
test=pd.read_csv('Diabetes_Xtest.csv')

In [148]:
test.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [139]:
test=scalar.fit_transform(test)

In [140]:
output=classify_diabetic(test)

In [141]:
output_df=pd.DataFrame(output)
output_df

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
...,...
187,1
188,0
189,1
190,0


In [142]:
output_df.to_csv('new_submission12.csv',header=['Outcome'],index=False)