# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno
from imblearn .over_sampling import SMOTE

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix , f1_score


In [None]:
# Read the Pima Indians diabetes CSV into a DataFrame and preview its first rows.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
diabetes_df = pd.read_csv(csv_path)
diabetes_df.head()

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
print(diabetes_df.shape)

## Get statistical insights and detailed information about our dataset

In [None]:
# Summarise each column: name, non-null count and dtype.
diabetes_df.info()

In [None]:
# Descriptive statistics, transposed so each feature is a row and each statistic a column.
diabetes_df.describe().T

*Note: The minimum value of "Glucose, BloodPressure, SkinThickness, Insulin and BMI" is 0,
which is not a plausible value for such features.*
#### So we should deal with it by replacing the 0 values with null

## Replacing the illogical value (0) with null

In [None]:

# A zero in these columns stands for a missing measurement, so mark it as NaN.
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on a .loc[...] selection mutates a temporary object and is not guaranteed to
# update diabetes_df (it never does under pandas Copy-on-Write).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
diabetes_df[zero_as_missing_cols] = diabetes_df[zero_as_missing_cols].replace(0, np.nan)



## Check the null values

In [None]:
# Count the missing (NaN) entries in each column.
diabetes_df.isnull().sum()

In [None]:
# Count how many rows share each distinct combination of missing / non-missing columns.
diabetes_df.isnull().value_counts()

In [None]:
# Nullity matrix: one row per record, white gaps where a value is missing.
# missingno creates its own figure, so the size is passed through its figsize
# argument; a preceding plt.figure(...) call only leaves an extra, empty figure
# behind. (10, 5) matches the size used for the heatmap of missing values.
msno.matrix(diabetes_df, figsize=(10, 5))
plt.show()

In [None]:
# Heatmap of the boolean missing-value mask: one row per record, one column per feature.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(diabetes_df.isna(), cmap='Blues', ax=ax)
plt.show()

## Dealing with the null values

In [None]:
# Fill the missing values of each column with that column's mean.
# The filled columns are assigned back to the frame: fillna(..., inplace=True)
# on a .loc[...] selection mutates a temporary object and is not guaranteed to
# update diabetes_df (it never does under pandas Copy-on-Write).
imputed_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
# mean() skips NaN by default, so each mean is taken over the observed values only.
column_means = diabetes_df[imputed_cols].mean()
# Passing a Series to fillna fills each column with the value under its own label.
diabetes_df[imputed_cols] = diabetes_df[imputed_cols].fillna(column_means)

In [None]:
# Display the frame to inspect the values after filling.
# NOTE(review): consider diabetes_df.head() so the whole table is not rendered.
diabetes_df

# Check for dataset imbalance

In [None]:
# Number of rows for each value of the Outcome column.
diabetes_df['Outcome'].value_counts()

> Note: Our dataset is imbalanced, as the number of diabetic cases is roughly half the number of non-diabetic cases.

## Visualizing the count of class_label

In [None]:
# Bar chart of the number of rows per Outcome value.
sns.countplot(data = diabetes_df , x='Outcome')

In [None]:
# Share of each class as a pie chart.
# Plotly renders its own figure, so no matplotlib figure is created here
# (plt.figure would only leave an empty figure in the output).
# `col` holds the display names indexed by class label (0 -> non diabetics,
# 1 -> diabetics). Each slice name is looked up from the label of its count
# instead of relying on the frequency ordering returned by value_counts().
col = ['non diabetics','diabetics']
outcome_counts = diabetes_df['Outcome'].value_counts().sort_index()
# Only the two aggregated counts are plotted, so they are passed as plain
# arrays rather than alongside the full, differently sized data frame.
px.pie(values=outcome_counts.to_numpy(),
       names=[col[label] for label in outcome_counts.index],
       color_discrete_sequence=px.colors.sequential.RdBu)

## Dealing with the imbalanced dataset

# Get the features and class label

In [None]:
# Separate the class label (y) from the remaining feature columns (X).
target_column = 'Outcome'
y = diabetes_df[target_column]
X = diabetes_df.drop(columns=[target_column])


# Preprocessing Our Dataset Using StandardScaler 

In [None]:
# Fit a StandardScaler on X and return the scaled feature values.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling; consider fitting it on
# the training split only.
X_scaler = StandardScaler().fit_transform(X)

# Splitting the dataset into training (80%) and testing (20%) sets

In [None]:
# Split the unscaled features first, then fit the scaler on the training part
# only and apply it to both parts. Splitting the pre-scaled X_scaler instead
# would leak test-set statistics into training, because that scaler was fitted
# on the full dataset.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.20
)
feature_scaler = StandardScaler().fit(X_train_raw)
X_train = feature_scaler.transform(X_train_raw)
X_test = feature_scaler.transform(X_test_raw)

# Report the resulting partition sizes.
print(f'The shape of X_train: {X_train.shape}')
print(f'The size of X_train: {X_train.shape[0]}')
print(f'The shape of X_test: {X_test.shape}')
print(f'The size of X_test: {X_test.shape[0]}')

# Training a model on our dataset

## KNN model

In [None]:
# Build a 10-nearest-neighbours classifier, fit it on the training split,
# and show its accuracy on that same training data.
knn_clf = KNeighborsClassifier(n_neighbors=10)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_train, y_train)

In [None]:
# Predict the held-out test split and print the resulting accuracy.
pred = knn_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

### Note: our dataset is imbalanced, so the F1 score is a better performance metric than accuracy

In [None]:
# Per-class precision, recall and F1 score on the test split, as a dictionary.
# The display names are listed in sorted label order (0, 1) and passed to the
# report so its entries are keyed by readable class names; previously the list
# was left empty and never used.
target_name = ['non diabetics', 'diabetics']
classification_report(y_true=y_test, y_pred=pred, target_names=target_name, output_dict=True)
