# Diabetes

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
import warnings
import scipy.stats
warnings.filterwarnings('ignore')

## Editing Data

In [None]:
# Load the diabetes CSV from the Kaggle input directory and preview the first rows.
data = pd.read_csv("/kaggle/input/diabates/diabetes.csv")
data.head()

In [None]:
# Inspect the column dtypes to check that every column was read with the expected type.
data.info()

In [None]:
# Summary statistics for every numeric column.
data.describe()

In [None]:
# Number of rows and columns in the dataset.
# Author's note: of the 768 patients, 500 are not sick (this is not something `shape` itself shows).
data.shape

In [None]:
# Count missing (NaN) values in each column.
data.isna().sum()

In [None]:
# Count zeros in each column. Values like Glucose, BloodPressure etc. cannot be 0, so they have to be corrected.
data.eq(0).sum()  

## Change 0 values (Linear Regression)

In [None]:
#Missing Data Imputation Using Regression


def ImputeZeroValuesWithRegression(dataset):
  """Replace zero entries in selected columns with linear-regression predictions.

  For each of 'Glucose', 'Insulin', 'SkinThickness' and 'BMI', a
  LinearRegression model is fitted on the rows where that column is non-zero,
  using all remaining columns as predictors, and then used to predict a value
  for the rows where the column is 0. Columns are processed in that order, so
  a column imputed earlier is used (with its imputed values) as a predictor
  for the later ones. Zeros in the predictor columns are used as-is.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame containing the four columns above. Every other column is fed to
      the regression, so all columns must be numeric.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in. NOTE: the frame is modified in
      place, so the caller's own variable sees the imputed values too.
  """
  columnsToBeImputed = ['Glucose','Insulin','SkinThickness','BMI']
  for column in columnsToBeImputed:
    is_zero = dataset[column] == 0

    # Nothing to impute, or no valid rows to learn from: leave the column
    # untouched (fitting or predicting on an empty frame would raise).
    if not is_zero.any() or is_zero.all():
      continue

    # Fit only on rows whose target is a real measurement. Including the zero
    # rows would teach the model to reproduce the placeholder value.
    known_rows = dataset.loc[~is_zero]
    lr = LinearRegression()
    lr.fit(known_rows.drop(column, axis=1), known_rows[column])
    y_pred = lr.predict(dataset.loc[is_zero].drop(column, axis=1))

    # Predictions are floats; widen integer columns first so the assignment
    # does not trigger an incompatible-dtype warning or error.
    dataset[column] = dataset[column].astype(float)
    dataset.loc[is_zero, column] = y_pred

  return dataset
df=ImputeZeroValuesWithRegression(dataset=data)

In [None]:
# Display the frame returned by the regression imputation.
df

In [None]:
#The 0 values have been replaced.
#They were filled in with linear-regression predictions.

## Change value (mean)

In [None]:
# Mark any remaining zeros as missing (NaN) so they can be filled in afterwards.
# Pregnancies is deliberately left out: 0 pregnancies is a legitimate value, and
# turning it into NaN would make the next step overwrite it with the column mean.
zero_as_missing = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"]
# np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
data[zero_as_missing] = data[zero_as_missing].replace(0, np.nan)
    

In [None]:
# Fill every missing value with the mean of its own column (modifies `data` in place).
column_means = data.mean()
data.fillna(value=column_means, inplace=True)


In [None]:
# Preview the frame after the missing values were filled with the column means.
data.head()

In [None]:
#The remaining 0 values have been replaced.
#They were filled in with the column averages.

## scatter features

In [None]:
# Count plots of every feature, laid out on a 4x2 grid.
def scatter(ax,axis,name,title):
  """Draw a count plot of one feature on one cell of a subplot grid.

  ax    : 2-D array of matplotlib Axes, as returned by plt.subplots.
  axis  : (row, col) position inside `ax` to draw on.
  name  : column of the global `data` frame to plot.
  title : title to set on that subplot.

  Despite the name, this draws seaborn count plots, not scatter plots.
  """
  target = ax[axis[0], axis[1]]
  # The column must be passed as x=...: recent seaborn releases accept only
  # `data` positionally, so a positional column name clashes with data=.
  sns.countplot(x=name, data=data, ax=target)
  target.set_title(title)

f,ax=plt.subplots(4,2,figsize=(20,15))
plt.suptitle("Scatter plots of features")

# (grid position, column name, subplot title) for each feature.
features = (
    ((0,0),"Pregnancies","Pregnancy chart feature"),
    ((0,1),"Glucose","Glucose chart feature"),
    ((1,0),"BloodPressure","BloodPressure chart feature"),
    ((1,1),"SkinThickness","SkinThickness chart feature"),
    ((2,0),"Insulin","Insulin chart feature"),
    ((2,1),"BMI","BMI chart feature"),
    ((3,0),"DiabetesPedigreeFunction","DiabetesPedigreeFunction chart feature"),
    # Previously written as "Age" "chart feature", which Python joins into "Agechart feature".
    ((3,1),"Age","Age chart feature"),
)

for axis, name, title in features:
  scatter(ax,axis,name,title)


## Order of importance

In [None]:
# Rank the selected features by importance with an extra-trees classifier.
x=data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
       'Insulin', 'DiabetesPedigreeFunction']]
y=data.iloc[:,8]  # target: the column at position 8

# Fixed seed so the importances (and their ranking) are the same on every run.
model = ExtraTreesClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_)
# Bar chart of the importances, largest first, for easier comparison.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(20).plot(kind='bar')
plt.show()

In [None]:
#Glucose appears to be the feature that affects diabetes the most.

## Correlation Analysis

In [None]:
# Pairwise correlation between all columns, as a measure of how they relate to each other.
data.corr()

In [None]:
# Show the correlation of every column with every other column as an annotated heatmap.
sns.heatmap(data.corr(),annot=True)

## Clustering

In [None]:
# Cluster the samples into two groups with k-means and compare the clusters
# with the true labels.
x=data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
       'Insulin', 'DiabetesPedigreeFunction']]
y=data.iloc[:,8]
kmeans = KMeans(n_clusters=2, random_state=0).fit(x)

# Cluster ids are arbitrary: cluster 0 is not guaranteed to correspond to
# class 0. Score both possible cluster->class assignments and keep the better
# one, otherwise a good clustering can be reported as a poor accuracy.
true_labels = y.to_numpy()
direct_matches = int((true_labels == kmeans.labels_).sum())
flipped_matches = int((true_labels == 1 - kmeans.labels_).sum())
count = max(direct_matches, flipped_matches)

print(count)
print("Accuracy: ",(count / len(y)) * 100)

## Classification Task:Logistic Regression

In [None]:
# Feature matrix (seven selected columns) and target (the column at position 8)
# used by the classifiers in the following cells.
x=data[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
       'Insulin', 'DiabetesPedigreeFunction']]
y=data.iloc[:,8]

In [None]:
# Hold out 10% of the rows as the test set; random_state=0 fixes the split.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.10,random_state=0)

In [None]:
# Logistic-regression classifier; max_iter is set to 1000 iterations.
log_reg = LogisticRegression(random_state=1, max_iter=1000)

In [None]:
# Fit the logistic-regression model on the training split.
log_reg.fit(x_train,y_train)

In [None]:
# Predict class labels for the held-out test split.
y_pred=log_reg.predict(x_test)

In [None]:
# Show the logistic-regression predictions for the test split.
y_pred

In [None]:
# score() is evaluated on the held-out test split, so this is test accuracy
# (the message previously called it training accuracy).
print("Logistic Regression Classifier Test Accuracy: ",log_reg.score(x_test,y_test))

## Classification Task:Random Forest

In [None]:
# Random forest with 20 trees and the entropy split criterion, fitted on the training split.
forest=RandomForestClassifier(n_estimators=20,criterion="entropy",random_state=0)
forest.fit(x_train,y_train)

In [None]:
# Predict class labels for the test split with the random forest.
# Note: this reuses (overwrites) the y_pred name from the logistic-regression cells.
y_pred=forest.predict(x_test)


In [None]:
# Show the random-forest predictions for the test split.
y_pred

In [None]:
# score() is evaluated on the held-out test split, so this is test accuracy
# (the message previously called it training accuracy).
print("Random Forest Classifier Test Accuracy: ",forest.score(x_test,y_test))

## Classification Task:KNN Classifier

In [None]:
# k-nearest-neighbours classifier using a single neighbour, fitted on the training split.
KNN=KNeighborsClassifier(n_neighbors=1)
KNN.fit(x_train,y_train)

In [None]:
# Predict class labels for the test split with KNN (overwrites the previous y_pred).
y_pred=KNN.predict(x_test)

In [None]:
# Confusion matrix and per-class precision/recall/F1 for the KNN predictions.
# NOTE(review): this import sits mid-notebook; consider moving it to the Libraries cell.
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
# score() is evaluated on the held-out test split, so this is test accuracy
# (the message previously called it training accuracy).
print("KNN classifier test accuracy: ",KNN.score(x_test,y_test))

## Compare Performance

In [None]:
# Placeholder score lists, one zero per test sample.
# NOTE(review): all three names are reassigned from predict_proba() two cells below,
# so these initial values are never read.
KNN_predict =[0 for _ in range(len(y_test)) ]
log_reg_predict=[0 for _ in range(len(y_test)) ]
forest_predict=[0 for _ in range(len(y_test)) ]

In [None]:
# Refit the three classifiers on the training split, in the order KNN,
# logistic regression, random forest, and keep what fit() returns for each.
model1, model2, model3 = (
    classifier.fit(x_train, y_train) for classifier in (KNN, log_reg, forest)
)

In [None]:
# Class-probability estimates for the test split from each fitted model.
KNN_predict, log_reg_predict, forest_predict = (
    fitted.predict_proba(x_test) for fitted in (model1, model2, model3)
)

In [None]:
# Keep only column 1 of each probability array; these scores feed the ROC computations.
KNN_predict, log_reg_predict, forest_predict = (
    proba[:, 1] for proba in (KNN_predict, log_reg_predict, forest_predict)
)

In [None]:
# Area under the ROC curve for each model on the test split.
KNN_auc, log_reg_auc, forest_auc = (
    roc_auc_score(y_test, scores)
    for scores in (KNN_predict, log_reg_predict, forest_predict)
)

In [None]:
# Report each model's ROC AUC, formatted to three decimal places.
print("KNN:ROC AUC=%.3f"%(KNN_auc))
print("log_reg:ROC AUC=%.3f"%(log_reg_auc))
print("forest:ROC AUC=%.3f"%(forest_auc))

In [None]:
# False-positive and true-positive rates for each model; the thresholds are discarded into `_`.
(KNN_fpr, KNN_tpr, _), (log_reg_fpr, log_reg_tpr, _), (forest_fpr, forest_tpr, _) = (
    roc_curve(y_test, scores)
    for scores in (KNN_predict, log_reg_predict, forest_predict)
)

In [None]:
# Plot the three ROC curves on one figure so the models can be compared directly.
pyplot.plot(KNN_fpr,KNN_tpr,linestyle="--",label="KNN")
pyplot.plot(log_reg_fpr,log_reg_tpr,marker=".",label="log_reg")
pyplot.plot(forest_fpr,forest_tpr,marker=".",label="forest")
# Label the axes and title the figure so it can be read on its own.
pyplot.xlabel("False Positive Rate")
pyplot.ylabel("True Positive Rate")
pyplot.title("ROC curves on the test set")
pyplot.legend()
pyplot.show()

#### Logistic regression has the largest area under the ROC curve, so it is the best-performing of the three methods compared here.
#### I hope you like it. Please upvote the notebook. Thanks :)