**Data Collecting**

**Importing required libraries**

In [None]:
import numpy as np
import pandas as pd

**Loading diabetes.csv into a DataFrame**

In [None]:
# Read the Pima Indians diabetes CSV into a DataFrame and preview the first rows.
csv_path = "../input/pima-indians-diabetes-database/diabetes.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Summary statistics for each numeric column of the raw dataset.
dataset.describe()

In [None]:
# Column dtypes and non-null counts, used to check for missing values.
dataset.info()

**no null values in dataset**

In [None]:
# Count, per column, how many entries are exactly 0.
dataset.isin([0]).sum()

**Glucose, BloodPressure, SkinThickness, Insulin and BMI contain 0, which is not an appropriate value for these measurements; replace those zeros with NaN.**

**Data Cleaning**

In [None]:
# Columns in which a recorded 0 is treated as a missing measurement.
zero_as_missing_columns = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'Insulin',
]

# Replace 0 with NaN in those columns only; every other column is untouched.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
dataset_nan = dataset.replace(
    {column: 0 for column in zero_as_missing_columns},
    np.nan,
)

**Replace NaN with the mean of each column**

In [None]:
# Impute every NaN with the mean of its own column.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
column_means = dataset_nan.mean()
dataset_nnan = dataset_nan.fillna(value=column_means)

In [None]:
# Count the NaNs left in each column after imputation.
dataset_nnan.isnull().sum()

**saving cleaned dataset**

In [None]:
#dataset_nnan.to_csv("data.csv")

**Train-Test Split**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Pairwise scatter/distribution plots of all features, coloured by Outcome.
outcome_palette = ('#23C552', '#C52219')
sns.pairplot(dataset_nnan, hue='Outcome', palette=outcome_palette)


**Splitting into input and output variables**

In [None]:
from sklearn.model_selection import train_test_split
# Target is the 'Outcome' column; features are every other column.
y = dataset_nnan['Outcome']
x = dataset_nnan.drop(columns=['Outcome'])

**Split into train and test sets**

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=32,
)

In [None]:
# (rows, columns) of the training feature set.
x_train.shape

In [None]:
# Show the first training row (unscaled) for reference.
x_train.head(1)

**Scale the training data and save the fitted scaler for further use**

In [None]:
from sklearn.preprocessing import StandardScaler
import joblib 

# Fit the scaler on the training features only, then standardize them.
sc = StandardScaler().fit(x_train)
x_scaled = sc.transform(x_train)

# Persist the fitted scaler so the same transformation can be re-applied later
# with joblib.load('std_scaler.bin').
joblib.dump(sc, 'std_scaler.bin', compress=True)


In [None]:
# First row of the standardized training features.
x_scaled[:1]

**MODEL BUILDING**

In [None]:
# Reload the scaler that was fitted on the training data and apply it to the
# test features, so both sets share the same standardization.
sc = joblib.load('std_scaler.bin')

# `x_test` is overwritten in place because later cells read it by that name.
# After the first run it is a NumPy array rather than a DataFrame, so this
# guard stops a re-run of the cell from scaling the data a second time.
if isinstance(x_test, pd.DataFrame):
    x_test = sc.transform(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours model with k=4, trained on the scaled features.
classifier = KNeighborsClassifier(n_neighbors=4)
classifier.fit(x_scaled, y_train)
y_pred = classifier.predict(x_test)

# scikit-learn expects (y_true, y_pred); the reverse order transposes the
# matrix and swaps false positives with false negatives.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
# Misclassification rate on the test set for each candidate k.
# The same `k_values` drives both the loop and the x-axis, so the plotted
# x and y always have the same length.
k_values = range(1, 10)
error_rate = []
# Might take some time
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_scaled, y_train)
    pred_i = knn.predict(x_test)
    error_rate.append(np.mean(pred_i != y_test))

# Create the figure once, after the loop, instead of one empty figure per k.
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

**From the figure, the error rate is lowest at k=5.**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# KNN with the k chosen from the error-rate plot.
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_scaled, y_train)
y_pred = classifier.predict(x_test)

# (y_true, y_pred) order: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True)

# Start the model-comparison table with the KNN accuracy.
KNN = accuracy_score(y_test, y_pred)
results = pd.DataFrame(data=[["KNeighborsClassifier", KNN]],
                       columns=['Model', 'Accuracy'])
results

**Support vector classifier**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

# Linear-kernel support vector classifier on the scaled features.
classifier = SVC(kernel='linear', random_state=10)
classifier.fit(x_scaled, y_train)
y_pred2 = classifier.predict(x_test)

cm = confusion_matrix(y_test, y_pred2)
sns.heatmap(cm, annot=True)

Accsvm = accuracy_score(y_test, y_pred2)
results1 = pd.DataFrame(data=[["Support vector classifier", Accsvm]],
                        columns=['Model', 'Accuracy'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results = pd.concat([results, results1], ignore_index=True)
results

**Decision Tree Classifier**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy split criterion.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(x_scaled, y_train)
y_pred = classifier.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True)

AccDT = accuracy_score(y_test, y_pred)
results1 = pd.DataFrame(data=[["Desicion Tree Classifier", AccDT]],
                        columns=['Model', 'Accuracy'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results = pd.concat([results, results1], ignore_index=True)
results

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Random forest of 100 trees. max_features='sqrt' is what the removed 'auto'
# option meant for classifiers ('auto' was dropped in scikit-learn 1.3).
classifier = RandomForestClassifier(
    criterion="gini",
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=100,
    max_features='sqrt',
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
classifier.fit(x_scaled, y_train)
y_pred3 = classifier.predict(x_test)

cm = confusion_matrix(y_test, y_pred3)
sns.heatmap(cm, annot=True)

AccRF = accuracy_score(y_test, y_pred3)
results1 = pd.DataFrame(data=[["Random Classifier", AccRF]],
                        columns=['Model', 'Accuracy'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results = pd.concat([results, results1], ignore_index=True)
results

**XGBoost classifier**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost.sklearn import XGBClassifier

# Gradient-boosted trees with XGBoost's default hyperparameters.
classifier = XGBClassifier()
classifier.fit(x_scaled, y_train)
y_pred4 = classifier.predict(x_test)

cm = confusion_matrix(y_test, y_pred4)
sns.heatmap(cm, annot=True)

AccXGB = accuracy_score(y_test, y_pred4)
results1 = pd.DataFrame(data=[["XGBoost classifier", AccXGB]],
                        columns=['Model', 'Accuracy'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results = pd.concat([results, results1], ignore_index=True)
results

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Logistic regression, kept in its own variable because it is saved to disk later.
classifierLR = LogisticRegression(max_iter=100)
classifierLR.fit(x_scaled, y_train)

# Predict with classifierLR itself. `classifier` still refers to the model
# fitted in the previous cell, so using it here would score that model instead.
y_pred = classifierLR.predict(x_test)

# (y_true, y_pred) order: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True)

LR = accuracy_score(y_test, y_pred)
results1 = pd.DataFrame(data=[["Logistic Regression", LR]],
                        columns=['Model', 'Accuracy'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results = pd.concat([results, results1], ignore_index=True)
results


**ANN Classifier**

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D,MaxPool2D,Flatten
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
# Small fully connected network for binary classification:
# input -> 10 ReLU units -> 7 ReLU units -> 1 sigmoid unit.
model = Sequential()
model.add(Dense(units=10, input_dim=x_train.shape[1],
                kernel_initializer="random_uniform", activation="relu"))
model.add(Dense(units=7, kernel_initializer="random_uniform", activation="relu"))

# A sigmoid output keeps predictions in [0, 1] so they can be thresholded as
# probabilities; a ReLU output is unbounded and can get stuck at exactly 0.
model.add(Dense(units=1, activation="sigmoid"))

# Binary cross-entropy is the matching loss for a sigmoid output on 0/1 labels.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

In [None]:

# Train for 100 epochs and record the loss on the held-out data after each epoch.
# NOTE(review): the test set doubles as validation data here, so it is not
# strictly unseen by the time the model is scored on it.
model2 = model.fit(
    x_scaled,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=2,
    epochs=100,
)

# Training vs. validation loss per epoch.
plt.figure(figsize=(12, 8))
for curve_name in ('loss', 'val_loss'):
    plt.plot(model2.history[curve_name], label=curve_name)
plt.legend()

In [None]:
# Fit again for 100 epochs with a batch size of 16 and no validation data.
# NOTE(review): `model` was already fitted in the previous cell and is not
# re-created here, so this presumably continues training from those weights
# rather than starting fresh; confirm that is intended.
model.fit(x_scaled,y_train,epochs=100,batch_size=16)

In [None]:
# Turn the network's raw outputs into boolean class labels with a 0.4 cut-off.
ann_scores = model.predict(x_test)
y_pred = ann_scores > 0.4

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# The thresholded predictions are a boolean column vector; flatten them to
# 1-D integer labels so they line up with y_test.
ann_labels = np.asarray(y_pred).astype(int).ravel()

# Recompute the confusion matrix from the ANN predictions. Without this the
# heatmap would show whatever `cm` an earlier model's cell left behind.
cm = confusion_matrix(y_test, ann_labels)
sns.heatmap(cm, annot=True)

ANN = accuracy_score(y_test, ann_labels)

In [None]:
# Add the ANN accuracy to the model-comparison table.
results1 = pd.DataFrame(data=[["ANN", ANN]],
                        columns=['Model', 'Accuracy'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results = pd.concat([results, results1], ignore_index=True)
results

In [None]:
# Horizontal bar chart of accuracy per model. Indexing by 'Model' on a
# temporary copy labels each bar with the model name instead of its row number,
# and leaves `results` itself unchanged.
results.set_index('Model')['Accuracy'].plot(kind='barh', figsize=(8, 4))

**From the results table it is clear that Logistic Regression has the highest accuracy.**

***Saving the Logistic Regression model***

In [None]:
# Write the fitted logistic-regression model to disk so it can be reloaded later.
joblib.dump(classifierLR,'DiabetesPredictionModel.joblib')

In [None]:
#joblib.load('DiabetesPredictionModel.joblib')
# Round-trip check: reload the saved model and score it on the scaled test set.
model_path = 'DiabetesPredictionModel.joblib'
loaded_model = joblib.load(model_path)

res = loaded_model.predict(x_test)
ac = accuracy_score(res, y_test)
print(ac)