## Import Python Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence only forward-compatibility noise from the plotting/data libraries.
# A blanket filterwarnings('ignore') would also hide actionable warnings
# (e.g. a solver reporting that it did not converge), so other categories
# are left visible on purpose.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

### Import Dataset

In [None]:
# Read the diabetes CSV from the Kaggle input directory and preview the first rows.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Number of NaN entries in each column
df.isnull().sum()

In [None]:
# Show the dtype pandas inferred for each column when reading the CSV
df.dtypes

## Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap='Pastel1', ax=ax)
ax.set_title('Correlation of Diabetes', fontsize=20);

In [None]:
# List the DataFrame's column labels
df.columns

In [None]:
# Glucose against age, one black marker per row.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='Age', y='Glucose', data=df, color='Black', ax=ax)


In [None]:
# Spread of blood pressure for each distinct age value.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(x='Age', y='BloodPressure', data=df, ax=ax)

In [None]:
# Bar per distinct age value showing seaborn's aggregate of `Pregnancies`.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='Age', y='Pregnancies', data=df, ax=ax)

In [None]:
# Relationship between two continuous measurements.
# A box plot treats every distinct Glucose reading as its own category,
# producing one (mostly degenerate) box per value; a scatter plot shows the
# same data without that artificial grouping.
fig, ax = plt.subplots(figsize=(20, 20))
sns.scatterplot(data=df, x='Glucose', y='Insulin', ax=ax)
ax.set_title('Insulin vs. Glucose');

In [None]:
# Class balance of the target column.
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(figsize=...)` only leaves an empty extra figure
# behind. The size is therefore set through `height` / `aspect` instead
# (10 x 10 inches), and `discrete=True` centres one bar on each label value.
sns.displot(data=df, x='Outcome', discrete=True, height=10, aspect=1);

### Modelling

#### Split the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target label.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# 80/20 hold-out split.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same partitions (and therefore the same metrics below); stratify=y keeps the
# proportion of each Outcome class the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
len(X_train), len(X_test), len(y_train), len(y_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report


# Raise the iteration cap above the default of 100 so the solver has room to
# converge on the unscaled feature columns.
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)
y_preds = lr.predict(X_test)

# Hold-out metrics (precision/recall/F1 are for the positive class, label 1).
print(f" Score : {lr.score(X_test, y_test)}")
print("*"*20)
print(f" Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f" Precision_score : {precision_score(y_test, y_preds)}")
print(f" Recall_score : {recall_score(y_test, y_preds)}")
print(f" F1_score  : {f1_score(y_test, y_preds)}")
print("*" * 20)
# Start the report on its own line; otherwise the label pushes the header row
# out of alignment with the table body.
print(f" Classification_report :\n{classification_report(y_test, y_preds)}")

# fmt='d' prints the cell counts as plain integers (the default '.2g' switches
# to scientific notation for counts of 100 or more).
sns.heatmap(confusion_matrix(y_test, y_preds),
            annot=True,
            fmt='d')
plt.title("Confusion Matrix");

### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library defaults; random_state fixes the tree building.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_preds = rf.predict(X_test)

# Hold-out metrics (precision/recall/F1 are for the positive class, label 1).
print(f" Score : {rf.score(X_test, y_test)}")
print("*"*20)
print(f" Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f" Precision_score : {precision_score(y_test, y_preds)}")
print(f" Recall_score : {recall_score(y_test, y_preds)}")
print(f" F1_score  : {f1_score(y_test, y_preds)}")
print("*" * 20)
# Start the report on its own line; otherwise the label pushes the header row
# out of alignment with the table body.
print(f" Classification_report :\n{classification_report(y_test, y_preds)}")

# fmt='d' prints the cell counts as plain integers (the default '.2g' switches
# to scientific notation for counts of 100 or more).
sns.heatmap(confusion_matrix(y_test, y_preds),
            annot=True,
            fmt='d')
plt.title("Confusion Matrix");

### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults; random_state fixes the fit.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_preds = gb.predict(X_test)

# Hold-out metrics (precision/recall/F1 are for the positive class, label 1).
print(f" Score : {gb.score(X_test, y_test)}")
print("*"*20)
print(f" Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f" Precision_score : {precision_score(y_test, y_preds)}")
print(f" Recall_score : {recall_score(y_test, y_preds)}")
print(f" F1_score  : {f1_score(y_test, y_preds)}")
print("*" * 20)
# Start the report on its own line; otherwise the label pushes the header row
# out of alignment with the table body.
print(f" Classification_report :\n{classification_report(y_test, y_preds)}")

# fmt='d' prints the cell counts as plain integers (the default '.2g' switches
# to scientific notation for counts of 100 or more).
sns.heatmap(confusion_matrix(y_test, y_preds),
            annot=True,
            fmt='d')
plt.title("Confusion Matrix");

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Kept from the original cell: seeds NumPy's global RNG. KNeighborsClassifier
# takes no random_state, so the classifier itself does not depend on it.
np.random.seed(42)
# k-NN ranks neighbours by distance, so columns with large numeric ranges would
# dominate the vote on raw data. Standardising inside a pipeline puts every
# feature on the same scale and fits the scaler on the training split only.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn.fit(X_train, y_train)
y_preds = knn.predict(X_test)

# Hold-out metrics (precision/recall/F1 are for the positive class, label 1).
print(f" Score : {knn.score(X_test, y_test)}")
print("*"*20)
print(f" Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f" Precision_score : {precision_score(y_test, y_preds)}")
print(f" Recall_score : {recall_score(y_test, y_preds)}")
print(f" F1_score  : {f1_score(y_test, y_preds)}")
print("*" * 20)
# Start the report on its own line; otherwise the label pushes the header row
# out of alignment with the table body.
print(f" Classification_report :\n{classification_report(y_test, y_preds)}")

# fmt='d' prints the cell counts as plain integers (the default '.2g' switches
# to scientific notation for counts of 100 or more).
sns.heatmap(confusion_matrix(y_test, y_preds),
            annot=True,
            fmt='d')
plt.title("Confusion Matrix");

### LinearSVC

In [None]:
from sklearn.svm import LinearSVC

# Solve the primal problem (dual=False): scikit-learn recommends it when there
# are more samples than features, and it avoids the dual solver running out of
# iterations on the unscaled feature columns.
svc = LinearSVC(random_state=42, dual=False)
svc.fit(X_train, y_train)
y_preds = svc.predict(X_test)

# Hold-out metrics (precision/recall/F1 are for the positive class, label 1).
print(f" Score : {svc.score(X_test, y_test)}")
print("*"*20)
print(f" Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f" Precision_score : {precision_score(y_test, y_preds)}")
print(f" Recall_score : {recall_score(y_test, y_preds)}")
print(f" F1_score  : {f1_score(y_test, y_preds)}")
print("*" * 20)
# Start the report on its own line; otherwise the label pushes the header row
# out of alignment with the table body.
print(f" Classification_report :\n{classification_report(y_test, y_preds)}")

# fmt='d' prints the cell counts as plain integers (the default '.2g' switches
# to scientific notation for counts of 100 or more).
sns.heatmap(confusion_matrix(y_test, y_preds),
            annot=True,
            fmt='d')
plt.title("Confusion Matrix");

In [None]:
# Re-generate the test-set predictions with the logistic regression model,
# which the author noted as the most accurate of the models tried above.
# NOTE(review): that ranking comes from a previous run — confirm it against
# the scores printed by the model cells before relying on it.
y_preds = lr.predict(X_test)
y_preds