In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 

import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the session, including any diagnostics that
# scikit-learn or pandas would emit in later cells. Consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Location of the diabetes CSV, given relative to the working directory.
file = "../input/diabetes/diabetes.csv"

# Read the CSV into a DataFrame and preview its first four rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=4)

### Checking for missing values.

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

### Description of the entire DataFrame.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

### Checking how many people there are with diabetes.

In [None]:
# Number of rows for each distinct value of the Outcome column, shown as a one-column frame.
df.Outcome.value_counts().to_frame()

### Check which features correlate most strongly with the outcome.

In [None]:
# Correlation of every column with the last column of the frame, sorted from
# highest to lowest. The first entry after sorting is dropped: the last
# column's correlation with itself is 1.0, so it sorts to the top.
cor = (
    df.corr()
    .iloc[:, -1]
    .sort_values(ascending=False)
    .iloc[1:]
    .to_frame()
)

# Draw the single-column correlation table as an annotated heatmap.
sns.set(font_scale=1.6)
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=cor,
    cmap="jet",
    center=0.1,
    annot=True,
    vmax=.5,
    linewidths=0.1,
    annot_kws={"size": 16},
)
plt.show()

### Splitting the data set into independent variables (features) and the dependent variable (target).

In [None]:
# Independent variables (features): every column except the last one.
X = df.iloc[:, :-1]

# Dependent variable (target): the Outcome column.
y = df["Outcome"]

### Splitting the data into a training set and a test set.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=321,
)

### Creating an estimator model and training it.

In [None]:
# Fit a logistic-regression classifier on the training split.
# NOTE(review): warnings are globally suppressed in the first cell, so a
# convergence warning here would be silent. Confirm that 125 iterations suffice.
model = LogisticRegression(max_iter=125).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
score = model.score(X_test, y_test)
print(f"The model correctly classifies with {score*100:.2f}% accuracy.")

# Confusion matrix of the test predictions: rows are the true classes,
# columns are the predicted classes.
y_pred = model.predict(X_test)
cm = pd.DataFrame(confusion_matrix(y_test, y_pred))
cm

In [None]:
# Read the four cells of the confusion matrix by (true class, predicted class).
# Row 0 holds the truly healthy people, row 1 the truly sick ones.
healthy_as_healthy = cm.loc[0, 0]
healthy_as_sick = cm.loc[0, 1]
sick_as_healthy = cm.loc[1, 0]
sick_as_sick = cm.loc[1, 1]

# Row totals: how many test samples belong to each true class.
healthy_total = cm.loc[0].sum()
sick_total = cm.loc[1].sum()

report = (
    "The model predicts disease with the following accuracy:\n\n"
    "Healthy people - classified as\n\t"
    f"No disease = {healthy_as_healthy}/{healthy_total}\n\t"
    f"Disease = {healthy_as_sick}/{healthy_total}\n\n"
    "Sick people - diabetic - classified as\n\t"
    f"No disease = {sick_as_healthy}/{sick_total}\n\t"
    f"Disease = {sick_as_sick}/{sick_total}"
)
print(report)

### Use the model to predict whether a given person is likely to have diabetes.

In [None]:
# Pregnancies: Number of times pregnant
PREGNANCIES = int(df.Pregnancies.mean())

# Plasma glucose concentration over 2 hours in an oral glucose tolerance test 
GLUCOSE = int(df.Glucose.mean())

# Diastolic blood pressure (mm Hg)
BLOODPRESSURE = int(df.BloodPressure.mean())

# Triceps skin fold thickness (mm)
SKINTHICKNESS = int(df.SkinThickness.mean())

# 2-Hour serum insulin (mu U/ml)
INSULIN = int(df.Insulin.mean())

# Body mass index (weight in kg/(height in m)2)
BMI = df.BMI.mean()

# DiabetesPedigreeFunction: Diabetes pedigree function (a function which scores likelihood of diabetes based on family history)
DIABETESPEDIGREEFUNCTION = df.DiabetesPedigreeFunction.mean()

# Age (years)
AGE = int(df.Age.mean())

In [None]:
# Feature values for the single person to classify.
# The keys must match the training column names exactly (case-sensitive).
# The model was fitted on a DataFrame, so scikit-learn checks the feature
# names at prediction time and rejects a frame whose columns differ.
sample = {
    'Pregnancies': PREGNANCIES,
    'Glucose': GLUCOSE,
    'BloodPressure': BLOODPRESSURE,
    'SkinThickness': SKINTHICKNESS,
    'Insulin': INSULIN,
    'BMI': BMI,
    'DiabetesPedigreeFunction': DIABETESPEDIGREEFUNCTION,
    'Age': AGE,
}

In [None]:
if None not in sample.values():
    # Build a one-row frame whose columns are the sample's keys.
    trial = pd.DataFrame.from_dict(sample, orient="index").T

    # Predicted class label for the sample.
    predicted_class = model.predict(trial)[0]
    print(f"Outcome: {predicted_class} - (0 if non-diabetic, 1 if diabetic)")

    # Class probabilities for the single row: index 0 is "no disease", index 1 is "disease".
    probabilities = model.predict_proba(trial)[0]
    print(f"The probability of no disease {probabilities[0]*100:.2f}%\nThe probability of disease {probabilities[1]*100:.2f}%")
else:
    print("Please do not leave any variable with the 'None' value.")