In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the dataset.

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame,
# then preview its first five rows.
pdata = pd.read_csv(
    filepath_or_buffer="/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
)
pdata.head(n=5)

# Performing EDA using Pandas Profiling.
Package page: [pandas-profiling on PyPI](https://pypi.org/project/pandas-profiling/)

In [None]:
from pandas_profiling import ProfileReport
# Automated EDA with pandas-profiling: build a report over the raw frame and
# render it with the report's widget interface.
# NOTE(review): pandas-profiling has been renamed ydata-profiling upstream — confirm
# the import above still resolves in the target environment.
dataset = pdata  # alias of pdata (same object, not a copy)
profile = ProfileReport(dataset, title='Pandas Profiling Report')
profile.to_widgets()

# Displaying EDA done by pandas profiling in iframe

In [None]:
# Show the profiling report built earlier inside the notebook output as an iframe.
profile.to_notebook_iframe()

# Creating html of EDA report.

In [None]:
# Export the profiling report to "your_report.html"; the path is relative, so the
# file is written to the current working directory.
profile.to_file("your_report.html")

# Performing manual EDA.

In [None]:
# Dimensions of the raw data frame as a (rows, columns) tuple.
pdata.shape

In [None]:
# True if at least one cell in the frame is NaN/None.
# Note: this only detects real nulls; zero values used as placeholders pass unnoticed.
pdata.isna().to_numpy().any()

In [None]:
# Pairwise scatter plots for all columns, with kernel-density curves on the diagonal.
sns.pairplot(data=pdata, diag_kind='kde')

In [None]:
# Correlation matrix of all columns, drawn as an annotated heatmap
# on a single 12x8-inch axes.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
corr = pdata.corr()
sns.heatmap(data=corr, annot=True, cmap="YlGnBu", ax=ax)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
pdata.describe()

In [None]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
pdata.info()

We can see that there are many outliers in the data. The reference ranges below help judge which recorded values are physiologically implausible:
1. **BMI**
    * A BMI of less than 18.5 means that a person is underweight. 
    * A BMI of between 18.5 and 24.9 is ideal. 
    * A BMI of between 25 and 29.9 is overweight. 
    * A BMI over 30 indicates obesity.


2. **Blood Pressure**
As a general guide: 
    * Ideal blood pressure is considered to be between 90/60mmHg and 120/80mmHg. 
    * High blood pressure is considered to be 140/90mmHg or higher. 
    * Low blood pressure is considered to be 90/60mmHg or lower.

3. **Glucose**
For the majority of healthy individuals, normal blood sugar levels are as follows:
    * Between 4.0 and 5.4 mmol/L (72 to 99 mg/dL) when fasting. 
    * Up to 7.8 mmol/L (140 mg/dL) 2 hours after eating.

4. **SkinThickness**
For adults, the standard normal values for triceps skinfolds are (see TableH): 2.5mm (men) or about 20% fat; 
18.0mm (women) or about 30% fat

In [None]:
# Keep only rows whose BMI, blood pressure and glucose readings exceed the
# plausibility thresholds below (presumably dropping zero placeholders — the
# thresholds are well under any realistic measurement).
plausible_rows = (pdata.BMI > 10) & (pdata.BloodPressure > 20) & (pdata.Glucose > 25)

# .copy() makes `df` an independent frame. Later cells assign into it with
# df.loc[...] = ...; doing that on a plain slice of `pdata` raises
# SettingWithCopyWarning and leaves it ambiguous whether the write hits a view
# or a copy.
df = pdata.loc[plausible_rows].copy()
df.head()

In [None]:
# (rows, columns) remaining after the plausibility filter on BMI, blood pressure and glucose.
df.shape

In [None]:
# Replace implausible SkinThickness readings (< 5) with the median of the VALID
# readings for the same Outcome class. The median must exclude the rows being
# replaced; otherwise the placeholder values drag the imputed value downward.
# NOTE(review): imputing per Outcome class before the train/test split uses the
# target to build a feature (label leakage) — confirm this is acceptable.
skin_missing = df.SkinThickness < 5
for outcome in (0, 1):
    in_class = df.Outcome == outcome
    valid_median = df.loc[in_class & ~skin_missing, 'SkinThickness'].median()
    # A class with no valid reading has a NaN median; leave its rows untouched.
    if pd.notna(valid_median):
        # int() keeps the column integer-valued, as in the raw data.
        df.loc[in_class & skin_missing, 'SkinThickness'] = int(valid_median)
df.head()

In [None]:
# Replace zero Insulin readings with the median of the NON-ZERO readings for the
# same Outcome class. The median must exclude the zeros being replaced; they are
# frequent enough to pull a median over all rows toward 0, which would make the
# imputation close to a no-op.
# NOTE(review): imputing per Outcome class before the train/test split uses the
# target to build a feature (label leakage) — confirm this is acceptable.
insulin_missing = df.Insulin == 0
for outcome in (0, 1):
    in_class = df.Outcome == outcome
    valid_median = df.loc[in_class & ~insulin_missing, 'Insulin'].median()
    # A class with no non-zero reading has a NaN median; leave its rows untouched.
    if pd.notna(valid_median):
        # int() keeps the column integer-valued, as in the raw data.
        df.loc[in_class & insulin_missing, 'Insulin'] = int(valid_median)
df.head()

In [None]:
# Class balance of the target column: count and percentage of rows whose
# Outcome compares equal to True and to False.
n_true = int((df['Outcome'] == True).sum())
n_false = int((df['Outcome'] == False).sum())
n_total = n_true + n_false
print("Number of true cases: {0} ({1:2.2f}%)".format(n_true, (n_true / n_total) * 100))
print("Number of false cases: {0} ({1:2.2f}%)".format(n_false, (n_false / n_total) * 100))

In [None]:
from sklearn.model_selection import train_test_split

# Split the frame into the target column and the remaining predictor columns.
Y = df['Outcome']               # class label (1=True, 0=False)
X = df.drop(columns='Outcome')  # every column except the label

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

x_train.head()

In [None]:
from sklearn import metrics

from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier (liblinear solver) on the training split.
model = LogisticRegression(solver="liblinear").fit(x_train, y_train)

# Class predictions for the held-out rows.
y_predict = model.predict(x_test)

# One-row table of the fitted coefficients (columns 0..n-1, in predictor order)
# with the intercept appended as a final column.
coef_df = pd.DataFrame(model.coef_).assign(intercept=model.intercept_)
print(coef_df)

In [None]:
# Accuracy on the held-out test split: the fraction of rows whose predicted class
# matches the true label (the same quantity a classifier's .score() reports).
model_score = metrics.accuracy_score(y_test, model.predict(x_test))
print(model_score)