In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About Dataset
## Context

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is **to diagnostically predict whether or not a patient has diabetes**, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

## Content

The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

## Variable Description

- Pregnancies: Number of times pregnant
- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
- BloodPressure: Diastolic blood pressure (mm Hg)
- SkinThickness: Triceps skin fold thickness (mm)
- Insulin: 2-Hour serum insulin (mu U/ml)
- BMI: Body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction: Diabetes pedigree function
- Age: Age (years)
- Outcome: Class variable (0 or 1) 268 of 768 are 1, the others are 0

source: https://www.kaggle.com/sercanyesiloz

overview data: https://www.kaggle.com/uciml/pima-indians-diabetes-database

# Import the libraries

example:

```python
import pandas as pd
import numpy as np
```

In [None]:
# your code here

import pandas as pd
import numpy as np


# Exploratory Data Analysis

## 1. Load/Import the dataset

In [None]:
# Read the diabetes CSV from the Kaggle input directory into the DataFrame `df`

df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

## 2a. Check the data preview and datatype

In [None]:
# Print the column dtypes and non-null counts (the "datatype" part of this step),
# then display the first five rows as the preview.

df.info()
df.head()


## 2b. Check the length (number of rows and number of columns)

In [None]:
# Display the DataFrame dimensions as (number of rows, number of columns)


df.shape

## 3. Calculate the Basic Statistic of data


basic statistic including: mean, median, std. dev, etc.

hint: you can use method: .describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column


df.describe()

## 4. Check the distribution of data for every numerical features

In [None]:
# Draw one histogram per numeric column of the DataFrame


df.hist()

# A single feature can be inspected more closely with e.g. df["Glucose"].hist()

## 5. Handle Missing Values (Null Values) or value = 0 for every numerical features

### 5a. Identify the missing values (null values)

In [None]:
def convert_to_nan(x):
    """Return ``None`` when ``x`` equals 0; otherwise return ``x`` unchanged."""
    return None if x == 0 else x

In [None]:
# Zeros in these measurement columns are treated as missing values:
# convert each 0 to a null so it is picked up by isnull().
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols:
    # The helper is passed directly; no lambda wrapper is needed.
    df[col] = df[col].apply(convert_to_nan)

# Number of missing values in each column
df.isnull().sum()


### 5b. Fix the missing values

In [None]:
# Impute the missing values identified in the previous step


# Fill each column's NaNs with that column's median (note: median, not mean)
median_values = df.median()
df = df.fillna(median_values)

## 6. Handle Outlier (anomaly data)

### 6a. Identify the outliers (anomaly data)

In [None]:
# Summary statistics for Insulin; the 25% and 75% rows are the quartiles
# needed for the IQR-based outlier check.


df["Insulin"].describe()


In [None]:
# IQR check for Insulin.
# The quartiles are computed from the data instead of being copied by hand
# from an earlier describe() output, so they stay correct if the data or
# the imputation step changes.
q1 = df["Insulin"].quantile(0.25)
q3 = df["Insulin"].quantile(0.75)
iqr = q3 - q1

# Values outside [lower_th, upper_th] are considered outliers (1.5 * IQR rule)
lower_th = q1 - (1.5 * iqr)
upper_th = q3 + (1.5 * iqr)

print("Q1 :", q1)
print("Q3 :", q3)
print("IQR :", iqr)
print("Lower threshold :", lower_th)
print("Upper threshold :", upper_th)

### 6b. Visualize the outlier by using Boxplot chart

In [None]:
# Box plot of the Insulin column to visualise its outliers

df.boxplot(column="Insulin")

### 6c. Fix the outliers

In [None]:
# your code here

def is_outlier(x, lower_th, upper_th):
    """Return True when ``x`` lies strictly below ``lower_th`` or strictly above ``upper_th``.

    Values equal to either threshold are not outliers.
    """
    return bool(x < lower_th or x > upper_th)

# Replace outliers in each measurement column using the 1.5 * IQR rule.
# Zeros were already converted and imputed in step 5, so no further
# zero-to-NaN conversion is done here.
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols:
    # Ask for the quartiles explicitly. Indexing describe() by position is
    # fragile: position 5 is the median (50%), not the third quartile (75%).
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    lower_th = q1 - (1.5 * iqr)
    upper_th = q3 + (1.5 * iqr)

    # The replacement value must come from the column being fixed,
    # not from Insulin for every column.
    mean_value = df[col].mean()

    # Same rule as is_outlier(): strictly below the lower threshold or
    # strictly above the upper threshold.
    outlier_mask = (df[col] < lower_th) | (df[col] > upper_th)
    df[col] = df[col].mask(outlier_mask, mean_value)


## 7. Create a new feature/column

## 7a. Create a classification of BMI 

(column_name = `bmi_class`)
Here's the rules:
1. Class 1 - Underweight (value= 1): BMI < 18.5
2. Class 2 - Ideal (value= 2): 18.5 <= BMI <= 24.9
3. Class 3 - Overweight (value= 3): 25.0 <= BMI <= 29.9
4. Class 4 - Obese (value = 4): BMI >= 30.0

In [None]:
# your code here

def get_bmi_class(x):
    """Map a BMI value to its class code.

    Returns:
        1 - Underweight: BMI < 18.5
        2 - Ideal:       18.5 <= BMI < 25.0
        3 - Overweight:  25.0 <= BMI < 30.0
        4 - Obese:       BMI >= 30.0

    Half-open intervals are used so that values falling between the
    documented bounds (e.g. 24.95 or 29.95) are still assigned to the
    lower adjacent class instead of slipping through.
    """
    if x < 18.5:
        return 1
    elif x < 25.0:
        return 2
    elif x < 30.0:
        return 3
    else:
        return 4
    

# Classify every row's BMI; the function is passed directly to apply()
df['bmi_class'] = df['BMI'].apply(get_bmi_class)

## 7b. Create a working age indicator
column_name = `is_working_age`

Here's the conditions:
1. Class 1 - non-productive population (value= 0): age <= 15 or age >= 65
2. Class 2 - productive population (value= 1): 15 < age < 65

In [None]:
# your code here

def get_working_age(age):
    """Return 0 when ``age <= 15`` or ``age >= 65`` (non-productive), otherwise 1."""
    non_productive = (age <= 15) or (age >= 65)
    return 0 if non_productive else 1
    
    
# Flag every row by age; the function is passed directly to apply()
df['is_working_age'] = df['Age'].apply(get_working_age)


### 7c. Create your own new feature/columns

In [None]:
# your code here




## 8. Get an Insight from the data by answering these questions

### 8a. Question 1
Which `bmi_class` has the largest diabetic population (`Outcome` = 1)?

In [None]:
# Outcome is 0/1, so its sum per group is the number of diabetic patients.
# The column is selected before aggregating so only Outcome is summed.
df.groupby('bmi_class')['Outcome'].sum()

### 8b. Question 2
What is the **average of the age** based on the `bmi_class`?

hint: use the group by, cross table, or pivot table (choose one)

In [None]:
# Average age within each BMI class.
# The column is selected before aggregating so only Age is averaged.
df.groupby('bmi_class')['Age'].mean()

# 9. Create a Machine Learning model

Objective: to predict the `Outcome` (whether the patient has diabetes or not)

## 9a. Split Dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state fixes the shuffle so the split, and every metric computed
# from it, is identical on each "Restart & Run All".
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Columns used as model inputs, listed once and shared by both splits
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'BMI', 'DiabetesPedigreeFunction', 'Age', 'bmi_class',
                'is_working_age']

# Feature matrices as NumPy arrays
x_train = df_train[feature_cols].values
x_test = df_test[feature_cols].values

In [None]:
# Target vectors (the Outcome column) as NumPy arrays for both splits
y_train, y_test = df_train['Outcome'].values, df_test['Outcome'].values

## 9b. Train your binary classification model

Note: You can train more than 1 model

In [None]:
# Binary classifier for the Outcome target

from sklearn.linear_model import LogisticRegression

# Create an unfitted logistic regression model that uses the liblinear solver
model_lr = LogisticRegression(solver='liblinear')


In [None]:
# Fit the logistic regression model on the training features and labels
model_lr.fit(x_train, y_train)


## 9c. Evaluate your models

In [None]:
# Evaluate the logistic regression model on the held-out test set

from sklearn.metrics import confusion_matrix

y_predict = model_lr.predict(x_test)
# assign() returns a new frame instead of writing into the slice produced by
# the split, which avoids a possible SettingWithCopyWarning.
df_test = df_test.assign(y_predict=y_predict)

# labels=[0, 1] pins the matrix to 2x2, so ravel() always yields
# tn, fp, fn, tp even if a class is absent from the test split.
tn, fp, fn, tp = confusion_matrix(y_test, y_predict, labels=[0, 1]).ravel()

# Metrics (each falls back to 0.0 when its denominator is zero):
#   precision = TP / (TP + FP)  -> of the predicted positives, how many are correct
#   recall    = TP / (TP + FN)  -> of the actual positives, how many were found
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

In [None]:
# Preview the first ten test rows, including the y_predict column
df_test.head(10)

In [None]:
# Print each metric next to its label, one per line
for label, value in (
    ("Accuracy: \t", accuracy),
    ("Precision: \t", precision),
    ("Recall: \t", recall),
    ("F1: \t\t", f1),
):
    print(label, value)

# Congratulations! You've completed the learning!

Please answer the quiz + give us the feedback about overall program through this link: https://forms.gle/NYhwmp4ACXsLXpmS7