In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About Dataset
## Context

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is **to diagnostically predict whether or not a patient has diabetes**, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

## Content

The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

## Variable Description

- Pregnancies: Number of times pregnant
- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
- BloodPressure: Diastolic blood pressure (mm Hg)
- SkinThickness: Triceps skin fold thickness (mm)
- Insulin: 2-Hour serum insulin (mu U/ml)
- BMI: Body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction: Diabetes pedigree function
- Age: Age (years)
- Outcome: Class variable (0 or 1); 268 of 768 are 1, the others are 0

source: https://www.kaggle.com/sercanyesiloz

overview data: https://www.kaggle.com/uciml/pima-indians-diabetes-database

# Import the libraries

example:

```python
import pandas as pd
import numpy as np
```

In [None]:
# your code here
import pandas as pd
import numpy as np



# Exploratory Data Analysis

## 1. Load/Import the dataset

In [None]:
# your code here
# Read the diabetes CSV from the Kaggle input directory into `df`,
# the frame every later cell works on.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
df


## 2a. Check the data preview and datatype

In [None]:
# your code here
# First rows, last rows and a random sample of 5 rows, rendered one after another.
display(df.head(), df.tail(), df.sample(5))

# Per-column dtype and non-null count summary.
df.info()



## 2b. Check the length (number of rows and number of columns)

In [None]:
# your code here
df.shape


## 3. Calculate the basic statistics of the data


Basic statistics include: mean, median, standard deviation, etc.

hint: you can use method: .describe()

In [None]:
# your code here
df.describe()

## 4. Check the distribution of data for every numerical feature

In [None]:
# your code here
# One histogram per numeric column. The larger figure keeps the subplot grid
# readable, and the trailing semicolon suppresses the array-of-Axes repr that
# would otherwise be printed above the plots.
# To inspect a single feature instead: df['Glucose'].hist()
df.hist(figsize=(12, 10));


## 5. Handle Missing Values (Null Values, or value = 0) for every numerical feature

### 5a. Identify the missing values (null values)

In [None]:
df.isnull().sum()

In [None]:
# your code here
def convert_to_nan(x):
    """Map a zero reading to NaN so pandas counts it as missing.

    This notebook treats a value of 0 in the measurement columns as a
    missing reading. A zero is therefore replaced by ``np.nan``; any other
    value (including a value that is already NaN) is returned unchanged.

    Parameters
    ----------
    x : scalar
        A single cell value, as passed by ``Series.apply``.

    Returns
    -------
    scalar
        ``np.nan`` when ``x == 0``, otherwise ``x`` itself.
    """
    if x == 0:
        # Return an explicit float NaN rather than None so the result does not
        # depend on pandas converting None during dtype inference.
        return np.nan
    return x

In [None]:
df['Insulin']

In [None]:
df['Insulin'].apply(convert_to_nan)

In [None]:
df['Insulin'] = df['Insulin'].apply(convert_to_nan)

In [None]:
df['Insulin']

In [None]:
# your code here
# Columns in which a reading of 0 is treated as "not recorded" rather than a
# real measurement.
list_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Vectorised replacement: every 0 in these columns becomes NaN in one pass,
# instead of calling a Python function once per cell.
df[list_cols] = df[list_cols].replace(0, np.nan)

In [None]:
# Run convert_to_nan over each column in which a 0 stands for a missing reading.
for zero_coded_col in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    df[zero_coded_col] = df[zero_coded_col].apply(convert_to_nan)

In [None]:
df.isnull().sum()

### 5b. Fix the missing values

In [None]:
df.mean()

In [None]:
# Per-column means. pandas skips NaN by default, so each mean is taken over
# the observed (non-missing) values only.
mean_values = df.mean()
# Fill every remaining NaN with the mean of its own column. `mean_values` is
# kept as a separate name because the outlier-handling cells read it again.
# NOTE(review): these means are computed on the full dataset, before the
# train/test split, so the imputed values carry information from the test rows.
df = df.fillna(mean_values)

In [None]:
df.isnull().sum()

## 6. Handle Outlier (anomaly data)

### 6a. Identify the outliers (anomaly data)

In [None]:
df['Insulin'].describe()['25%']

In [None]:
# Lower and upper quartile of Insulin from a single quantile() call.
q1, q3 = df['Insulin'].quantile([0.25, 0.75])
iqr = q3 - q1

# Fences placed 1.5 * IQR below the first and above the third quartile.
bottom_thershold, upper_thershold = q1 - 1.5 * iqr, q3 + 1.5 * iqr

In [None]:
# Columns screened for outliers by the loop in section 6c: every column
# listed in the variable description except the target, Outcome.
list_cols_all = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'Age','Pregnancies','DiabetesPedigreeFunction']

In [None]:
def is_outlier(x, bottom_thershold, upper_thershold):
    """Tell whether ``x`` lies strictly outside the two thresholds.

    Returns ``True`` when ``x`` is below ``bottom_thershold`` or above
    ``upper_thershold``, and ``False`` otherwise. A value equal to either
    threshold is not an outlier.
    """
    below = x < bottom_thershold
    above = x > upper_thershold
    return bool(below or above)

### 6b. Visualize the outlier by using Boxplot chart

In [None]:
# your code here
df.boxplot(column='Age')

In [None]:
# your code here



### 6c. Fix the outliers

In [None]:
df.mean()

In [None]:
def fill_mean(x, bottom_thershold, upper_thershold, fill_value=None):
    """Return ``x``, or a replacement value when ``x`` is an outlier.

    Parameters
    ----------
    x : scalar
        Value to check.
    bottom_thershold, upper_thershold : scalar
        Bounds forwarded to ``is_outlier``.
    fill_value : scalar, optional
        Replacement returned for an outlier. When omitted, the function falls
        back to the notebook globals ``mean_values[col]`` (the previous
        behaviour). That fallback is only correct if ``col`` names the column
        being processed at the moment of the call, so pass ``fill_value``
        explicitly whenever possible.

    Returns
    -------
    scalar
        ``x`` when it is within the bounds, otherwise the replacement value.
    """
    if not is_outlier(x, bottom_thershold, upper_thershold):
        return x
    if fill_value is None:
        # Legacy path: depends on whatever `col` is bound to when the call runs.
        return mean_values[col]
    return fill_value

In [None]:
for col in list_cols_all:
    # Both quartiles from a single quantile() call instead of two full
    # describe() passes over the column.
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1

    # Fences placed 1.5 * IQR below the first and above the third quartile.
    bottom_thershold = q1 - 1.5 * iqr
    upper_thershold = q3 + 1.5 * iqr

    # Vectorised form of the element-wise outlier check: flag values strictly
    # outside the fences and overwrite them with the column mean stored in
    # `mean_values` (computed before any outlier was replaced).
    outside_fences = (df[col] < bottom_thershold) | (df[col] > upper_thershold)
    df[col] = df[col].mask(outside_fences, mean_values[col])

In [None]:
# your code here
df.mean()

In [None]:
mean_values

## 7. Create a new feature/column

## 7a. Create a classification of BMI 

(column_name = `bmi_class`)
Here's the rules:
1. Class 1 - Underweight (value= 1): BMI < 18.5
2. Class 2 - Ideal (value= 2): 18.5 <= BMI <= 24.9
3. Class 3 - Overweight (value= 3): 25.0 <= BMI <= 29.9
4. Class 4 - Obese (value = 4): BMI >= 30.0

In [None]:
def class_bmi(x):
    """Classify a BMI value into one of four weight classes.

    Classes
    -------
    1 : underweight, BMI < 18.5
    2 : ideal,       18.5 <= BMI < 25.0
    3 : overweight,  25.0 <= BMI < 30.0
    4 : obese,       BMI >= 30.0

    The upper bounds are exclusive at 25.0 and 30.0, so a value that falls
    between the one-decimal cut-offs (e.g. 24.95 or 29.95) stays in the lower
    class instead of being pushed into the next one. For BMI values with one
    decimal the result is the same as with the ``<= 24.9`` / ``<= 29.9`` rules.

    Note: a NaN input fails every comparison and therefore returns 4.
    """
    if x < 18.5:
        return 1
    if x < 25.0:
        return 2
    if x < 30.0:
        return 3
    return 4

In [None]:
df['BMI']

In [None]:
df['BMI'].apply(class_bmi)

In [None]:
# your code here
df['bmi_class'] = df['BMI'].apply(class_bmi)



In [None]:
df

## 7b. Create a working-age indicator
column_name = `is_working_age`

Here's the conditions:
1. Class 1 - non-productive population (value= 0): age <= 15 or age >= 65
2. Class 2 - productive population (value= 1): 15 < age < 65

In [None]:
# your code here
def work_age(x):
    """Flag whether an age falls in the working-age range.

    Returns 0 for ages of 15 or below and for ages of 65 or above, and 1 for
    every other value.
    """
    outside_working_age = x <= 15 or x >= 65
    return 0 if outside_working_age else 1

In [None]:
df['Age']

In [None]:
df['Age'].apply(work_age)

In [None]:
df['is_working_age'] = df['Age'].apply(work_age)
df

### 7c. Create your own new feature/columns

In [None]:
# your code here
def class_age(x):
    """Bucket an age into one of four groups.

    0 : age < 12
    1 : 12 <= age < 17
    2 : 17 <= age < 50
    3 : everything else
    """
    # The position of the first upper bound that exceeds x is the group number.
    for group, upper_bound in enumerate((12, 17, 50)):
        if x < upper_bound:
            return group
    return 3


In [None]:
df['age_class'] = df['Age'].apply(class_age)
df

## 8. Get an Insight from the data by answering these questions

In [None]:
df.corr()

### 8a. Question 1
Which `bmi_class` has the largest diabetic population (`Outcome` = 1)?

In [None]:
df.groupby('Outcome')['SkinThickness'].describe()

In [None]:
df.columns

In [None]:
df['Outcome']

In [None]:
# your code here
df.groupby('bmi_class')['Outcome'].sum()

### 8b. Question 2
What is the **average of the age** based on the `bmi_class`?

hint: use the group by, cross table, or pivot table (choose one)

In [None]:
# your code here
df.groupby('bmi_class')['Age'].mean()

# 9. Create a Machine Learning model

Objective: to predict the `Outcome` (whether or not the patient has diabetes)

## 9a. Split Dataset into train and test

In [None]:
100

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# random_state pins the shuffle, so the split (and every metric computed from
# it) is the same on each Restart & Run All.
# stratify keeps the share of Outcome = 1 rows the same in both parts.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0,
                                     stratify=df['Outcome'])

In [None]:
df_train

In [None]:
df_test

In [None]:
# your code here




## 9b. Train your binary classification model

Note: You can train more than 1 model

In [None]:
# your code here
from sklearn.linear_model import LogisticRegression
# Unfitted logistic-regression classifier using the liblinear solver, with
# random_state fixed at 0; it is fitted later with model.fit(X_train, y_train).
model = LogisticRegression(solver='liblinear',random_state=0)

In [None]:
# Training part: feature matrix (every column except Outcome) and target vector.
X_train, y_train = df_train.drop(columns='Outcome'), df_train['Outcome']

# Test part, separated the same way.
X_test, y_test = df_test.drop(columns='Outcome'), df_test['Outcome']

In [None]:
y_train

## 9c. Evaluate your models

In [None]:
# your code here
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import classification_report

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

# Congratulations! You've completed the learning!

Please answer the quiz + give us the feedback about overall program through this link: https://forms.gle/NYhwmp4ACXsLXpmS7