In [None]:
!pip install seaborn==0.11.0

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for each column of the frame.
df.describe()

In [None]:
# Distribution of Pregnancies, drawn in a separate panel per Outcome value.
# NOTE(review): no hue is set, so multiple="dodge" likely has no visible effect — confirm intent.
sns.displot(
    data=df,
    x="Pregnancies",
    col="Outcome",
    multiple="dodge",
)

In [None]:
# Distribution of Insulin with a KDE overlay, one panel per Outcome value.
# NOTE(review): no hue is set, so multiple="dodge" likely has no visible effect — confirm intent.
sns.displot(
    data=df,
    x="Insulin",
    col="Outcome",
    multiple="dodge",
    kde=True,
)

In [None]:
# Distribution of BMI with a KDE overlay, coloured by Outcome on a single panel.
sns.displot(
    data=df,
    x="BMI",
    hue="Outcome",
    kde=True,
)

In [None]:
# Distribution of BloodPressure with a KDE overlay, coloured by Outcome.
sns.displot(
    data=df,
    x="BloodPressure",
    hue="Outcome",
    kde=True,
)

In [None]:
# Distribution of Age, drawn in a separate panel per Outcome value.
# NOTE(review): no hue is set, so multiple="dodge" likely has no visible effect — confirm intent.
sns.displot(
    data=df,
    x="Age",
    col="Outcome",
    multiple="dodge",
)

In [None]:
# Distribution of Glucose, coloured by Outcome on a single panel.
sns.displot(
    data=df,
    x="Glucose",
    hue="Outcome",
)

In [None]:
# Distribution of SkinThickness, drawn in a separate panel per Outcome value.
# NOTE(review): no hue is set, so multiple="dodge" likely has no visible effect — confirm intent.
sns.displot(
    data=df,
    x="SkinThickness",
    col="Outcome",
    multiple="dodge",
)

In [None]:
# Distribution of DiabetesPedigreeFunction, coloured by Outcome.
sns.displot(
    data=df,
    x="DiabetesPedigreeFunction",
    hue="Outcome",
)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale five measurement columns and store the results next to the
# originals under an "_S" suffix.
# NOTE(review): the scaler is fit on every row of df, i.e. before the
# train/test splits made further down, so test-row ranges influence the scaled
# training features — consider fitting on the training split only.
cols_to_scale = ["SkinThickness", "Insulin", "Glucose", "BloodPressure", "BMI"]
scaled_col_names = [f"{col}_S" for col in cols_to_scale]

scaler = MinMaxScaler()
df[scaled_col_names] = scaler.fit_transform(df[cols_to_scale])

# Scaled skin thickness against the pedigree function, coloured by Outcome.
sns.scatterplot(
    data=df,
    x="DiabetesPedigreeFunction",
    y="SkinThickness_S",
    hue="Outcome",
)


In [None]:
# Preview the first rows, now including the "_S" scaled columns added above.
df.head()

In [None]:
# Variance of each column of the frame.
df.var()

In [None]:
# Columns used for the pairwise plots and the correlation table: the target
# plus the scaled measurements and the remaining unscaled predictors.
features = [
    "Outcome",
    "SkinThickness_S",
    "Insulin_S",
    "Glucose_S",
    "Age",
    "BMI_S",
    "BloodPressure_S",
    "DiabetesPedigreeFunction",
    "Pregnancies",
]
feature_frame = df[features]
sns.pairplot(data=feature_frame, hue="Outcome")

In [None]:
# Pairwise correlation matrix of the selected feature columns (including Outcome).
df[features].corr()

In [None]:
# Number of rows per Outcome value, to check class balance.
df["Outcome"].value_counts()

In [None]:
def get_original_data():
    """Return the unscaled predictors and the target from the global ``df``.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a DataFrame holding the eight raw
        predictor columns and ``y`` is the ``Outcome`` column.
    """
    predictor_cols = [
        "BloodPressure",
        "Glucose",
        "SkinThickness",
        "Insulin",
        "BMI",
        "Age",
        "Pregnancies",
        "DiabetesPedigreeFunction",
    ]
    X = df[predictor_cols]
    y = df["Outcome"]
    # The former `X.corr()` call was dropped: its result was discarded, so it
    # only cost time.
    return X, y

def get_min_max_scaled_data():
    """Return the predictors with "_S" scaled columns and the target from the global ``df``.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a DataFrame holding the five "_S"
        columns plus ``Age``, ``Pregnancies`` and ``DiabetesPedigreeFunction``
        as stored in ``df``, and ``y`` is the ``Outcome`` column.
    """
    predictor_cols = [
        "BloodPressure_S",
        "Glucose_S",
        "SkinThickness_S",
        "Insulin_S",
        "BMI_S",
        "Age",
        "Pregnancies",
        "DiabetesPedigreeFunction",
    ]
    X = df[predictor_cols]
    y = df["Outcome"]
    # The former `X.corr()` call was dropped: its result was discarded, so it
    # only cost time.
    return X, y

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import pprint

def logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a logistic-regression classifier and score it on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, each rounded to two
        decimal places.
    """
    model = LogisticRegression(solver="newton-cg", random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The confusion matrix was previously computed here but never used, so it
    # has been removed along with the commented-out debug prints.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_s = f1_score(y_test, y_pred)

    return round(accuracy, 2), round(precision, 2), round(recall, 2), round(f1_s, 2)

def k_nearest_neighbors(X_train, y_train, X_test, y_test, n_neighbors):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
        n_neighbors: Number of neighbours passed to ``KNeighborsClassifier``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, each rounded to two
        decimal places.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The confusion matrix was previously computed here but never used.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_s = f1_score(y_test, y_pred)

    return round(accuracy, 2), round(precision, 2), round(recall, 2), round(f1_s, 2)

In [None]:
# Build the feature/label pairs once at module level; the train_* helpers in
# this notebook read these globals when they create their train/test splits.
X_original, y_original = get_original_data()
X_scaled, y_scaled = get_min_max_scaled_data()
def train_knn_original_data(cols=None):
    """Evaluate KNN for k = 2..19 on the unscaled features and print the scores.

    Splits the module-level ``X_original`` / ``y_original`` 80/20 with a fixed
    seed, optionally drops columns, then prints one line per ``k``.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_original, y_original, test_size=0.20, random_state=42
    )
    if cols is not None:
        X_train = X_train.drop(cols, axis=1)
        X_test = X_test.drop(cols, axis=1)

    # A list that collected these scores was never read or returned, so the
    # loop now only prints them.
    for k in range(2, 20):
        accuracy, precision, recall, f1 = k_nearest_neighbors(X_train, y_train, X_test, y_test, k)
        print(f"neighbors: {k}, accuracy: {accuracy}, precision: {precision}, recall:  {recall}, f1: {f1}")
        
def train_logistic_regression_original(cols=None):
    """Fit logistic regression on the unscaled features and print its scores.

    Splits the module-level ``X_original`` / ``y_original`` 80/20 with a fixed
    seed, optionally drops columns, then prints accuracy, precision, recall
    and F1 on one line.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_original, y_original, test_size=0.20, random_state=42
    )
    if cols is not None:
        X_train = X_train.drop(cols, axis=1)
        X_test = X_test.drop(cols, axis=1)

    # The unused `accuracy = []` local was removed.
    accuracy, precision, recall, f1 = logistic_regression(X_train, y_train, X_test, y_test)
    print(accuracy, precision, recall, f1)
    

def train_knn_scaled_data(cols=None):
    """Evaluate KNN for k = 2..19 on the scaled features and print the scores.

    Splits the module-level ``X_scaled`` / ``y_scaled`` 80/20 with a fixed
    seed, optionally drops columns, then prints one line per ``k``.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.20, random_state=42
    )
    if cols is not None:
        X_train = X_train.drop(cols, axis=1)
        X_test = X_test.drop(cols, axis=1)

    # A list that collected these scores was never read or returned, so the
    # loop now only prints them.
    for k in range(2, 20):
        accuracy, precision, recall, f1 = k_nearest_neighbors(X_train, y_train, X_test, y_test, k)
        print(f"neighbors: {k}, accuracy: {accuracy}, precision: {precision}, recall:  {recall}, f1: {f1}")
        
def train_logistic_regression_scaled(cols=None):
    """Fit logistic regression on the scaled features and print its scores.

    Splits the module-level ``X_scaled`` / ``y_scaled`` 80/20 with a fixed
    seed, optionally drops columns, then prints accuracy, precision, recall
    and F1 on one line.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_scaled, test_size=0.20, random_state=42
    )
    if cols is not None:
        X_train = X_train.drop(cols, axis=1)
        X_test = X_test.drop(cols, axis=1)

    # The unused `accuracy = []` local was removed.
    accuracy, precision, recall, f1 = logistic_regression(X_train, y_train, X_test, y_test)
    print(accuracy, precision, recall, f1)

In [None]:
# Baseline: both models on all unscaled features.
# (Plain string literals: the labels have no placeholders, so no f-prefix is needed.)
print("Train knn on original data")
train_knn_original_data()

print("Train LR on original data")
train_logistic_regression_original()

In [None]:
# The old banner named "SkinThickness and BloodPressure", which matched neither
# call below. The column lists are now named once and the printed labels are
# built from them, so the output always states what was actually dropped.
knn_dropped_cols = ["SkinThickness", "BloodPressure", "DiabetesPedigreeFunction"]
lr_dropped_cols = ["Age"]

print("=================== Original data with columns excluded ===========================")
print(f"Train knn on original data (excluding {', '.join(knn_dropped_cols)})")
train_knn_original_data(knn_dropped_cols)

print(f"Train LR on original data (excluding {', '.join(lr_dropped_cols)})")
train_logistic_regression_original(lr_dropped_cols)

In [None]:
# Baseline on the min-max scaled feature set.
# (Plain string literals: the labels have no placeholders, so no f-prefix is needed.)
print("============================== Scaled Data ====================================")
print("Train knn on scaled data")
train_knn_scaled_data()

print("Train LR on scaled data")
train_logistic_regression_scaled()

In [None]:
# The old banner named "SkinThickness_S and BloodPressure_S", which matched
# neither call below. The column lists are now named once and the printed
# labels are built from them, so the output always states what was dropped.
knn_dropped_cols = ["BloodPressure_S"]
lr_dropped_cols = ["Age"]

print("=================== Scaled data with columns excluded ===========================")
print(f"Train knn (excluding {', '.join(knn_dropped_cols)})")
train_knn_scaled_data(knn_dropped_cols)
print(f"Train LR (excluding {', '.join(lr_dropped_cols)})")
train_logistic_regression_scaled(lr_dropped_cols)

### Decision Tree ###

In [None]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree(X_train, X_test, y_train, y_test, depth):
    """Fit a depth-limited decision tree and score it on the test split.

    Note the argument order (both feature sets first, then both label sets)
    differs from ``logistic_regression`` / ``k_nearest_neighbors``.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: True labels for ``X_test``.
        depth: Value passed as ``max_depth`` to ``DecisionTreeClassifier``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, each rounded to two
        decimal places, matching the other evaluators in this notebook.
    """
    model = DecisionTreeClassifier(random_state=0, max_depth=depth)
    model.fit(X_train, y_train)  # the fitted-estimator return value was unused
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_s = f1_score(y_test, y_pred)

    return round(accuracy, 2), round(precision, 2), round(recall, 2), round(f1_s, 2)

def train_decision_tree_original(cols=None):
    """Evaluate decision trees of depth 2 through 9 on the unscaled features.

    Splits the module-level ``X_original`` / ``y_original`` 80/20 with a fixed
    seed and prints, for each depth, the depth followed by the four scores
    returned by ``decision_tree``.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    split = train_test_split(X_original, y_original, test_size=0.20, random_state=42)
    X_train, X_test, y_train, y_test = split
    if cols is not None:
        X_train, X_test = (part.drop(columns=cols) for part in (X_train, X_test))

    for depth in range(2, 10):
        accuracy, precision, recall, f1 = decision_tree(X_train, X_test, y_train, y_test, depth)
        print(depth, accuracy, precision, recall, f1)
        
def train_decision_tree_scaled(cols=None):
    """Evaluate decision trees of depth 2 through 9 on the scaled features.

    Splits the module-level ``X_scaled`` / ``y_scaled`` 80/20 with a fixed
    seed and prints, for each depth, the depth followed by the four scores
    returned by ``decision_tree``.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    split = train_test_split(X_scaled, y_scaled, test_size=0.20, random_state=42)
    X_train, X_test, y_train, y_test = split
    if cols is not None:
        X_train, X_test = (part.drop(columns=cols) for part in (X_train, X_test))

    for depth in range(2, 10):
        accuracy, precision, recall, f1 = decision_tree(X_train, X_test, y_train, y_test, depth)
        print(depth, accuracy, precision, recall, f1)

In [None]:
# Decision-tree runs on both feature sets, with and without dropped columns.
# (Plain string literals: the labels have no placeholders, so no f-prefix is needed.)
print("Original Data")
train_decision_tree_original()

print("\nOriginal Data Drop Cols")
train_decision_tree_original(["SkinThickness", "BloodPressure"])

print("\n Scaled Data")
train_decision_tree_scaled()

print("\n Scaled Data Drop Cols")
train_decision_tree_scaled(["Age"])

### Random Forest ###

In [None]:
from sklearn.ensemble import RandomForestClassifier
def random_forest(X_train, X_test, y_train, y_test, depth):
    """Fit a depth-limited random forest and score it on the test split.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: True labels for ``X_test``.
        depth: Value passed as ``max_depth`` to ``RandomForestClassifier``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, each rounded to two
        decimal places.
    """
    model = RandomForestClassifier(max_depth=depth, random_state=0)
    model.fit(X_train, y_train)  # the fitted-estimator return value was unused
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_s = f1_score(y_test, y_pred)

    return round(accuracy, 2), round(precision, 2), round(recall, 2), round(f1_s, 2)

def train_random_forest_original(cols=None):
    """Evaluate random forests of max depth 2 through 14 on the unscaled features.

    Splits the module-level ``X_original`` / ``y_original`` 80/20 with a fixed
    seed and prints, for each depth, the depth followed by the four scores
    returned by ``random_forest``.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    split = train_test_split(X_original, y_original, test_size=0.20, random_state=42)
    X_train, X_test, y_train, y_test = split
    if cols is not None:
        X_train, X_test = (part.drop(columns=cols) for part in (X_train, X_test))

    for depth in range(2, 15):
        accuracy, precision, recall, f1 = random_forest(X_train, X_test, y_train, y_test, depth)
        print(depth, accuracy, precision, recall, f1)
        
def train_random_forest_scaled(cols=None):
    """Evaluate random forests of max depth 2 through 14 on the scaled features.

    Splits the module-level ``X_scaled`` / ``y_scaled`` 80/20 with a fixed
    seed and prints, for each depth, the depth followed by the four scores
    returned by ``random_forest``.

    Args:
        cols: Optional column label(s) to drop from the train and test
            features before fitting; ``None`` keeps every column.
    """
    split = train_test_split(X_scaled, y_scaled, test_size=0.20, random_state=42)
    X_train, X_test, y_train, y_test = split
    if cols is not None:
        X_train, X_test = (part.drop(columns=cols) for part in (X_train, X_test))

    for depth in range(2, 15):
        accuracy, precision, recall, f1 = random_forest(X_train, X_test, y_train, y_test, depth)
        print(depth, accuracy, precision, recall, f1)

In [None]:
# Random-forest runs on both feature sets, with and without dropped columns.
# (Plain string literals: the labels have no placeholders, so no f-prefix is needed.)
print("Original Data")
train_random_forest_original()

print("\nOriginal Data Drop Cols")
train_random_forest_original(["BloodPressure"])

print("\n Scaled Data")
train_random_forest_scaled()

print("\n Scaled Data Drop Cols")
train_random_forest_scaled(["SkinThickness_S", "BloodPressure_S"])