In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw diabetes CSV into a DataFrame; `pima` is kept as the untouched original.
pima = pd.read_csv(
    filepath_or_buffer="/kaggle/input/pima-indians-diabetes-database/diabetes.csv",
)

In [None]:
# Preview the first five rows of the raw data.
pima.head(n=5)

In [None]:
# The dataset stores 0 where a value is missing instead of a null,
# so report how many zeros each column contains.
for column in pima.columns:
    n_zeros = pima[column].eq(0).sum()
    print("{col} has ".format(col=column), n_zeros, "zeros")

In [None]:
# Preprocessing is done on an independent copy so the raw frame `pima` stays intact.
df = pima.copy(deep=True)

In [None]:
# In the columns below a 0 is a placeholder for a missing value and must be filled.
# Each column is imputed with the mean of its own observed (non-zero) values.
features_dealing_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for features in features_dealing_with_zero:
    # Turn the placeholder zeros into NaN first so they are excluded from the
    # mean; averaging the raw column would count them as real measurements
    # and drag the fill value down.
    observed = df[features].replace(0, np.nan)
    # Assign the result back to the frame. Calling replace/fillna with
    # inplace=True on `df[features]` is chained assignment: pandas warns about
    # it, and with Copy-on-Write enabled it modifies a temporary and leaves
    # `df` unchanged.
    df[features] = observed.fillna(observed.mean())

In [None]:
# Store Insulin and SkinThickness as integers (astype(int) drops any fractional part).
for int_column in ("Insulin", "SkinThickness"):
    df[int_column] = df[int_column].astype(int)

# Relabel the negative class from 0 to -1 and keep the target integer-typed.
df["Outcome"] = df["Outcome"].replace(0, -1).astype(int)
df.head()

In [None]:
# Positional split: the last 100 rows are held out for the final evaluation,
# everything before them is used for training / cross-validation.
# The last column is the target; all other columns are features.
X, x_test = df.iloc[:-100, :-1], df.iloc[-100:, :-1]
Y, y_test = df.iloc[:-100, -1], df.iloc[-100:, -1]

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
kfold = KFold(n_splits = 2)
LRclf = LinearRegression()
training_score = []
testing_score = []


def _threshold_at_mean(scores):
    """Turn continuous regression outputs into -1/1 class labels.

    The cut-off is the mean of `scores`, computed once on the untouched
    values. Deriving it again after part of the array has been overwritten
    shifts the threshold and can leave continuous values in the result.
    """
    return np.where(scores >= scores.mean(), 1, -1)


# Training on the 2 fold cross validation
for train_index, test_index in kfold.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    LRclf.fit(X_train,Y_train)
    # Training score: accuracy on the training fold only. Scoring on all of X
    # would mix in the validation fold, and LinearRegression.score() returns
    # R^2, which cannot be compared with the accuracy reported for validation.
    fitted_labels = _threshold_at_mean(LRclf.predict(X_train))
    training_score.append(accuracy_score(Y_train, fitted_labels))
    # Validation score: accuracy on the fold the model has not seen.
    train_predictions = _threshold_at_mean(LRclf.predict(X_test))
    testing_score.append(accuracy_score(Y_test, train_predictions))
# NOTE: after the loop LRclf remains fitted on the training half of the last fold.
# Results scores
print("training_score{training_score}".format(training_score=training_score))
print("testing_score{testing_score}".format(testing_score=testing_score))

In [None]:
# Performance on the 100 held-out samples
LR_scores = LRclf.predict(x_test)
# Compute the cut-off once, on the raw outputs, and build the labels in a
# single step. Overwriting values in place and then calling .mean() again
# moves the threshold between the two assignments, which can leave continuous
# values in the array or push every sample into one class.
# NOTE(review): the cut-off is the mean of the hold-out predictions themselves;
# confirm this is intended rather than a fixed threshold such as 0.
LR_preds = np.where(LR_scores >= LR_scores.mean(), 1, -1)
print(classification_report(y_test, LR_preds))

In [None]:
# Confusion matrix of the linear-regression labels on the held-out rows,
# drawn as an annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=LR_preds)
sns.heatmap(
    data=cm,
    annot=True,
    cmap='Blues',
    xticklabels=['-1', '1'],
    yticklabels=['-1', '1'],
)

In [None]:
GNB = GaussianNB()
training_score = []
testing_score = []


# Training on the 2 fold cross validation
for train_index, test_index in kfold.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    GNB.fit(X_train,Y_train)
    # Training score: accuracy on the training fold only. Scoring on all of X
    # would also include the validation fold, so it would not be a training score.
    training_score.append(GNB.score(X_train,Y_train))
    # Validation score: accuracy on the fold the model has not seen.
    train_predictions = GNB.predict(X_test)
    testing_score.append(accuracy_score(Y_test, train_predictions))
# NOTE: after the loop GNB remains fitted on the training half of the last fold.

In [None]:
# Print the per-fold scores collected during cross-validation.
print("training_score", training_score, sep="")
print("testing_score", testing_score, sep="")

In [None]:
# Evaluate the fitted Gaussian naive Bayes model on the 100 held-out samples.
GNB_preds = GNB.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=GNB_preds))

In [None]:
# Confusion matrix of the Gaussian naive Bayes predictions on the held-out rows,
# drawn as an annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=GNB_preds)
sns.heatmap(
    data=cm,
    annot=True,
    cmap='Blues',
    xticklabels=['-1', '1'],
    yticklabels=['-1', '1'],
)

In [None]:
# Save the preprocessed frame to the working directory, without the index column.
df.to_csv(path_or_buf="./filled.csv", index=False)