In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,f1_score,classification_report
import os
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Select seaborn's "darkgrid" style for the plots drawn later in the notebook.
sns.set_style("darkgrid")


In [None]:
# Load the diabetes CSV from the hardcoded relative "../input/..." path.
# NOTE(review): the path looks like a Kaggle input mount - confirm before running elsewhere.
diabet_data=pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")
# Preview the first rows as the cell output.
diabet_data.head()

In [None]:
# Count missing (NaN) entries per column; values encoded as 0 are not counted by isnull().
diabet_data.isnull().sum()

In [None]:
# Summary statistics for every column of the raw frame.
diabet_data.describe()

In [None]:
# Pearson correlation (and its p-value) between every column and the target.
# The target column itself is included, so 'Outcome' is also correlated with itself.
for column_name in diabet_data.columns:
    correlation = stats.pearsonr(diabet_data[column_name], diabet_data['Outcome'])
    pearson_coefficient, p_val = correlation
    print(str(column_name),":","Pearson:",pearson_coefficient,'p_value: ',p_val)

In [None]:
def convert_agefeature(df):
    """Replace the 'Age' column of ``df`` with an ordinal age-band code (0-7).

    Ages are first truncated to whole numbers, then mapped to bands whose
    inclusive upper bounds are 11, 18, 22, 27, 33, 40 and 65; anything above
    65 gets code 7.

    The column is overwritten in place and the same ``df`` object is
    returned, so calling this twice on one frame re-bins the codes.
    """
    # Inclusive upper bound of bands 0..6; band 7 is everything above the last.
    upper_bounds = [11, 18, 22, 27, 33, 40, 65]
    whole_years = df['Age'].astype(int).to_numpy()
    # side='left' returns i with bounds[i-1] < age <= bounds[i], i.e. the band code.
    df['Age'] = np.searchsorted(upper_bounds, whole_years, side='left')
    return df

In [None]:
def convert_glucosefeature(df):
    """Replace the 'Glucose' column of ``df`` with a three-level code.

    After truncation to whole numbers: 0 for values up to 139, 1 for 140-199,
    2 for values above 199. The column is overwritten in place and the same
    ``df`` object is returned.
    """
    glucose = df['Glucose'].astype(int)
    # The code is the number of thresholds the reading exceeds.
    above_139 = (glucose > 139).astype(int)
    above_199 = (glucose > 199).astype(int)
    df['Glucose'] = above_139 + above_199
    return df

In [None]:
def convert_BMIfeature(df):
    """Replace the 'BMI' column of ``df`` with an ordinal category code.

    Codes: 0 for BMI <= 18.5, 1 for (18.5, 24.9], 2 for (24.9, 29.9],
    3 for BMI > 29.9.

    The thresholds are fractional, so binning is done on the original float
    values and only the resulting code is cast to int. Truncating first would
    put e.g. 18.6, 24.95 and 29.95 one band too low.

    The column is overwritten in place and the same ``df`` object is
    returned. Missing BMI values are not supported: the final integer cast
    fails on NaN.
    """
    # Right-closed intervals reproduce the "<= upper bound" comparisons.
    bmi_edges = [-np.inf, 18.5, 24.9, 29.9, np.inf]
    bmi_band = pd.cut(df['BMI'], bins=bmi_edges, labels=[0, 1, 2, 3], right=True)
    df['BMI'] = bmi_band.astype(int)
    return df

In [None]:
def convert_BloodPressure(df):
    """Replace the 'BloodPressure' column of ``df`` with a three-level code.

    After truncation to whole numbers: 0 for readings below 60, 1 for 60-89,
    2 for readings of 90 and above. The column is overwritten in place and
    the same ``df`` object is returned.
    """
    pressure = df['BloodPressure'].astype(int)
    # The code is the number of lower limits (60, 90) the reading has reached.
    reached_60 = (pressure >= 60).astype(int)
    reached_90 = (pressure >= 90).astype(int)
    df['BloodPressure'] = reached_60 + reached_90
    return df

In [None]:
# seaborn is already imported as `sns` in the first cell; this re-import is redundant but harmless.
import seaborn as sns
# Box plot of the raw (un-binned) BloodPressure column from the full data set.
sns.boxplot(x=diabet_data['BloodPressure'])

In [None]:
# First and third quartile of each column of the raw frame.
Q1 = diabet_data.quantile(0.25)
Q3 = diabet_data.quantile(0.75)
# Interquartile range per column; used by the next cell to define the outlier bounds.
IQR = Q3 - Q1
print(IQR)

In [None]:
# Flag a row as an outlier when any of its columns falls outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for that column.
outlier_rows = ((diabet_data < (Q1 - 1.5 * IQR)) | (diabet_data > (Q3 + 1.5 * IQR))).any(axis=1)
# .copy() makes the filtered frame independent of diabet_data: later cells
# overwrite its columns in place, which on a plain boolean-mask slice raises
# SettingWithCopyWarning and leaves it unclear which frame is modified.
diabet_data_out = diabet_data[~outlier_rows].copy()
diabet_data_out.shape

In [None]:
# Bin Age, BMI, Glucose and BloodPressure into ordinal codes, in that order.
# Each helper overwrites its column on the frame it receives, so running this
# cell a second time would re-bin the codes rather than the raw values.
diabet_data_out = (
    diabet_data_out
    .pipe(convert_agefeature)
    .pipe(convert_BMIfeature)
    .pipe(convert_glucosefeature)
    .pipe(convert_BloodPressure)
)

In [None]:
# One box plot per feature of the filtered/binned frame, laid out on a 2x4 grid.
cols = ['Age', 'Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'SkinThickness', 'DiabetesPedigreeFunction', 'BMI']
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(14, 8), sharex=False, sharey=False)
# Flatten the 2-D axes grid so it can be paired with the column list.
axes = axes.ravel()
for axis, column in zip(axes, cols):
    sns.boxplot(y=column, data=diabet_data_out, ax=axis, palette="Set2")
plt.tight_layout()


In [None]:
# Preview the first rows of the outlier-filtered frame after the binning cell has run.
diabet_data_out.head()

In [None]:
# Target vector: the 'Outcome' column.
y = diabet_data_out['Outcome']
# Feature matrix: every remaining column.
X = diabet_data_out.drop(columns=['Outcome'])

In [None]:
# NOTE(review): `le` is only referenced by the commented-out line at the end of
# this cell, so it is currently unused.
le = preprocessing.LabelEncoder()
# Standardise every feature column; X becomes the array returned by transform().
# NOTE(review): the scaler is fit on every row of X, i.e. before any train/test
# split, so its mean/std are computed from the complete data set.
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

#y=le.fit_transform(y)

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)
# standard scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so nothing about the test set influences the preprocessing.
# The test set must go through transform(), never fit_transform().
# Standardisation is unaffected by an earlier linear rescaling of a column, so
# this gives training-only scaling even if X was already standardised on all rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
# Registry of estimators keyed by display name. The model cells store entries
# of the form {'model': estimator}; the cross-validation cell iterates over it.
models={}

In [None]:
# Gaussian Naive Bayes, fitted on the training split and scored on the hold-out split.
nb = GaussianNB()
nb.fit(X_train, y_train)
models[str('Naive Bayes')]={'model':nb}
predicted = nb.predict(X_test)
print("Accuracy score: ",accuracy_score(y_test, predicted))
# Use the default (binary, positive-class) precision like the other model
# cells. With average='micro' the value equals accuracy for a single-label
# problem, so the cell printed the same number twice.
print("Precision score: ",precision_score(y_test, predicted))


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; fit() returns the estimator itself.
logreg = LogisticRegression().fit(X_train, y_train)
# Register the estimator so the cross-validation cell evaluates it too.
models['Logistic Regression'] = {'model': logreg}

pred = logreg.predict(X_test)
print("Accuracy score: ",accuracy_score(y_test, pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# 100 boosting stages of depth-2 trees, with a fixed seed.
gb = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
)
gb.fit(X_train, y_train)
# Register the estimator so the cross-validation cell evaluates it too.
models['GradientBoosting'] = {'model': gb}

# Hold-out evaluation.
predicted = gb.predict(X_test)
print("Accuracy score: ",accuracy_score(y_test, predicted))
print("Precision score: ",precision_score(y_test, predicted))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees, each at most 3 levels deep, with a fixed seed.
random_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=0,
)
random_forest.fit(X_train, y_train)
# Register the estimator so the cross-validation cell evaluates it too.
models['Random Forest'] = {'model': random_forest}

# Hold-out evaluation.
prediction = random_forest.predict(X_test)
print("Accuracy  score: ",accuracy_score(y_test, prediction))
print("Precision score: ",precision_score(y_test, prediction))


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Linear-kernel support vector classifier with C=1.0.
svclassifier = SVC(kernel='linear', C=1.0)
svclassifier.fit(X_train, y_train)
# Register the estimator so the cross-validation cell evaluates it too.
models['SVM'] = {'model': svclassifier}

# Hold-out evaluation.
y_pred = svclassifier.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_pred))
print("Precision score: ",precision_score(y_test, y_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with scikit-learn's default settings.
knn = KNeighborsClassifier()
# Register the estimator so the cross-validation cell evaluates it too.
models['KNN'] = {'model': knn}
knn.fit(X_train, y_train)

# Hold-out evaluation.
knnpre = knn.predict(X_test)
print("Accuracy:",accuracy_score(y_test, knnpre))
print("Precision score: ",precision_score(y_test, knnpre))

In [None]:
# Depth-3 decision tree. random_state is pinned like the other tree-based
# models so repeated runs give the same tree.
classifier=tree.DecisionTreeClassifier(max_depth=3, random_state=0)
classifier.fit(X_train,y_train)
# Register the tree alongside the other estimators; without this entry the
# cross-validation cell never evaluates it.
models[str('Decision Tree')]={'model':classifier}
predictions=classifier.predict(X_test)

print("Accuracy score: ",accuracy_score(y_test,predictions))
print("Precision score: ",precision_score(y_test,predictions))

In [None]:
from sklearn.pipeline import make_pipeline

# 10-fold cross-validated accuracy for every registered model.
for model_name, entry in models.items():
    # Standardise inside each fold so the scaling statistics come only from
    # that fold's training rows; scaling with statistics from all rows leaks
    # information from the validation rows. Standardisation is unaffected by
    # an earlier linear rescaling of a column, so this holds even when X was
    # already standardised on the full data.
    # cross_val_score fits clones, so the estimators stored in `models` are
    # left untouched.
    fold_pipeline = make_pipeline(preprocessing.StandardScaler(), entry['model'])
    accuracy = cross_val_score(fold_pipeline, X, y, scoring='accuracy', cv = 10)
    # Report the registry key; the original printed the {'model': ...} dict.
    print("Accuracy of Model "+str(model_name)+" with Cross Validation is:",accuracy.mean() * 100)