In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score,roc_curve
import pickle


In [4]:
data=pd.read_csv("datasets/diabetes_data_upload.csv")
x=data.drop(['class'],axis=1)
y=data['class']
#x=data.iloc[:,:-1].values
#y=data.iloc[:,-1].values

In [5]:
data.tail()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative
519,42,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative


In [6]:
data['Age'].describe()

count    520.000000
mean      48.028846
std       12.151466
min       16.000000
25%       39.000000
50%       47.500000
75%       57.000000
max       90.000000
Name: Age, dtype: float64

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [8]:
cat_columns=x.select_dtypes(include='object').columns
num_columns=x.select_dtypes(exclude='object').columns
print(cat_columns,num_columns)

Index(['Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness',
       'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching',
       'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity'],
      dtype='object') Index(['Age'], dtype='object')


In [9]:
Scaler=StandardScaler()
x_Encoder=OneHotEncoder()
y_Encoder=LabelEncoder()

In [10]:
Transforming=ColumnTransformer(
    [("OneHotEncoder", x_Encoder, cat_columns),
     ("StandardScaler", Scaler, num_columns)
    ]
)

In [11]:
x_train_tr=Transforming.fit_transform(x_train)
x_test_tr=Transforming.transform(x_test)

In [12]:
x_train.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity
11,38,Male,Yes,Yes,No,No,Yes,Yes,No,Yes,No,Yes,No,Yes,No,No
246,46,Male,No,No,No,Yes,No,No,No,Yes,No,Yes,No,No,Yes,No
304,69,Female,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,No,No,Yes,No,Yes
388,43,Male,No,No,No,Yes,No,Yes,No,Yes,No,No,No,No,Yes,No
352,67,Male,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes


In [13]:
print(x_train_tr[0])

[ 0.          1.          0.          1.          0.          1.
  1.          0.          1.          0.          0.          1.
  0.          1.          1.          0.          0.          1.
  1.          0.          0.          1.          1.          0.
  0.          1.          1.          0.          1.          0.
 -0.81871459]


In [14]:
y_train_le=y_Encoder.fit_transform(y_train)
y_test_le=y_Encoder.transform(y_test)

In [16]:
models_dict={
    "LogisticRegression":LogisticRegression(),
    "SupportVector":SVC(),
    "NaiveBayes":GaussianNB(),
    "KnnClassifier":KNeighborsClassifier(),
    "DecisionTree":DecisionTreeClassifier(),
    "RandomForest":RandomForestClassifier(),
    "AdaBoost":AdaBoostClassifier(),
    "GradientBoost":GradientBoostingClassifier(),    
}

In [17]:
for i in range(len(list(models_dict))):
    model=list(models_dict.values())[i]
    model.fit(x_train_tr,y_train_le)

    y_train_pred=model.predict(x_train_tr)
    y_test_pred=model.predict(x_test_tr)

    train_acc=accuracy_score(y_train_le,y_train_pred)
    train_f1=f1_score(y_train_le,y_train_pred,average='weighted')
    train_prec=precision_score(y_train_le,y_train_pred,average='weighted')
    train_recall=recall_score(y_train_le,y_train_pred,average='weighted')
    #train_roc=roc_auc_score(y_train,y_train_pred,average='weighted',multi_class="ovo")

    test_acc=accuracy_score(y_test_le,y_test_pred)
    test_f1=f1_score(y_test_le,y_test_pred,average='weighted')
    test_prec=precision_score(y_test_le,y_test_pred,average='weighted')
    test_recall=recall_score(y_test_le,y_test_pred,average='weighted')
    #test_roc=roc_auc_score(y_test,y_test_pred,average='weighted',multi_class='ovo')

    print(list(models_dict.keys())[i])
    print("Training AccuracyScore:{:.4f}".format(train_acc))
    print("Training F1Score:{:.4f}".format(train_f1))
    print("Training PrecisionScore:{:.4f}".format(train_prec))
    print("Training RecallScore:{:.4f}".format(train_recall))
    #print("RocAocScore:{:.4f}".format(train_roc))
    print("Test AccuracyScore:{:.4f}".format(test_acc))
    print("Test F1Score:{:.4f}".format(test_f1))
    print("Test PrecisionScore:{:.4f}".format(test_prec))
    print("Test RecallScore:{:.4f}".format(test_recall))
    print('\n')


LogisticRegression
Training AccuracyScore:0.9538
Training F1Score:0.9539
Training PrecisionScore:0.9540
Training RecallScore:0.9538
Test AccuracyScore:0.8846
Test F1Score:0.8848
Test PrecisionScore:0.8860
Test RecallScore:0.8846


SupportVector
Training AccuracyScore:0.9897
Training F1Score:0.9897
Training PrecisionScore:0.9899
Training RecallScore:0.9897
Test AccuracyScore:0.9538
Test F1Score:0.9539
Test PrecisionScore:0.9544
Test RecallScore:0.9538


NaiveBayes
Training AccuracyScore:0.9000
Training F1Score:0.8998
Training PrecisionScore:0.8996
Training RecallScore:0.9000
Test AccuracyScore:0.8308
Test F1Score:0.8304
Test PrecisionScore:0.8305
Test RecallScore:0.8308


KnnClassifier
Training AccuracyScore:0.9462
Training F1Score:0.9465
Training PrecisionScore:0.9485
Training RecallScore:0.9462
Test AccuracyScore:0.9462
Test F1Score:0.9462
Test PrecisionScore:0.9464
Test RecallScore:0.9462


DecisionTree
Training AccuracyScore:1.0000
Training F1Score:1.0000
Training PrecisionScore:1.0



AdaBoost
Training AccuracyScore:0.9590
Training F1Score:0.9590
Training PrecisionScore:0.9591
Training RecallScore:0.9590
Test AccuracyScore:0.8923
Test F1Score:0.8926
Test PrecisionScore:0.8945
Test RecallScore:0.8923


GradientBoost
Training AccuracyScore:1.0000
Training F1Score:1.0000
Training PrecisionScore:1.0000
Training RecallScore:1.0000
Test AccuracyScore:0.9615
Test F1Score:0.9616
Test PrecisionScore:0.9617
Test RecallScore:0.9615




In [18]:
pickle.dump(Transforming,open('models/Transformer.pkl','wb'))
pickle.dump(y_Encoder,open('models/Encoder.pkl','wb'))
pickle.dump(models_dict["RandomForest"],open('models/Classifier.pkl','wb'))