**Importing Packages**

In [48]:
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

**Function to load data**

In [38]:
def _encode_diabetes(labels):
    """Return ``labels`` with 'NO' replaced by 0 and 'YES' replaced by 1.

    Any other value (already-numeric codes, missing values, unexpected
    strings) is passed through unchanged, matching what the previous
    ``Series.replace`` call did.

    ``Series.map`` is used instead of a list-based ``replace`` so that a
    column holding only 'NO'/'YES' comes back with a real integer dtype
    without relying on pandas' deprecated silent downcasting of object
    columns.
    """
    codes = {'NO': 0, 'YES': 1}
    return labels.map(lambda value: codes.get(value, value))


def load_data(train_file = 'train_under.csv', test_file = 'test_under.csv'):
    """Read the train/test CSV files and split each into features and target.

    Parameters
    ----------
    train_file : str
        Path of the training CSV. Must contain a ``Diabetes`` column.
    test_file : str
        Path of the testing CSV. Must contain a ``Diabetes`` column.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Every column except the first one of the respective file.
    Y_train, Y_test : pandas.Series
        The first column of the respective file.
    """
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)

    # Same encoding for both splits: 'NO' -> 0, 'YES' -> 1.
    train['Diabetes'] = _encode_diabetes(train['Diabetes'])
    test['Diabetes'] = _encode_diabetes(test['Diabetes'])

    # The split is positional: column 0 is the target, the rest are features.
    # NOTE(review): this assumes 'Diabetes' is the first column of both CSV
    # files - confirm, otherwise the label ends up among the features.
    Y_train = train.iloc[ : , 0]
    Y_test = test.iloc[ : , 0]
    X_train = train.iloc[ : , 1 : ]
    X_test = test.iloc[ : , 1 : ]

    return X_train , X_test , Y_train , Y_test

**Loading the data**

In [39]:
# Read both splits from the default CSV paths and unpack features / targets.
(X_train, X_test,
 Y_train, Y_test) = load_data()

**Checking the shapes of training and testing data**

In [46]:
# Report the dimensions of each split; every caption after the first starts
# with a newline so the printed lines are separated by a blank line.
for _caption, _split in (
    ('Shape of X_train: ', X_train),
    ('\nShape of Y_train: ', Y_train),
    ('\nShape of X_test: ', X_test),
    ('\nShape of Y_test: ', Y_test),
):
    print(_caption, _split.shape)

Shape of X_train:  (964, 11)

Shape of Y_train:  (964,)

Shape of X_test:  (244, 11)

Shape of Y_test:  (244,)


There are **964** training samples and **244** test samples after cleaning the data and undersampling. There are 11 different features in the data.

**Building a Logistic Regression Model**

In [51]:
def Logistic_Regression(X , Y):
    """Fit a scikit-learn ``LogisticRegression`` on ``X`` / ``Y`` and return it.

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight to ``LogisticRegression.fit``.
    Y : array-like
        Target values. May be a pandas Series, a NumPy array (including a
        column vector) or a plain sequence; it is flattened to 1-D before
        fitting.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    # np.ravel works on any array-like. The previous Y.ravel() relied on
    # Series.ravel, which pandas has deprecated, and failed for plain lists.
    targets = np.ravel(Y)

    # max_iter is set far above scikit-learn's default of 100.
    # NOTE(review): presumably so the solver converges on unscaled features -
    # confirm, or standardise the features instead.
    model = LogisticRegression(max_iter = 10000)
    return model.fit(X , targets)

# Train the logistic-regression classifier on the training split.
log_model = Logistic_Regression(
    X_train,
    Y_train,
)

def Results(model , X , Y):
    """Score ``model`` on ``X`` / ``Y`` and return its predictions.

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``score(X, Y)`` and ``predict(X)``.
    X : array-like
        Feature matrix to evaluate on.
    Y : array-like
        True targets for ``X``.

    Returns
    -------
    acc : float
        The value returned by ``model.score(X, Y)``.
    err : float
        ``1 - acc``.
    pred : array-like
        The value returned by ``model.predict(X)``.
    """
    # Score once and derive the error from it; the previous code called
    # model.score a second time just to compute the complement.
    acc = model.score(X , Y)
    err = 1 - acc
    pred = model.predict(X)
    return acc , err , pred

# Evaluate the fitted model on the training split and on the held-out split.
log_acc_tr, log_err_tr, log_pred_tr = Results(log_model, X_train, Y_train)
log_acc_te, log_err_te, log_pred_te = Results(log_model, X_test, Y_test)

# Print the four metrics; every caption after the first starts with a newline
# so the printed lines are separated by a blank line.
for _caption, _metric in (
    ("Accuracy on training data : ", log_acc_tr),
    ("\nClassification error on training data : ", log_err_tr),
    ("\nAccuracy on testing data : ", log_acc_te),
    ("\nClassification error on testing data : ", log_err_te),
):
    print(_caption, _metric)

Accuracy on training data :  0.7624481327800829

Classification error on training data :  0.23755186721991706

Accuracy on testing data :  0.7909836065573771

Classification error on testing data :  0.2090163934426229
