In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# This is new
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

# setup interactive notebook mode
# With "all", IPython displays the value of every expression statement in a
# cell instead of only the last one. Later cells rely on this to show several
# results from one cell (e.g. two `len(...)` calls, or X_train / X_test / ...).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
# Pick the dataset that drives every later cell: 'admission' selects the
# admissions data, any other value selects the diabetes data.
dataset = 'diabetes'

# Read the matching CSV (path is relative to the working directory) and
# preview its first rows.
if dataset != 'admission':
    diabetes_data = pd.read_csv('diabetes.csv')
    diabetes_data.head()
else:
    admission_data = pd.read_csv('admission_data_ng.csv')
    admission_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

In [None]:
# Scatter the raw features, coloured by class label, before any modelling.
if dataset != 'admission':
    # Diabetes: one figure per feature, with the feature on x and the 0/1
    # 'Outcome' label on y.
    X0 = diabetes_data[diabetes_data['Outcome'] == 0]
    X1 = diabetes_data[diabetes_data['Outcome'] == 1]
    for col in diabetes_data.drop(columns=['Outcome']).columns:
        plt.scatter(X0[col], X0['Outcome'], color='red', marker='o', label='No diabetes')
        plt.scatter(X1[col], X1['Outcome'], color='blue', marker='x', label='Diabetic')
        plt.xlabel(col)
        plt.ylabel('Outcome')
        plt.legend(loc='center right')
        plt.show()
else:
    # Admission: a single figure of the two test scores, one series per class.
    X0 = admission_data[admission_data['Admit'] == 0]
    X1 = admission_data[admission_data['Admit'] == 1]
    for class_rows, class_colour, class_marker, class_label in (
        (X0, 'red', 'o', 'reject'),
        (X1, 'blue', 'x', 'admit'),
    ):
        plt.scatter(
            class_rows['Test 1 Score'],
            class_rows['Test 2 Score'],
            color=class_colour,
            marker=class_marker,
            label=class_label,
        )
    plt.xlabel('Test 1 Score')
    plt.ylabel('Test 2 Score')
    plt.legend(loc='lower left')
    plt.show()

In [None]:
# Keep only diabetes rows whose Glucose, BloodPressure, SkinThickness and BMI
# are all strictly positive, showing the row count before and after.
# (Presumably zeros stand for missing measurements - confirm against the data source.)
# The leading `True` is a manual switch: set it to `False` to skip the filter.
if True and (dataset != 'admission'):
    len(diabetes_data)
    keep_rows = (
        (diabetes_data['Glucose'] > 0)
        & (diabetes_data['BloodPressure'] > 0)
        & (diabetes_data['SkinThickness'] > 0)
        & (diabetes_data['BMI'] > 0)
    )
    diabetes_data = diabetes_data[keep_rows]
    len(diabetes_data)

In [None]:
# Count the missing values in each column of whichever dataset is active.
(admission_data if dataset == 'admission' else diabetes_data).isna().sum()

In [None]:
# Hold out 20% of the active dataset for testing.
# Stratifying on the label keeps the class fractions the same in both parts,
# and the fixed random_state makes the split repeatable.
if dataset == 'admission':
    split_frame, split_label = admission_data, 'Admit'
else:
    split_frame, split_label = diabetes_data, 'Outcome'

X_train, X_test, y_train, y_test = train_test_split(
    split_frame.drop(columns=[split_label]),
    split_frame[split_label],
    test_size=0.2,
    stratify=split_frame[split_label],
    random_state=50,
)

# Display all four pieces of the split.
X_train
X_test
y_train
y_test

In [None]:
# Optional feature standardisation, switched off by default (change the
# condition to True to enable it). The scaler is fitted on the training
# features only and the same transformation is then applied to the test set.
if False:
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    scaled_train = sc.fit_transform(X_train)
    scaled_test = sc.transform(X_test)
    # Re-wrap the scaled arrays so column names and row indices are preserved.
    X_train = pd.DataFrame(scaled_train, index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaled_test, index=X_test.index, columns=X_test.columns)
    X_train
    X_test
    y_train
    y_test

In [None]:
# Fit an unregularised logistic regression on the training split.
# `penalty=None` is the supported way to switch regularisation off: the string
# 'none' was deprecated in scikit-learn 1.2 and is rejected by current
# releases. `multi_class` is left at its default ('auto'), because passing it
# explicitly is deprecated in newer releases.
# NOTE: `penalty=None` requires scikit-learn >= 1.2.
model = LogisticRegression(fit_intercept=True, solver='lbfgs', penalty=None)
# If lbfgs fails to converge, try increasing max_iter (add max_iter = 1000),
# try another solver (e.g. newton-cg), or scale the features first.
# For regularisation, use penalty = 'l2' together with C = 10.0 (other values
# of C should be tried too).

model.fit(X_train, y_train)

# Mean accuracy (fraction of correct predictions) on the training data
model.score(X_train, y_train)

# Fitted coefficients Beta_1, ..., Beta_p - one per feature column of X_train
model.coef_

# Fitted intercept Beta_0
model.intercept_

In [None]:
# Predict the class of every held-out row and store the predictions in a
# one-column frame that shares X_test's index.
pred_col = 'pred_Admit' if dataset == 'admission' else 'pred_Outcome'
test_output = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=[pred_col])
test_output.head()

In [None]:
# Attach the true labels to the predictions, matching rows on the index.
# A label column left over from an earlier run of this cell is dropped first,
# so re-running does not create suffixed duplicates (`<label>_x`, `<label>_y`).
test_output = (
    test_output
    .drop(columns=[y_test.name], errors='ignore')
    .merge(y_test, left_index=True, right_index=True)
)
test_output.head()

# `model.score` returns the accuracy as a fraction in [0, 1]; format it as a
# percentage so the printed number matches the wording of the message.
print(f'Percentage of correct predictions is {model.score(X_test, y_test):.2%}')

In [None]:
# Attach the test features to the prediction frame, matching rows on the index.
# Feature columns left over from an earlier run of this cell are dropped first,
# so re-running does not create suffixed duplicates (`<feature>_x`,
# `<feature>_y`) that would break later column lookups.
test_output = (
    test_output
    .drop(columns=X_test.columns, errors='ignore')
    .merge(X_test, left_index=True, right_index=True)
)
test_output.head()

In [None]:
# Predicted class probabilities for every row (train and test alike) of the
# active dataset, with the label column removed before prediction.
model.predict_proba(
    admission_data.drop(columns=['Admit'])
    if dataset == 'admission'
    else diabetes_data.drop(columns=['Outcome'])
)

In [None]:
# Build a training-set table holding the features, the true label and the
# predicted probability of class 1.
label_col = 'Admit' if dataset == 'admission' else 'Outcome'

data_with_prob = X_train.copy()
data_with_prob[label_col] = y_train

# Column 1 of predict_proba is the probability of predicting 1 (with more
# classes there would be one probability column per class).
data_with_prob['Probability'] = model.predict_proba(
    data_with_prob.drop(columns=[label_col])
)[:, 1]
data_with_prob.head()

In [None]:
# Add the predicted probability of class 1 for every held-out row.
# The features come straight from X_test, selected in test_output's row order,
# rather than from dropping the label and prediction columns of test_output.
# That way a re-run of this cell does not pass the existing 'Probability'
# column to the model as if it were a feature.
test_output['Probability'] = model.predict_proba(X_test.loc[test_output.index])[:, 1]
test_output.head()

#### Visualize data

In [None]:
# plot the features against the classification [Training]
# A row is treated as predicted "admit" when its probability is >= 0.5; the
# four groups below are the combinations of true label and that prediction.
if dataset == 'admission':
    truly_0 = data_with_prob['Admit'] == 0
    truly_1 = data_with_prob['Admit'] == 1
    prob_low = data_with_prob['Probability'] < 0.5
    prob_high = data_with_prob['Probability'] >= 0.5

    X0_right = data_with_prob[truly_0 & prob_low]
    X1_right = data_with_prob[truly_1 & prob_high]
    X0_wrong = data_with_prob[truly_0 & prob_high]
    X1_wrong = data_with_prob[truly_1 & prob_low]

    for group_rows, group_colour, group_marker, group_label in (
        (X0_right, 'red', 'o', 'reject accurate'),
        (X1_right, 'blue', 'x', 'admit accurate'),
        (X0_wrong, 'black', 'o', 'reject inaccurate'),
        (X1_wrong, 'cyan', 'x', 'admit inaccurate'),
    ):
        plt.scatter(
            group_rows['Test 1 Score'],
            group_rows['Test 2 Score'],
            color=group_colour,
            marker=group_marker,
            label=group_label,
        )
    plt.xlabel('Test 1 Score')
    plt.ylabel('Test 2 Score')
    plt.legend(loc='lower left')
    plt.show()

In [None]:
# Diabetes, training split: one figure per feature showing which rows the
# model classifies correctly at a 0.5 probability threshold.
if dataset != 'admission':
    # The four groups do not depend on the feature being plotted, so they are
    # computed once here instead of once per loop iteration.
    X0_right = data_with_prob[(data_with_prob['Outcome'] == 0) & (data_with_prob['Probability'] < 0.5)]
    X1_right = data_with_prob[(data_with_prob['Outcome'] == 1) & (data_with_prob['Probability'] >= 0.5)]
    X0_wrong = data_with_prob[(data_with_prob['Outcome'] == 0) & (data_with_prob['Probability'] >= 0.5)]
    X1_wrong = data_with_prob[(data_with_prob['Outcome'] == 1) & (data_with_prob['Probability'] < 0.5)]

    for col in diabetes_data.drop(columns=['Outcome']).columns:
        plt.scatter(X0_right[col], X0_right['Outcome'], color='red', marker='o', label='not diab accurate')
        plt.scatter(X1_right[col], X1_right['Outcome'], color='blue', marker='x', label='diabetic accurate')
        # Misclassified points are shifted by 0.1 towards the other class so
        # they do not sit on top of the correctly classified ones.
        plt.scatter(X0_wrong[col], X0_wrong['Outcome'] + 0.1, color='black', marker='o', label='not diab inaccurate')
        plt.scatter(X1_wrong[col], X1_wrong['Outcome'] - 0.1, color='cyan', marker='x', label='diabetic inaccurate')
        plt.xlabel(col)
        plt.ylabel('Outcome')
        plt.legend(loc='center right')
        plt.show()

In [None]:
# plot the features against the classification [Testing]
# A row is treated as predicted "admit" when its probability is >= 0.5; the
# four groups below are the combinations of true label and that prediction.
if dataset == 'admission':
    truly_0 = test_output['Admit'] == 0
    truly_1 = test_output['Admit'] == 1
    prob_low = test_output['Probability'] < 0.5
    prob_high = test_output['Probability'] >= 0.5

    X0_right = test_output[truly_0 & prob_low]
    X1_right = test_output[truly_1 & prob_high]
    X0_wrong = test_output[truly_0 & prob_high]
    X1_wrong = test_output[truly_1 & prob_low]

    for group_rows, group_colour, group_marker, group_label in (
        (X0_right, 'red', 'o', 'reject accurate'),
        (X1_right, 'blue', 'x', 'admit accurate'),
        (X0_wrong, 'black', 'o', 'reject inaccurate'),
        (X1_wrong, 'cyan', 'x', 'admit inaccurate'),
    ):
        plt.scatter(
            group_rows['Test 1 Score'],
            group_rows['Test 2 Score'],
            color=group_colour,
            marker=group_marker,
            label=group_label,
        )
    plt.xlabel('Test 1 Score')
    plt.ylabel('Test 2 Score')
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# Diabetes, test split: one figure per feature showing which held-out rows the
# model classifies correctly at a 0.5 probability threshold.
if dataset != 'admission':
    # The four groups do not depend on the feature being plotted, so they are
    # computed once here instead of once per loop iteration.
    X0_right = test_output[(test_output['Outcome'] == 0) & (test_output['Probability'] < 0.5)]
    X1_right = test_output[(test_output['Outcome'] == 1) & (test_output['Probability'] >= 0.5)]
    X0_wrong = test_output[(test_output['Outcome'] == 0) & (test_output['Probability'] >= 0.5)]
    X1_wrong = test_output[(test_output['Outcome'] == 1) & (test_output['Probability'] < 0.5)]

    for col in diabetes_data.drop(columns=['Outcome']).columns:
        plt.scatter(X0_right[col], X0_right['Outcome'], color='red', marker='o', label='not diab accurate')
        plt.scatter(X1_right[col], X1_right['Outcome'], color='blue', marker='x', label='diabetic accurate')
        # Misclassified points are shifted by 0.1 towards the other class so
        # they do not sit on top of the correctly classified ones.
        plt.scatter(X0_wrong[col], X0_wrong['Outcome'] + 0.1, color='black', marker='o', label='not diab inaccurate')
        plt.scatter(X1_wrong[col], X1_wrong['Outcome'] - 0.1, color='cyan', marker='x', label='diabetic inaccurate')
        plt.xlabel(col)
        plt.ylabel('Outcome')
        plt.legend(loc='center right')
        plt.show()