In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
import pickle
import joblib

In [2]:
# Load the heart dataset CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/heart.csv")
# Echo (rows, columns) as a quick sanity check on the load.
data.shape

(1000, 14)

In [3]:
# Preview the first rows to check column names and typical values.
data.head()

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


In [4]:
# Features: every column except 'patientid' and the label column.
X = data.drop(columns=['patientid', 'target'])
# Label: the 'target' column.
y = data["target"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [5]:
# Row counts of the training and test splits.
X_train.shape[0], X_test.shape[0]

(800, 200)

In [6]:
# Learn the per-feature mean/std from the training split only, then apply the
# same transform to both splits so no test-set statistics reach the scaler.
# NOTE: X_train / X_test are overwritten with the scaled arrays, so re-running
# this cell without re-running the split cell would fit the scaler on
# already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Shape of the scaled training matrix (rows, features).
X_train.shape

(800, 12)

In [8]:
# Per-feature means the scaler learned from the training split.
scaler.mean_

array([4.9766250e+01, 7.6875000e-01, 1.0075000e+00, 1.5193750e+02,
       3.1404625e+02, 3.1250000e-01, 7.5000000e-01, 1.4558625e+02,
       5.0875000e-01, 2.7481250e+00, 1.5600000e+00, 1.2412500e+00])

In [9]:
# Spot-check the range of two scaled feature columns (indices 0 and 2).
col0 = X_train[:, 0]
col2 = X_train[:, 2]
col0.min(), col0.max(), col2.min(), col2.max()

(-1.6725143415733175,
 1.6987823617198097,
 -1.0532897487880573,
 2.08305689772725)

In [10]:
# First scaled training row, to see what the model actually receives.
X_train[0]

array([ 0.40645196, -1.82327297, -1.05328975,  0.93522777,  1.53631292,
       -0.67419986,  0.32547228,  1.35413438, -1.01765584, -0.49016407,
        1.43899306,  1.79450419])

In [11]:
# Train a logistic-regression classifier with default hyper-parameters on the
# scaled training split. fit() returns the estimator itself, so `model` and
# `lr` refer to the same object.
lr = LogisticRegression()
model = lr.fit(X_train, y_train)

# Evaluate on the held-out split.
lr_predict = lr.predict(X_test)
lr_acc_score = accuracy_score(y_test, lr_predict)
lr_conf_matrix = confusion_matrix(y_test, lr_predict)

# Report: confusion matrix, accuracy as a percentage, then per-class metrics.
print("confussion matrix", lr_conf_matrix, "\n", sep="\n")
print("Accuracy of Logistic Regression:", lr_acc_score*100, '\n')
print(classification_report(y_test, lr_predict))

confussion matrix
[[ 92   3]
 [  2 103]]


Accuracy of Logistic Regression: 97.5 

              precision    recall  f1-score   support

           0       0.98      0.97      0.97        95
           1       0.97      0.98      0.98       105

    accuracy                           0.97       200
   macro avg       0.98      0.97      0.97       200
weighted avg       0.98      0.97      0.97       200



In [12]:
# Raw predicted class labels for the held-out rows.
lr_predict

array([1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1], dtype=int64)

In [13]:
# Class-membership probabilities for each held-out row (one column per class).
lr_predict_proba = lr.predict_proba(X_test)
lr_predict_proba.shape

(200, 2)

In [14]:
# Keep only the second probability column, i.e. the class at lr.classes_[1]
# (presumably label 1, given the 0/1 labels in the classification report).
lr_predict_probability = lr_predict_proba[:,1]
lr_predict_probability

array([9.99873491e-01, 9.93955607e-01, 9.99715299e-01, 3.28492090e-03,
       9.54330660e-01, 2.77199626e-02, 9.78709934e-01, 3.91418168e-02,
       9.99998245e-01, 9.66572330e-01, 9.72944260e-01, 9.97779989e-01,
       5.96723958e-05, 9.75360567e-01, 1.46121256e-04, 9.77752831e-01,
       9.99998372e-01, 9.98765685e-01, 1.86527173e-02, 7.19404602e-02,
       6.95410953e-04, 2.56167118e-01, 5.37121749e-04, 9.99895964e-01,
       9.39226976e-01, 7.31287233e-04, 8.93425386e-02, 9.99289957e-01,
       9.98713435e-01, 1.04573218e-02, 3.01254766e-02, 2.91644188e-02,
       9.93920962e-01, 9.99997648e-01, 9.99766574e-01, 9.80937191e-01,
       9.99998514e-01, 1.40852780e-02, 3.70550449e-03, 9.17631277e-01,
       3.29007861e-02, 2.38874292e-01, 9.83508694e-01, 3.02269416e-02,
       9.99997294e-01, 9.99725274e-01, 1.85448272e-03, 2.58104940e-03,
       5.29169054e-02, 9.11284832e-01, 1.81170160e-04, 6.59719688e-02,
       9.06410404e-01, 9.76853602e-01, 2.21265774e-01, 9.98076700e-01,
      

In [15]:
# Confirm the column slice produced a 1-D array, one probability per test row.
lr_predict_probability.shape

(200,)

In [16]:
# Save the trained model to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open(...) passed to pickle.dump is never closed
# explicitly.
filename = '../saved_models/logistic_regression_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [17]:
# Save the fitted scaler to disk so inference can apply the same scaling that
# was used for training. joblib.dump returns the list of written file names.
scaler_filename = "../saved_models/logistic_regression_model_scaler"
joblib.dump(scaler, scaler_filename) 

['../saved_models/logistic_regression_model_scaler']

# Front End

In [18]:
# Load the scaler that was persisted after training.
# Note: this rebinds the notebook-level name `scaler` to the deserialized object.
scaler_filename = "../saved_models/logistic_regression_model_scaler"
scaler = joblib.load(scaler_filename)

In [19]:
# Load the trained model from disk.
# The context manager closes the file handle deterministically; a bare
# open(...) passed to pickle.load is never closed explicitly.
# SECURITY: pickle.load can execute arbitrary code, so only load model files
# produced by this project, never files from an untrusted source.
filename = '../saved_models/logistic_regression_model.sav'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
# Prediction - Based on FE values
# The scaler was fitted on a DataFrame, so the single input row is passed as a
# DataFrame with the same column names, in training order (every column of the
# CSV except 'patientid' and 'target'). A bare ndarray gives scikit-learn no
# way to check column order, and recent versions warn about the missing
# feature names.
feature_names = [
    'age', 'gender', 'chestpain', 'restingBP', 'serumcholestrol',
    'fastingbloodsugar', 'restingrelectro', 'maxheartrate',
    'exerciseangia', 'oldpeak', 'slope', 'noofmajorvessels',
]
features_values = pd.DataFrame(
    [[53, 1, 2, 171, 0, 0, 1, 147, 0, 5.3, 3, 3]],
    columns=feature_names,
)

In [21]:
# Must be (1, n_features): one row, same column count the scaler was fitted on.
features_values.shape

(1, 12)

In [22]:
# Apply the persisted training-time scaling to the front-end input before predicting.
scaled_features_values = scaler.transform(features_values)
scaled_features_values.shape



(1, 12)

In [23]:
# Predicted class label for the single scaled input row.
fe_model_output = loaded_model.predict(scaled_features_values)
fe_model_output.shape

(1,)

In [24]:
# Show the predicted label itself.
fe_model_output

array([1], dtype=int64)

In [25]:
# Class-membership probabilities for the single input row (one column per class).
fe_model_proba_output = loaded_model.predict_proba(scaled_features_values)
fe_model_proba_output.shape

(1, 2)

In [26]:
# Keep only the second probability column, i.e. the class at
# loaded_model.classes_[1] (presumably label 1; confirm against classes_).
fe_predict_probability = fe_model_proba_output[:,1]
fe_predict_probability

array([0.99987987])