## Logistic Regression

##### Logistic Regression is used to predict a yes or no answer for items in a dataset. For example, this dataset has a series
##### of items you can examine to see if they are risk factors for heart disease.

In [1]:
#import Matplotlib for graphs, NumPy to make arrays, pandas to load CSV file, and scikit-learn for the model
%pip install matplotlib
%pip instal scikit-learn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

Note: you may need to restart the kernel to use updated packages.
ERROR: unknown command "instal" - maybe you meant "install"
Note: you may need to restart the kernel to use updated packages.


In [2]:
#Import data files
HD = pd.read_csv('framingham.csv')

In [3]:
#Examine dataset
HD.shape

(4238, 16)

In [4]:
HD.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [5]:
HD.head(6)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0
5,0,43,2.0,0,0.0,0.0,0,1,0,228.0,180.0,110.0,30.3,77.0,99.0,0


In [6]:
HD.info

<bound method DataFrame.info of       male  age  education  currentSmoker  cigsPerDay  BPMeds  \
0        1   39        4.0              0         0.0     0.0   
1        0   46        2.0              0         0.0     0.0   
2        1   48        1.0              1        20.0     0.0   
3        0   61        3.0              1        30.0     0.0   
4        0   46        3.0              1        23.0     0.0   
...    ...  ...        ...            ...         ...     ...   
4233     1   50        1.0              1         1.0     0.0   
4234     1   51        3.0              1        43.0     0.0   
4235     0   48        2.0              1        20.0     NaN   
4236     0   44        1.0              1        15.0     0.0   
4237     0   52        2.0              0         0.0     0.0   

      prevalentStroke  prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  \
0                   0             0         0    195.0  106.0   70.0  26.97   
1                   0        

In [7]:
#get rid of NaN values
HDC = HD.dropna()

In [8]:
#Statistics like count, mean, standard mediation are calculated using the dataset without NaN values
HDC.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0,3656.0
mean,0.443654,49.55744,1.979759,0.489059,9.022155,0.030361,0.005744,0.311543,0.027079,236.873085,132.368025,82.912062,25.784185,75.73058,81.856127,0.152352
std,0.496883,8.561133,1.022657,0.499949,11.918869,0.171602,0.075581,0.463187,0.162335,44.096223,22.092444,11.974825,4.065913,11.982952,23.910128,0.359411
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.25,144.0,90.0,28.04,82.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


In [9]:
#We choose Total Cholesterol as the possible factor for a ten-year risk of Coronary Heart Disease
X = HDC['totChol']
y = HDC['TenYearCHD']

In [10]:
#Split training and testing data. 70 percent in training, 30 percent in testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size = 0.3)

In [11]:
#Put in an array because sklearn can't use tables, needs a 2 dimensional array
X_train = np.array(X_train).reshape(-1, 1)
y_train = np.array(y_train).reshape(-1, 1)
X_test = np.array(X_test).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

In [12]:
#Use Library for Large Linear Classification, for specific regression model. Moves toward the minimum in one direction at a time
model = LogisticRegression(solver='liblinear', random_state=0)

In [13]:
#"Fit" data to the model
model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [14]:
model.classes_

array([0, 1])

In [15]:
model.intercept_


array([-2.5428759])

In [16]:
model.coef_

array([[0.00354878]])

In [17]:
model.predict_proba(X_test)

array([[0.84623402, 0.15376598],
       [0.85480441, 0.14519559],
       [0.8333485 , 0.1666515 ],
       ...,
       [0.85915371, 0.14084629],
       [0.85034419, 0.14965581],
       [0.86422852, 0.13577148]])

In [18]:
model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
model.score(X_test,y_test)

0.8523245214220602

In [20]:
#Make a confusion matrix to show false negatives and positives
confusion_matrix(y_test, model.predict(X_test))

array([[935,   0],
       [162,   0]])

In [22]:
#Show accuracy, macro average and weighted average
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       935
           1       0.00      0.00      0.00       162

    accuracy                           0.85      1097
   macro avg       0.43      0.50      0.46      1097
weighted avg       0.73      0.85      0.78      1097



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### The results aren't great, given the high number of Type II errors. In this example, that means a large number of people were told they don't have heart disease risk factors when they actually do.<BR>
##### There are a number of reasons why this could have happened:<BR>
##### 1. The sample size is too small.
##### 2. We picked the wrong factor, or need to include more than one factor.
##### 3. We need to adjust hyperparameters.