# Logistic Regression
### Dealing with Categorical Variables

In [1]:
import pandas as pd
import numpy as np

#from sklearn.model_selection import train_test_split
#from sklearn.model_selection import KFold
#from sklearn.feature_extraction import DictVectorizer
#from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Source: a Kaggle dataset that combines 4 datasets to predict heart disease
# https://www.kaggle.com/imnikhilanand/heart-attack-prediction
csv_path = 'logistic_regression_data.csv'

# Read the CSV into the working DataFrame used by the rest of the notebook
df = pd.read_csv(csv_path)

# Data cleaning

In [3]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,age,gender,chest_pain,resting_bps,chol,fasting_blood_sugar,resting_ecg,max_heartrate,ex_angina,oldpeak,target
0,28,1,2,130.0,132.0,0.0,2.0,185.0,0.0,0.0,0
1,29,1,2,120.0,243.0,0.0,0.0,160.0,0.0,0.0,0
2,29,1,2,140.0,,0.0,0.0,170.0,0.0,0.0,0
3,30,0,1,170.0,237.0,0.0,1.0,170.0,0.0,0.0,0
4,31,0,2,100.0,219.0,0.0,1.0,150.0,0.0,0.0,0


In [4]:
# Print the (rows, columns) of the frame, then show the number of
# missing values in each column as the cell output
print(df.shape)
df.isna().sum()

(294, 11)


age                     0
gender                  0
chest_pain              0
resting_bps             1
chol                   23
fasting_blood_sugar     8
resting_ecg             1
max_heartrate           1
ex_angina               1
oldpeak                 0
target                  0
dtype: int64

### To remove rows with missing values

In [5]:
# Shape before cleaning, for comparison with the post-drop shape printed below
print(df.shape)

# Drop every row that contains at least one missing value.
# The result is rebound to `df` rather than using inplace=True, so the frame
# object that was originally loaded is not mutated and the step stays chainable.
# The original index labels are kept (the index is not reset).
df = df.dropna()

# Guard: no missing values may remain after this step
assert not df.isna().any().any(), "missing values remain after dropna()"

# Shape after the rows with missing values have been removed
print(df.shape)

(294, 11)
(261, 11)


In [6]:
# Per-column count of missing values after cleaning
df.isna().sum()

age                    0
gender                 0
chest_pain             0
resting_bps            0
chol                   0
fasting_blood_sugar    0
resting_ecg            0
max_heartrate          0
ex_angina              0
oldpeak                0
target                 0
dtype: int64

In [7]:
#from https://www.justintodata.com/logistic-regression-example-in-python/
# 5 Categorical variables are 
# gender (1 = male, 0 = female), 
# chest_pain (1 = typical angina, 2 = atypical angina, 3 = non-anginal pain, 4 = asymptomatic)
# fasting_blood_sugar > 120 mg/dl (1 = true; 0 = false)
# resting_ecg (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy)
# ex_angina is for exercise-induced angina (1 = yes; 0 = no)

# Among the 5 categorical variables, gender, fasting_blood_sugar, and ex_angina only have two levels of 0 and 1, 
# so they are already in the dummy variable format. 
# But we still need to convert chest_pain and resting_ecg into dummy variables.

In [8]:
# Frequency of each chest_pain level (NaN would be counted too, if present)
chest_pain_col = df['chest_pain']
chest_pain_col.value_counts(dropna=False)

4    113
2     92
3     46
1     10
Name: chest_pain, dtype: int64

In [9]:
# Frequency of each resting_ecg level (NaN would be counted too, if present)
resting_ecg_col = df['resting_ecg']
resting_ecg_col.value_counts(dropna=False)

0.0    208
1.0     47
2.0      6
Name: resting_ecg, dtype: int64

In [10]:
# One-hot encode the two multi-level categorical columns; the source columns
# are replaced by one indicator column per level.
# drop_first=False keeps an indicator for every level.
# NOTE(review): a full set of indicators is collinear with a model intercept --
# confirm this is intended for the regression fitted later.
multi_level_cols = ['chest_pain', 'resting_ecg']
df = pd.get_dummies(data=df, columns=multi_level_cols, drop_first=False)

# Display the encoded frame
df

Unnamed: 0,age,gender,resting_bps,chol,fasting_blood_sugar,max_heartrate,ex_angina,oldpeak,target,chest_pain_1,chest_pain_2,chest_pain_3,chest_pain_4,resting_ecg_0.0,resting_ecg_1.0,resting_ecg_2.0
0,28,1,130.0,132.0,0.0,185.0,0.0,0.0,0,0,1,0,0,0,0,1
1,29,1,120.0,243.0,0.0,160.0,0.0,0.0,0,0,1,0,0,1,0,0
3,30,0,170.0,237.0,0.0,170.0,0.0,0.0,0,1,0,0,0,0,1,0
4,31,0,100.0,219.0,0.0,150.0,0.0,0.0,0,0,1,0,0,0,1,0
5,32,0,105.0,198.0,0.0,165.0,0.0,0.0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,52,1,160.0,331.0,0.0,94.0,1.0,2.5,1,0,0,0,1,1,0,0
290,54,0,130.0,294.0,0.0,100.0,1.0,0.0,1,0,0,1,0,0,1,0
291,56,1,155.0,342.0,1.0,150.0,1.0,3.0,1,0,0,0,1,1,0,0
292,58,0,180.0,393.0,0.0,110.0,1.0,1.0,1,0,1,0,0,1,0,0
