Worked on by Andy McRae

# Logistic Regression, Larger Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Loading Cleaned Data

In [None]:
# Location of the cleaned heart-disease CSV, relative to this notebook
filename = '../../data/large_heart_disease_dataset.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(filename)

# Preview the first rows (shown as the cell output)
df.head()

age: age in years

sex: sex (1 = male; 0 = female)

cp: chest pain type (Value 0: typical angina; Value 1: atypical angina; Value 2: non-anginal pain; Value 3: asymptomatic)

trestbps: resting blood pressure in mm Hg on admission to the hospital

chol: serum cholesterol in mg/dl

fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

restecg: resting electrocardiographic results (Value 0: normal; Value 1: having ST-T wave abnormality; Value 2: probable or definite left ventricular hypertrophy)

thalach: maximum heart rate achieved

exang: exercise induced angina (1 = yes; 0 = no)

oldpeak: ST depression induced by exercise relative to rest

slope: the slope of the peak exercise ST segment (Value 0: upsloping; Value 1: flat; Value 2: downsloping)

ca: number of major vessels (0-3) colored by fluoroscopy

thal: thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect)

target: heart disease (0 = no, 1 = yes)

## Selecting Features

In [None]:
# Output to predict: the 'target' column
y = df['target']

# Model inputs: every column of the DataFrame except 'target'
X = df.drop(columns=['target'])

## Train-Test-Split

In [None]:
# Split features and labels into training and testing portions.
# No test_size is given, so the library's default proportion applies;
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=21,
)

## Scaling

In [None]:
# importing scaling
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the training features only, so the test set does not
# influence the learned minimum/maximum of each column
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

# Apply the same fitted scaling to both splits
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Training Model

In [None]:
# Create an untrained logistic regression classifier; no arguments are
# passed, so every hyper-parameter is left at the library default
model = LogisticRegression()

In [None]:
# Train the classifier on the scaled training features and their labels
model.fit(X=X_train_scaled, y=y_train)

## Quantifying Model

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predict a class label for every row of the scaled test set; these are
# used by the classification report below
predictions = model.predict(X=X_test_scaled)

In [None]:
# Score the fitted model on both splits to compare fit vs. generalisation
train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

# Per-class metrics on the test set, with readable names for labels 0 and 1
class_names = ["No Heart Disease", "Heart Disease"]
report = classification_report(y_test, predictions, target_names=class_names)
print(report)

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Build the hyper-parameter grid and the cross-validated search over it.
# The choice of parameters was inspired by:
# https://towardsdatascience.com/grid-search-for-model-tuning-3319b259367e
#
# The solver is pinned to 'liblinear' inside the grid because `model` was
# created with the default solver ('lbfgs'), which does not support the
# 'l1' penalty. Without this entry every 'l1' candidate fails to fit, so
# half of the grid would never actually be evaluated. 'liblinear' supports
# both 'l1' and 'l2'.
param_grid = {'C': [1, 5, 10, 50],
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear']}

# GridSearchCV works on clones of `model`, so `model` itself is not changed
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Run the cross-validated search over the grid using the scaled training data
grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Show the parameter combination the grid search selected for this dataset
best_params = grid.best_params_
print(best_params)

In [None]:
# Show the cross-validated score achieved by the selected combination
best_score = grid.best_score_
print(best_score)