# Lab 7: Heart Attack

### The Data

In [9]:
import pandas as pd
import numpy as np
from plotnine import *
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

import warnings
from sklearn.exceptions import ConvergenceWarning

In [29]:
# Location of the heart-attack CSV used throughout this lab.
HEART_ATTACK_URL = "https://www.dropbox.com/s/aohbr6yb9ifmc8w/heart_attack.csv?dl=1"

# Read the raw table straight from the URL into a DataFrame.
ha = pd.read_csv(HEART_ATTACK_URL)

# Author's note: no missing values were seen; outliers may still be worth checking.
ha

Unnamed: 0,age,sex,cp,trtbps,chol,restecg,thalach,output
0,63,1,3,145,233,0,150,1
1,37,1,2,130,250,1,187,1
2,56,1,1,120,236,1,178,1
3,57,0,0,120,354,1,163,1
4,57,1,0,140,192,1,148,1
...,...,...,...,...,...,...,...,...
268,59,1,0,164,176,0,90,0
269,57,0,0,140,241,1,123,0
270,45,1,3,110,264,1,132,0
271,68,1,0,144,193,1,141,0


In [30]:
# Cast the coded columns to ordered categoricals with explicit level lists.
# Values that are not among the listed levels become NaN.
output_dtype = pd.CategoricalDtype(categories=[0, 1], ordered=True)
cp_dtype = pd.CategoricalDtype(categories=[0, 1, 2, 3], ordered=True)

ha['output'] = ha['output'].astype(output_dtype)
ha['cp'] = ha['cp'].astype(cp_dtype)

In [23]:
def _relabel(column, mapping):
    """Return `column` with every value found in `mapping` replaced by its label.

    Values that are not keys of `mapping` are passed through unchanged rather
    than becoming NaN (which is what `Series.map(dict)` does). This keeps the
    cell safe to re-run and tolerant of columns already in their target form.
    """
    return column.map(lambda value: mapping.get(value, value))


# Mapping for 'sex'
# The CSV already stores sex as 0/1, so mapping the string keys directly would
# turn the whole column into NaN. Only genuine 'female'/'male' strings are
# recoded; existing numeric codes are kept as they are.
sex_mapping = {'female': 0, 'male': 1}
ha['sex'] = _relabel(ha['sex'], sex_mapping)

# Mapping for 'cp'
# The data codes chest-pain type as 0-3 (not 1-4), so the keys must start at 0;
# with keys 1-4 every code-0 row became NaN and the key 4 never matched.
# NOTE(review): assumes codes 0-3 follow the same order as the original 1-4
# labels -- confirm against the dataset's data dictionary.
cp_mapping = {
    0: 'typical angina',
    1: 'atypical angina',
    2: 'non-anginal pain',
    3: 'asymptomatic'
}
ha['cp'] = _relabel(ha['cp'], cp_mapping)

# Mapping for 'restecg'
restecg_mapping = {
    0: 'normal',
    1: 'ST-T wave abnormality',
    2: 'probable/definite left ventricular hypertrophy'
}
ha['restecg'] = _relabel(ha['restecg'], restecg_mapping)

# Mapping for 'output'
output_mapping = {
    0: 'not at risk',
    1: 'at risk'
}
ha['output'] = _relabel(ha['output'], output_mapping)

In [14]:
# Flag 'trtbps' values that fall more than 1.5 * IQR below the first quartile
# or above the third quartile.
Q1, Q3 = ha['trtbps'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean mask: True where the value lies strictly outside the fences.
outliers = ha['trtbps'].lt(lower_fence) | ha['trtbps'].gt(upper_fence)

# Show the flagged rows.
print(ha.loc[outliers])

     age  sex  cp  trtbps  chol  restecg  thalach  output
7     52    1   2     172   199        1      162       1
87    59    1   3     178   270        0      145       1
93    64    0   0     180   325        1      154       1
182   68    1   2     180   274        0      150       0
198   56    0   0     200   288        0      133       0
216   59    0   0     174   249        1      143       0
223   54    1   1     192   283        0      195       0
232   66    0   0     178   228        1      165       0
238   55    0   0     180   327        2      117       0


In [15]:
# Flag 'chol' values that fall more than 1.5 * IQR below the first quartile
# or above the third quartile.
Q1, Q3 = ha['chol'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean mask: True where the value lies strictly outside the fences.
outliers_chol = ha['chol'].lt(lower_fence) | ha['chol'].gt(upper_fence)

# Show the flagged rows.
print(ha.loc[outliers_chol])

     age  sex  cp  trtbps  chol  restecg  thalach  output
23    65    0   2     140   417        0      157       1
75    67    0   2     115   564        0      160       1
83    62    0   0     140   394        0      157       1
196   63    0   0     150   407        0      154       0
221   56    0   0     134   409        0      150       0


In [16]:
# Flag 'thalach' values that fall more than 1.5 * IQR below the first quartile
# or above the third quartile.
Q1, Q3 = ha['thalach'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean mask: True where the value lies strictly outside the fences.
outliers_thalach = ha['thalach'].lt(lower_fence) | ha['thalach'].gt(upper_fence)

# Show the flagged rows.
print(ha.loc[outliers_thalach])

     age  sex  cp  trtbps  chol  restecg  thalach  output
244   67    1   0     120   237        1       71       0
