In [88]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [89]:
# 
# There are two files we want to look at for this. one is called "heart.csv" and the other is called "o2Saturation.csv".
# To practice using SQL, I want to try putting them into an SQL database using SQL Alchemy.
# 
##########################################
#
# from sqlalchemy import create_engine

# engine = create_engine(f"postgresql:///{gallagher})

In [90]:
heart_df = pd.read_csv('../Resources/Files/heart.csv')

In [91]:
o2saturation_df = pd.read_csv('../Resources/Files/o2Saturation.csv')
main_df = heart_df.join(o2saturation_df, how='outer') #using join because it merges on index by default
main_df.rename(columns = {'98.6':'O2 Saturation'}, inplace = True)

In [92]:
main_df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,O2 Saturation
0,63.0,1.0,3.0,145.0,233.0,1.0,0.0,150.0,0.0,2.3,0.0,0.0,1.0,1.0,98.6
1,37.0,1.0,2.0,130.0,250.0,0.0,1.0,187.0,0.0,3.5,0.0,0.0,2.0,1.0,98.6
2,41.0,0.0,1.0,130.0,204.0,0.0,0.0,172.0,0.0,1.4,2.0,0.0,2.0,1.0,98.6
3,56.0,1.0,1.0,120.0,236.0,0.0,1.0,178.0,0.0,0.8,2.0,0.0,2.0,1.0,98.1
4,57.0,0.0,0.0,120.0,354.0,0.0,1.0,163.0,1.0,0.6,2.0,0.0,2.0,1.0,97.5


In [93]:
# datapane is a cool python library that creates really nice dashboards with just a few lines of code
import datapane as dp

dp.login('a3a9a87fbde3a5303484dd8c52c805c4f45b814d')

table = dp.DataTable(main_df)

dp.Report(table).publish(name = 'Heart Attack Data', open=False)

Connected successfully to https://datapane.com as christian6


Publishing report and associated data - please wait..

ConnectionError: ('Connection aborted.', OSError("(54, 'ECONNRESET')"))

In [None]:
# Output column appears to be whether or not the patient in question resulted in a heart attack. We can use that
# as our y set, the x set can be everything else.

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from warnings import simplefilter
simplefilter(action = 'ignore', category = FutureWarning)

main_df = main_df[:300]
X = main_df.drop(columns=['output'])
y = main_df['output']
X_train, X_test = X[:275], X[275:]
y_train, y_test = y[:275], y[275:]

In [None]:
len(y_test), len(X_test)

In [None]:
# for transformation purposes we need to identify and columns that need to be one hot encoded.
# by doing it this way we sort of automate the process, in which case we can recycle this code if we want
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if (X[col].dtype == 'int64' or X[col].dtype == 'float64')]

In [94]:
# in case there are missing values that need to be fudged
numerical_transformer = SimpleImputer() 
# for categorical we use a pipeline to first fill missing values, and then we one hot encode the categoricals.
categorical_transformer = Pipeline(steps = 
                                  [('imputer', SimpleImputer(strategy = 'most_frequent')),
                                  ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

# creating a general preprocessor function to call during a pipeline
preprocessor = ColumnTransformer(transformers = 
                                [('num', numerical_transformer, numerical_cols),
                                ('cat', categorical_transformer, categorical_cols)])

model = GradientBoostingClassifier(random_state = 0)

# here we create a pipeline to preprocess and then build the model.
pipeline = Pipeline(steps = 
                   [('preprocess', preprocessor),
                   ('model', model)])

# now we use the above pipeline in a grid search to find the best parameters
grid = GridSearchCV(pipeline,
                   param_grid = {'model__n_estimators':[500, 1000, 2000, 3000],
                                'model__learning_rate':[0.01, 0.05, 0.1],
                                },
                   cv = 10,
                   scoring = 'neg_mean_absolute_error')

grid.fit(X, y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                        

In [100]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {-1 * grid.best_score_}")

# as we can see, score is not awesome, only 17% accuracy

Best model parameters: {'model__learning_rate': 0.01, 'model__n_estimators': 500}
Best score: 0.17333333333333334


In [96]:
model_best = GradientBoostingClassifier(n_estimators=500, 
                                 learning_rate=0.01,
                                 random_state=0).fit(X_train, y_train)

model_best_features_importance = model_best.feature_importances_

model_best_features_importance

array([0.02997524, 0.02223862, 0.25116839, 0.02466998, 0.07031415,
       0.0014439 , 0.0162506 , 0.06078856, 0.02780892, 0.10268222,
       0.05230098, 0.13504736, 0.18694889, 0.01836219])

In [97]:
# a big part of prediction and ML is figuring out what features are most important. Can graph these features
# alongside the results to find correlation, etc

features_importance_df = pd.DataFrame(model_best_features_importance, index=X.columns,
                     columns=['Importance']).sort_values(by = 'Importance', ascending = False)
features_importance_df

Unnamed: 0,Importance
cp,0.251168
thall,0.186949
caa,0.135047
oldpeak,0.102682
chol,0.070314
thalachh,0.060789
slp,0.052301
age,0.029975
exng,0.027809
trtbps,0.02467


In [98]:
# save test predictions to file
predictions = grid.predict(X_test)
output = pd.DataFrame({'Id': X_test.index, 'Outcome': predictions})
output.to_csv('../Resources/Files/submission.csv', index=False)

In [99]:
output

Unnamed: 0,Id,Outcome
0,275,0.0
1,276,0.0
2,277,0.0
3,278,0.0
4,279,0.0
5,280,0.0
6,281,0.0
7,282,0.0
8,283,0.0
9,284,0.0
