In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Local data directory, expressed relative to the notebook.
# NOTE(review): the read_csv calls in this notebook use hard-coded
# '../input/...' paths rather than DATA_DIR — confirm this constant is still needed.
DATA_DIR = Path('..', 'data', 'final', 'public')

In [None]:
# Training features and labels; both tables use the patient_id column as index.
train_values = pd.read_csv(
    '../input/warm-up-machine-learning-with-a-heart/train_values.csv',
    index_col='patient_id',
)
train_labels = pd.read_csv(
    '../input/warm-up-machine-learning-with-a-heart/train_labels.csv',
    index_col='patient_id',
)

In [None]:
# Preview the first rows of the feature table.
train_values.head()

# Reference: http://drivendata.co/blog/machine-learning-with-a-heart-benchmark/

In [None]:
# Show the dtype pandas inferred for each feature column.
train_values.dtypes

In [None]:
# Preview the first rows of the label table.
train_labels.head()

In [None]:
# Bar chart of how often each value of the target column occurs.
disease_counts = train_labels['heart_disease_present'].value_counts()
disease_counts.plot.bar(title='Number with Heart Disease')

In [None]:
# Columns used as model inputs for the rest of the notebook.
selected_features = [
    'age',
    'sex',
    'max_heart_rate_achieved',
    'resting_blood_pressure',
]

# Training features restricted to the selected columns.
train_values_subset = train_values.loc[:, selected_features]

In [None]:
# Attach the labels to the features so the pairwise plots can be coloured
# by the target column.
labelled_values = train_values.join(train_labels)
sns.pairplot(labelled_values,
             vars=selected_features,
             hue='heart_disease_present')

In [None]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.linear_model import LogisticRegression

# for combining the preprocess with model training
from sklearn.pipeline import Pipeline

# for optimizing parameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Two-step pipeline: standardize the features, then fit a logistic regression.
#
# The solver is set explicitly to 'liblinear' because the grid search over
# this pipeline tries both 'l1' and 'l2' penalties. The default solver in
# current scikit-learn releases (lbfgs) does not support 'l1', so every 'l1'
# candidate would fail to fit and be dropped from the search with a NaN score.
# liblinear supports both penalties.
pipe = Pipeline(steps=[('scale', StandardScaler()),
                       ('logistic', LogisticRegression(solver='liblinear'))])
pipe

In [None]:
# Candidate settings for the 'logistic' step of the pipeline: the inverse
# regularization strength C and the penalty type.
param_grid = {'logistic__C': [0.0001, 0.001, 0.01, 1, 10],
              'logistic__penalty': ['l1', 'l2']}

# 3-fold cross-validated search over the grid.
# Candidates are ranked by (negated) log loss rather than the default score
# (accuracy for a classifier), because this notebook evaluates the model's
# predicted probabilities with log_loss; selecting on the same metric keeps
# tuning and evaluation consistent.
gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='neg_log_loss',
                  cv=3)

In [None]:
# Run the grid search on the selected training features against the target column.
target = train_labels['heart_disease_present']
gs.fit(train_values_subset, target)


In [None]:
# Parameter combination the grid search ranked best.
gs.best_params_

In [None]:
from sklearn.metrics import log_loss

# Log loss on the same rows the search was fitted on. This is an in-sample
# figure, so it is not an estimate of performance on unseen data.
true_labels = train_labels['heart_disease_present']
in_sample_preds = gs.predict_proba(train_values.loc[:, selected_features])
log_loss(true_labels, in_sample_preds)

In [None]:
# Test-set features, indexed by the patient_id column.
test_values = pd.read_csv(
    '../input/warm-up-machine-learning-with-a-heart/test_values.csv',
    index_col='patient_id',
)

In [None]:
# Keep only the columns the model was trained on.
test_values_subset = test_values.loc[:, selected_features]


In [None]:
# predict_proba returns one column per class; keep the second column.
# NOTE(review): presumably this is the probability of heart disease being
# present (label 1) — confirm against gs.classes_.
test_probabilities = gs.predict_proba(test_values_subset)
predictions = test_probabilities[:, 1]


In [None]:
# Template file that supplies the index and column layout for the submission.
submission_format = pd.read_csv(
    '../input/format/submission_format.csv',
    index_col='patient_id',
)


In [None]:
# Put the predictions into a frame that reuses the template's index and columns.
# NOTE(review): predictions are attached positionally, so this assumes the
# test rows are in the same order as the template's index — confirm.
my_submission = pd.DataFrame(
    predictions,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [None]:
# Preview the first rows of the submission before it is written out.
my_submission.head()


In [None]:
# Write the submission to the notebook's working directory instead of
# '../input'. The '../input' tree is where this notebook reads its source
# datasets from.
# NOTE(review): the paths look like a Kaggle layout, where the input mount is
# read-only and writing there fails — confirm the intended output location.
my_submission.to_csv('solution.csv')