# Import Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Import Data

In [3]:
# Load the training features, indexed by patient_id.
# NOTE(review): later cells read from 'data_heart/' and 'data/' instead of 'Data/' —
# confirm which directory is the current one.
X = pd.read_csv('Data/train_values.csv', index_col='patient_id')

In [4]:
# Load the training labels (indexed by patient_id) and keep only the target column as a Series.
y = pd.read_csv('Data/train_labels.csv', index_col='patient_id')['heart_disease_present']


In [5]:
# Class balance of the target.
y.value_counts()

0    100
1     80
Name: heart_disease_present, dtype: int64

In [6]:
# Build a separate EDA frame that carries the target next to the features.
# `DataFrame.assign` returns a new frame, so `X` itself never gains the label
# column; a plain `X_new = X` would only alias `X` and write the target into
# the feature matrix used for model fitting.
X_new = X.assign(heart_disease_present=y)
X_new.head()

Unnamed: 0_level_0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_disease_present
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,0
ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,1
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,0


# EDA

In [7]:
# Column groups used by the EDA plots and the preprocessing pipelines.

# Columns fed to the models as plain numbers (no encoding applied).
numerical_features = [
    'slope_of_peak_exercise_st_segment',
    'resting_blood_pressure',
    'num_major_vessels',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'age',
    'max_heart_rate_achieved',
]

# Columns that are label/one-hot encoded before modelling.
categorical_features = [
    'thal',
    'chest_pain_type',
    'resting_ekg_results',
]

# Two-valued columns that are passed through unchanged.
binary_features = [
    'sex',
    'exercise_induced_angina',
]



In [13]:
# Columns for the correlation table: the numeric features plus the target.
# List concatenation builds a new list; appending to an alias of
# `numerical_features` would add the target to the feature list itself and
# feed the label to every pipeline that selects `numerical_features`.
col_names = numerical_features + ['heart_disease_present']
col_names

['slope_of_peak_exercise_st_segment',
 'resting_blood_pressure',
 'num_major_vessels',
 'fasting_blood_sugar_gt_120_mg_per_dl',
 'serum_cholesterol_mg_per_dl',
 'oldpeak_eq_st_depression',
 'age',
 'max_heart_rate_achieved',
 'heart_disease_present']

In [14]:
# Pairwise correlation of the selected numeric columns and the target.
X_new[col_names].corr()

Unnamed: 0,slope_of_peak_exercise_st_segment,resting_blood_pressure,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,age,max_heart_rate_achieved,heart_disease_present
slope_of_peak_exercise_st_segment,1.0,0.098287,0.076832,0.050199,-0.032348,0.615948,0.169918,-0.418102,0.344224
resting_blood_pressure,0.098287,1.0,0.042388,0.16657,0.144881,0.219026,0.284402,-0.017521,0.078506
num_major_vessels,0.076832,0.042388,1.0,0.169792,0.098348,0.214062,0.347355,-0.275687,0.421519
fasting_blood_sugar_gt_120_mg_per_dl,0.050199,0.16657,0.169792,1.0,0.02756,-0.039055,0.176101,0.058369,0.003379
serum_cholesterol_mg_per_dl,-0.032348,0.144881,0.098348,0.02756,1.0,-0.021932,0.236211,-0.071038,0.079775
oldpeak_eq_st_depression,0.615948,0.219026,0.214062,-0.039055,-0.021932,1.0,0.1897,-0.341045,0.38293
age,0.169918,0.284402,0.347355,0.176101,0.236211,0.1897,1.0,-0.39463,0.138255
max_heart_rate_achieved,-0.418102,-0.017521,-0.275687,0.058369,-0.071038,-0.341045,-0.39463,1.0,-0.375352
heart_disease_present,0.344224,0.078506,0.421519,0.003379,0.079775,0.38293,0.138255,-0.375352,1.0


In [None]:
# Heat map of the pairwise correlations between the numeric columns of X.
# The tick labels are taken from the correlation matrix itself: non-numeric
# columns (e.g. 'thal') are not part of it, so labelling the axes with
# X.columns would shift every label after the first dropped column.
corr = X.select_dtypes(include='number').corr()
plt.matshow(corr)
plt.xticks(range(corr.shape[1]), corr.columns, rotation=90)
plt.yticks(range(corr.shape[0]), corr.index)
plt.colorbar()
plt.show();

In [None]:
# One violin plot per numeric feature, split by the target class.
# squeeze=False keeps `axes` two-dimensional even for a single feature.
fig, axes = plt.subplots(ncols=1, nrows=len(numerical_features),
                         figsize=(10, 70), squeeze=False)

for ax, column in zip(axes[:, 0], numerical_features):
    # x/y are passed by keyword: recent seaborn releases reject positional data arguments.
    sns.violinplot(x=y, y=X[column], ax=ax)

plt.tight_layout()

In [None]:
# Peek at the first three labels.
y[:3]

In [None]:
# Index labels (patient ids) of the rows where sex == 0.
sex_idx = X[X['sex'] == 0].index

In [None]:
#y[sex_idx]

In [None]:
# First rows of the sex == 0 group.
X.groupby('sex').get_group(0).head()

# Transformers

In [46]:
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a fixed set of columns.

    Parameters
    ----------
    columns : list
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``, wrapping non-DataFrame input in a DataFrame first."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        return frame[self.columns]
    
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Apply an independent ``LabelEncoder`` to every column of a DataFrame."""

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per column and store them in ``self.labelers``."""
        encoders = {}
        for name in X:
            encoders[name] = LabelEncoder().fit(X[name])
        self.labelers = encoders
        return self

    def transform(self, X):
        """Return a new DataFrame holding the integer codes of every column of ``X``."""
        encoded = {}
        for name in X:
            encoded[name] = self.labelers[name].transform(X[name])
        return pd.DataFrame(encoded)

In [None]:
# Column names of the feature frame as a plain list.
list(X.columns)

In [47]:
class CustomOneHot(BaseEstimator, TransformerMixin):
    """One-hot encode every column of a DataFrame with a layout fixed at fit time.

    ``pd.get_dummies`` derives its output columns from the values present in
    the frame it is given. Encoding a second frame (e.g. the test set) on its
    own can therefore yield fewer, more, or differently ordered columns than
    the training matrix. ``fit`` records the dummy columns seen on the fitting
    data and ``transform`` always returns exactly those columns.
    """

    def fit(self, X, y=None):
        """Remember the input columns and the dummy columns they expand to."""
        self.columns = list(X.columns)
        self.dummy_columns_ = list(
            pd.get_dummies(X, columns=self.columns).columns
        )
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns learned in ``fit``.

        Categories that were not seen during ``fit`` are dropped; categories
        missing from ``X`` are added as all-zero columns.
        """
        dummies = pd.get_dummies(X, columns=self.columns)
        return dummies.reindex(columns=self.dummy_columns_, fill_value=0)

In [None]:
# Scratch calculation.
# NOTE(review): presumably 180 is the number of training rows — confirm the purpose or delete this cell.
180*0.05/3

In [48]:
class CustomOutlier(BaseEstimator, TransformerMixin):
    """Drop the rows that an ``IsolationForest`` flags as outliers.

    Unlike a regular scikit-learn transformer, ``transform`` returns a
    ``(y, X)`` tuple holding the filtered labels and the filtered features,
    so this class is meant to be called directly rather than placed inside
    a ``Pipeline``.

    Parameters
    ----------
    estimators : int, default 100
        Number of trees in the isolation forest.
    contam : float, default 0.2
        Expected proportion of outliers (``contamination``).
    random_state : int or None, default None
        Seed forwarded to the isolation forest; set it to make the set of
        removed rows reproducible.
    """

    def __init__(self, estimators=100, contam = 0.2, random_state=None):
        self.estimators = estimators
        self.contam = contam
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the forest on ``X`` and remember which rows are inliers."""
        clf = IsolationForest(n_estimators=self.estimators,
                              contamination=self.contam,
                              random_state=self.random_state)
        clf.fit(X)
        # The forest's predict() labels inliers +1 and outliers -1.
        self.y_pred = clf.predict(X)
        self.y = y[self.y_pred == 1]
        return self

    def transform(self, X):
        """Return ``(filtered_y, filtered_X)`` for the frame that was passed to ``fit``.

        Raises
        ------
        ValueError
            If ``X`` does not have the same number of rows as the fitted data;
            the stored inlier mask only applies to that frame.
        """
        if len(X) != len(self.y_pred):
            raise ValueError(
                "CustomOutlier.transform must receive the data it was fitted on "
                "(%d rows expected, got %d)" % (len(self.y_pred), len(X))
            )
        return self.y, X.loc[X.index[self.y_pred == 1]]

In [None]:
# Try the outlier filter on the numeric features; it returns the kept labels and the kept rows.
cc = CustomOutlier()
yy, XX = cc.fit_transform(X[numerical_features], y)


In [None]:
# First labels that survived the outlier filter.
yy.head()

# Attempt 1: Logistic Regression

In [None]:
# Categorical branch: select the columns, integer-code each one, then one-hot encode.
cat_pipe = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(categorical_features)),
    ('cle', CustomLabelEncoder()),
    ('ohe', OneHotEncoder(sparse=False)),
])

# Pass-through branch: numeric and binary columns are used as they are.
passthrough_pipe = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(numerical_features + binary_features)),
])

# Both branches side by side form the feature matrix.
feat_u = FeatureUnion(transformer_list=[
    ('cat_pipe', cat_pipe),
    ('passthrough_pipe', passthrough_pipe),
])

# Feature construction followed by a default logistic regression.
full_model = Pipeline(steps=[
    ('feat_u', feat_u),
    ('lr', LogisticRegression()),
])

full_model.fit(X, y)

In [None]:
# Load the held-out features and the submission template.
X_test = pd.read_csv('data_heart/test_values.csv', index_col='patient_id')
submission = pd.read_csv('data_heart/submission_format.csv')

# Class probabilities for every test row (one column per class).
y_pred = full_model.predict_proba(X_test)

# Sanity check: the class probabilities of the first row should add up to 1.
sum(y_pred[0])
# submission
# print('ypred')
# print(y_pred)
# submission.heart_disease_present = y_pred

# print('subm')
# print(submission)
#submission.to_csv('2019-08-14_submission.csv', index=False)

# Attempt 2: Logistic with PCA

In [None]:
# Categorical branch: select the columns, integer-code each one, then one-hot encode.
cat_pipe = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(categorical_features)),
    ('cle', CustomLabelEncoder()),
    ('ohe', OneHotEncoder(sparse=False)),
])

# Pass-through branch: numeric and binary columns are used as they are.
passthrough_pipe = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(numerical_features + binary_features)),
])

# Both branches side by side form the feature matrix.
feat_u = FeatureUnion(transformer_list=[
    ('cat_pipe', cat_pipe),
    ('passthrough_pipe', passthrough_pipe),
])

# Same features as Attempt 1, projected onto 4 principal components
# before the logistic regression.
full_model = Pipeline(steps=[
    ('feat_u', feat_u),
    ('pca', PCA(n_components=4)),
    ('lr', LogisticRegression()),
])

full_model.fit(X, y)

In [None]:
# Share of variance captured by each of the fitted principal components.
full_model.named_steps['pca'].explained_variance_ratio_

In [None]:
# Score the held-out rows with the Attempt 2 model and write a submission file.
X_test = pd.read_csv('data/test_values.csv', index_col='patient_id')
submission = pd.read_csv('data/submission_format.csv')

# predict_proba returns one column per class; the submission column holds a
# single value per patient, namely the probability of the positive class.
y_pred = full_model.predict_proba(X_test)[:, 1]
submission['heart_disease_present'] = y_pred

submission.to_csv('2019-08-14a_submission.csv', index=False)

# Logistic Regression, reduced features

In [None]:
# Plain logistic regression with default settings, fitted in the next cell.
simple_lr = LogisticRegression()

In [None]:
# Logistic regression on three hand-picked columns.
reduced_cols = ['num_major_vessels',
                'slope_of_peak_exercise_st_segment',
                'max_heart_rate_achieved']

simple_lr.fit(X[reduced_cols], y)

# Log loss is defined on predicted probabilities. Feeding it hard 0/1 labels
# from predict() scores every mistake as a maximally confident one, so the
# positive-class probability is used instead.
# Note: this is the loss on the training rows, not a held-out estimate.
y_pred = simple_lr.predict_proba(X[reduced_cols])[:, 1]
log_loss(y, y_pred)

In [None]:
# Columns currently present in the feature frame.
X.columns

In [None]:
# Hyper-parameter grid for a random forest.
param_grid = {'max_depth': range(3, 10),
              'min_samples_split': range(3, 11, 2),
              'min_samples_leaf': range(3, 11, 2)}

# The search is scored with negative log loss, like the other grid searches
# in this notebook, instead of the default accuracy. The forest is seeded so
# that the selected parameters are reproducible.
gs = GridSearchCV(RandomForestClassifier(random_state=0),
                  param_grid=param_grid,
                  scoring='neg_log_loss',
                  n_jobs=3,
                  cv=3,
                  verbose=1)


In [None]:
# Run the grid search on the raw feature frame.
# NOTE(review): X still holds the string column 'thal' (see the head() output above),
# which a random forest presumably cannot consume without encoding — confirm this cell runs.
gs.fit(X, y)

In [None]:
# Parameter combination selected by the grid search.
gs.best_params_

In [None]:
# Positive-class probabilities on the training rows. Probabilities (not the
# hard labels from predict) are what a log-loss evaluation needs.
y_pred = gs.predict_proba(X)[:, 1]

In [None]:
# Log loss of `y_pred` against the training labels (an in-sample figure).
log_loss(y, y_pred, labels = None)

# Attempt 3: SVC with Grid Search (PCA and Standard Scaler steps disabled)

In [None]:
# Column selection followed by the outlier filter.
# NOTE(review): CustomOutlier.transform returns a (y, X) tuple rather than a single
# array, so it looks unusable as a Pipeline step, and `passthrough_pipe` is
# reassigned in the next cell — confirm whether this cell can be removed.
passthrough_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(numerical_features)),
    ('col', CustomOutlier())
])

In [None]:
# Attempt 3: grid-search an RBF-kernel SVC on a subset of the numeric features.
# The scaling and dimensionality-reduction steps tried earlier are disabled,
# so the classifier sees the selected columns unchanged.

# Numeric columns fed to the model ('age' is deliberately left out here).
numerical_features = [
    'slope_of_peak_exercise_st_segment',
    'resting_blood_pressure',
    'num_major_vessels',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'max_heart_rate_achieved',
]

# Defined for reference; the categorical branch is not part of this attempt.
categorical_features = ['chest_pain_type']

# Defined for reference; not selected by the pipeline below.
binary_features = ['exercise_induced_angina']

# Feature construction: only the numeric pass-through branch is active.
passthrough_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(numerical_features)),
])

feat_u = FeatureUnion([
    ('passthrough_pipe', passthrough_pipe),
])

Xt = feat_u.fit_transform(X)

# probability=True makes the SVC fit an internal, randomised calibration;
# seeding it keeps the log-loss scores reproducible between runs.
full_model = Pipeline([
    ('lr', SVC(probability=True, random_state=0)),
])

# 20 x 20 grid over the regularisation strength and the RBF kernel width.
param_grid = [
    {
        'lr__C': np.logspace(-8, 3, 20),
        'lr__kernel': ['rbf'],
        'lr__gamma': np.logspace(-8, 3, 20),
    },
]

gs = GridSearchCV(full_model,
                  param_grid=param_grid,
                  scoring='neg_log_loss',
                  n_jobs=3,
                  cv=2,
                  verbose=1)

gs.fit(Xt, y.values)
gs.best_score_

In [None]:
# Best cross-validated score of the search (negative log loss, so closer to 0 is better).
gs.best_score_

In [None]:
# Score the held-out rows with the Attempt 3 search result and write a submission file.
X_test = pd.read_csv('data_heart/test_values.csv', index_col='patient_id')
submission = pd.read_csv('data_heart/submission_format.csv')

# Apply the feature pipeline as fitted on the training data; the test set
# must only be transformed, never used to refit preprocessing.
Xtt = feat_u.transform(X_test)
y_pred = gs.predict_proba(Xtt)[:, 1]
submission['heart_disease_present'] = y_pred

submission.to_csv('2019-08-21_submission.csv', index=False)

# Attempt 4: Model fitting based on a categorical variable (sex)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Reload the raw training and test data for this attempt.
X = pd.read_csv('data_heart/train_values.csv', index_col='patient_id')
y = pd.read_csv('data_heart/train_labels.csv', index_col='patient_id')['heart_disease_present']
X_test = pd.read_csv('data_heart/test_values.csv', index_col='patient_id')

# Training rows with sex == 0.
# NOTE(review): these rows are named "male" here — verify against the dataset's
# data dictionary that 0 really encodes male.
X_male = X[X['sex'] == 0]

# Test rows with sex == 0.
X_test_male = X_test[X_test['sex'] == 0]

# Labels of the sex == 0 training rows, looked up by patient id.
sex_idx_male = X_male.index
y_male = y[sex_idx_male]
 

In [None]:
def predict_probs(X_train, y_train, X_test):
    """Fit a grid-searched AdaBoost model and score the test rows.

    The categorical columns are one-hot encoded, a fixed subset of numeric
    and binary columns is passed through unchanged, and an AdaBoost ensemble
    of depth-7 decision trees is tuned over its learning rate with 2-fold
    cross-validation on negative log loss.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every column listed below.
    y_train : pandas.Series
        Training labels; only ``.values`` is used.
    X_test : pandas.DataFrame
        Rows to score, with the same columns as ``X_train``.

    Returns
    -------
    tuple
        ``(best_score, y_pred)``: the best cross-validated score of the search
        and the predicted positive-class probability for every row of ``X_test``.
    """
    numerical_features = [
        'slope_of_peak_exercise_st_segment',
        'num_major_vessels',
        'fasting_blood_sugar_gt_120_mg_per_dl',
        'serum_cholesterol_mg_per_dl',
        'oldpeak_eq_st_depression',
        'max_heart_rate_achieved',
    ]

    categorical_features = [
        'thal',
        'chest_pain_type',
        'resting_ekg_results',
    ]

    binary_features = [
        'exercise_induced_angina',
    ]

    # Categorical branch: select, then one-hot encode.
    cat_pipe = Pipeline([
        ('cst', ColumnSelectTransformer(categorical_features)),
        ('cle', CustomOneHot()),
    ])

    # Pass-through branch: numeric and binary columns are used as they are.
    passthrough_pipe = Pipeline([
        ('cst', ColumnSelectTransformer(numerical_features + binary_features)),
    ])

    feat_u = FeatureUnion([
        ('cat_pipe', cat_pipe),
        ('passthrough_pipe', passthrough_pipe),
    ])

    Xt = feat_u.fit_transform(X_train)

    # Boosted depth-7 trees; both estimators are seeded for reproducibility.
    cl_dt = DecisionTreeClassifier(max_depth=7, random_state=0)
    clf = AdaBoostClassifier(cl_dt, random_state=0)

    full_model = Pipeline([
        ('lr', clf),
    ])

    # Only the learning rate is searched; the ensemble size is fixed at 7.
    param_grid = [{
        'lr__learning_rate': np.logspace(-10, 0, 20),
        'lr__n_estimators': [7],
    }]

    gs = GridSearchCV(full_model,
                      param_grid=param_grid,
                      scoring='neg_log_loss',
                      n_jobs=3,
                      cv=2,
                      verbose=1)

    gs.fit(Xt, y_train.values)

    # Reuse the feature pipeline fitted on the training data; the test rows
    # are only transformed, never used to refit preprocessing.
    Xtt = feat_u.transform(X_test)
    y_pred = gs.predict_proba(Xtt)[:, 1]

    return gs.best_score_, y_pred

In [None]:
# Fit on the full training set and score the full test set.
score, y_prob = predict_probs(X, y, X_test)

# Best cross-validated score and the predicted probabilities.
print('score = ', score)
print('probs = ', y_prob)

In [None]:
# Write the Attempt 4 probabilities into the submission template.
# assumes the template rows are in the same order as test_values.csv — TODO confirm
submission = pd.read_csv('data_heart/submission_format.csv')
submission.heart_disease_present = y_prob
submission.to_csv('2019-08-23a_submission.csv', index=False)

In [None]:
# NOTE(review): `feat_u` and `gs` used here are the notebook-level objects, not
# the ones created inside predict_probs (those are local to the function).
# In top-to-bottom order they come from Attempt 3, and the output file name is
# the same one Attempt 3 writes — confirm whether this cell is still needed.
X_test = pd.read_csv('data_heart/test_values.csv', index_col='patient_id')
submission = pd.read_csv('data_heart/submission_format.csv')

# Transform only: the feature pipeline must not be refitted on test data.
Xtt = feat_u.transform(X_test)
y_pred = gs.predict_proba(Xtt)[:, 1]
submission['heart_disease_present'] = y_pred

submission.to_csv('2019-08-21_submission.csv', index=False)

In [None]:
# Parameters selected by the notebook-level `gs` search object.
gs.best_params_

# Attempt 5: SVC after Anomaly (Outlier) Removal

In [None]:
# Reload the held-out features for this attempt.
X_test = pd.read_csv('data_heart/test_values.csv', index_col='patient_id')

In [None]:
# passthrough_pipe = Pipeline([
#     ('cst', ColumnSelectTransformer(numerical_features))
   
# ])
# Xt = passthrough_pipe.fit_transform(X)
# Xt

In [None]:
# Attempt 5: remove outliers with an isolation forest, then grid-search an
# RBF-kernel SVC on the remaining rows and write a submission file.

numerical_features = [
    'slope_of_peak_exercise_st_segment',
    'resting_blood_pressure',
    'num_major_vessels',
    'fasting_blood_sugar_gt_120_mg_per_dl',
    'serum_cholesterol_mg_per_dl',
    'oldpeak_eq_st_depression',
    'age',
    'max_heart_rate_achieved',
]

# Defined for reference; the categorical branch is not part of this attempt.
categorical_features = ['chest_pain_type']

# Defined for reference; not selected by the pipeline below.
binary_features = ['exercise_induced_angina']

# Feature construction: only the numeric pass-through branch is active.
passthrough_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(numerical_features)),
])

feat_u = FeatureUnion([
    ('passthrough_pipe', passthrough_pipe),
])

# Select the numeric columns, then drop the rows flagged as outliers
# (200 trees, contamination 0.1/9); labels are filtered in step.
Xt = passthrough_pipe.fit_transform(X)
cc = CustomOutlier(200, 0.1/9)
yy, Xt = cc.fit_transform(Xt, y)

# probability=True makes the SVC fit an internal, randomised calibration;
# seeding it keeps the log-loss scores reproducible between runs.
full_model = Pipeline([
    ('lr', SVC(probability=True, random_state=0)),
])

# 20 x 20 grid over the regularisation strength and the RBF kernel width.
param_grid = [
    {
        'lr__C': np.logspace(-8, 3, 20),
        'lr__kernel': ['rbf'],
        'lr__gamma': np.logspace(-8, 3, 20),
    },
]

gs = GridSearchCV(full_model,
                  param_grid=param_grid,
                  scoring='neg_log_loss',
                  n_jobs=3,
                  cv=2,
                  verbose=1)

gs.fit(Xt, yy.values)
print('score' , gs.best_score_)

# Transform only: the column selection fitted on the training data is reused
# for the test rows (no outlier filtering is applied to them).
Xtt = passthrough_pipe.transform(X_test)
y_pred_prob = gs.predict_proba(Xtt)[:, 1]

submission = pd.read_csv('data_heart/submission_format.csv')
submission['heart_disease_present'] = y_pred_prob
submission.to_csv('2019-08-25_submission.csv', index=False)

# Attempt 6:  Random Forest

In [3]:
# Reload the raw data and split both the training and the test set by `sex`.
# NOTE(review): sex == 0 is named "male" and sex == 1 "female" here — verify
# against the dataset's data dictionary that this matches the real encoding.
X = pd.read_csv('data_heart/train_values.csv', index_col='patient_id')
y = pd.read_csv('data_heart/train_labels.csv', index_col='patient_id')['heart_disease_present']
X_test = pd.read_csv('data_heart/test_values.csv', index_col='patient_id')

# Training rows and labels per group, matched through the patient_id index.
X_male = X[X['sex'] == 0]
sex_idx_male = X_male.index
y_male = y[sex_idx_male]
X_female = X[X['sex'] == 1]
sex_idx_female = X_female.index
y_female = y[sex_idx_female]

# Test rows per group, each with a zero-filled prediction frame (same index)
# that later cells overwrite with predicted probabilities.
X_test_male = X_test[X_test['sex'] == 0]
sex_test_idx_male = X_test_male.index 
y_male_test = pd.DataFrame({'heart_disease_present': np.zeros(X_test_male.shape[0])}, index = sex_test_idx_male)
X_test_female = X_test[X_test['sex'] == 1]
sex_test_idx_female = X_test_female.index 
y_female_test = pd.DataFrame({'heart_disease_present': np.zeros(X_test_female.shape[0])}, index = sex_test_idx_female)


In [None]:
# Number of rows and columns in the sex == 0 training subset.
X_male.shape

In [11]:
# Independent copy of the sex == 1 subset, so EDA columns can be added without touching X_female.
X1 = X_female.copy()

In [42]:
# Attach the labels positionally (X1 and y_female were built from the same index).
X1['heart_d'] = y_female.values
X1.head()

Unnamed: 0_level_0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_d
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,1
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,0
ldukkw,1,normal,130,3,0,0,0,180,0.0,1,42,150,0,0


In [40]:
# Distribution of num_major_vessels among the positive cases in this subset.
X1[X1['heart_d']==1]['num_major_vessels'].value_counts()

0    25
1    21
2    14
3     9
Name: num_major_vessels, dtype: int64

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sex == 1 subset for a local evaluation (fixed seed for repeatability).
X_train, X_test1, y_train, y_test1 = train_test_split(X_female, y_female, test_size=0.2, random_state=42)

In [None]:
# co = CustomOutlier(contam = 0.25)
# yy, XX = co.fit_transform(X_female[numerical_features], y_female)
# X_female = X_female.loc[XX.index,:]
# y_female = yy

In [None]:
# Number of columns in the training split.
len(X_train.columns)

In [None]:
# Distinct values taken by resting_ekg_results.
X['resting_ekg_results'].unique()

In [49]:
# Attempt 6: random forest on a reduced feature set, fitted on the training split.

numerical_features = [
    'resting_blood_pressure',
    'num_major_vessels',
    'age',
]

categorical_features = ['chest_pain_type']

# Defined for reference; not selected by the pipelines below.
binary_features = ['sex']

# Categorical branch: select, then one-hot encode.
cat_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(categorical_features)),
    ('cle', CustomOneHot()),
])

# Pass-through branch: numeric columns are used as they are.
passthrough_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(numerical_features)),
])

feat_u = FeatureUnion([
    ('cat_pipe', cat_pipe),
    ('passthrough_pipe', passthrough_pipe),
])

Xt = feat_u.fit_transform(X_train)
print(Xt.shape)

# Class-weighted forest; seeded so that repeated runs give the same model.
model4 = RandomForestClassifier(class_weight='balanced', random_state=0)

full_model = Pipeline([
    ('lr', model4),
])

# Search space for a hyper-parameter search over `full_model`.
# It is not used below: the pipeline is fitted directly with default settings.
param_grid = {
    'lr__n_estimators': range(100, 301, 100),
    'lr__max_depth': range(2, 7),
    'lr__min_samples_split': range(2, 7),
    'lr__min_samples_leaf': range(2, 7),
}

# `gs` is the plain pipeline here (no search object), so it has no
# best_score_ / best_params_ attributes.
gs = full_model

gs.fit(Xt, y_train.values)



(99, 7)


Pipeline(memory=None,
     steps=[('lr', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [None]:
# Shape of the one-hot encoded categorical block on the training split.
AA = cat_pipe.fit_transform(X_train)
AA.shape

In [None]:
# Raw categorical columns of the training split.
X_train[categorical_features]

In [None]:
# Log loss on the 20% hold-out split.
# Transform only: the feature pipeline fitted on the training split is reused,
# so the hold-out rows never influence preprocessing.
Xtt = feat_u.transform(X_test1)
y_pred = gs.predict_proba(Xtt)[:, 1]

rr = log_loss(y_test1, y_pred)
rr

In [None]:
# First rows of the sex == 1 training subset.
X_female.head()

In [None]:
# Frequency of each resting_ekg_results value in the sex == 0 subset.
X_male['resting_ekg_results'].value_counts()

In [None]:
# Categorical columns of the sex == 1 subset.
X_female[categorical_features]

In [None]:
# Size of the full training frame.
X.shape

In [None]:
# Size of the transformed training matrix.
Xt.shape

In [None]:
# NOTE(review): in this section `gs` is assigned the plain Pipeline (`gs = full_model`),
# which presumably has no `best_params_` attribute — confirm or remove this cell.
gs.best_params_

In [None]:
# Score the sex == 0 test rows and store the probabilities in their frame.
# Transform only: the feature pipeline must not be refitted on test data.
Xtt = feat_u.transform(X_test_male)
y_pred = gs.predict_proba(Xtt)[:, 1]
y_male_test['heart_disease_present'] = y_pred

In [None]:
# Score the sex == 1 test rows and store the probabilities in their frame.
# Transform only: the feature pipeline must not be refitted on test data.
Xtt = feat_u.transform(X_test_female)
y_pred = gs.predict_proba(Xtt)[:, 1]
y_female_test['heart_disease_present'] = y_pred

In [None]:
# Predicted probabilities for the sex == 0 test rows.
y_male_test

In [None]:
# Stack both per-group prediction frames; rows stay keyed by patient_id but are grouped by sex.
y_probs = pd.concat([y_male_test, y_female_test])

In [None]:
# Combined predictions for all test rows.
y_probs

In [None]:
submission = pd.read_csv('data_heart/submission_format.csv')

# `y_pred` only holds the last subset that was scored. The submission needs
# every test row, so the combined `y_probs` frame is used, re-ordered from
# "grouped by sex" back to the row order of X_test.
# assumes the template rows are in the same order as test_values.csv — TODO confirm
submission['heart_disease_present'] = (
    y_probs.loc[X_test.index, 'heart_disease_present'].values
)
submission.to_csv('2019-08-21_submission.csv', index=False)

In [None]:
# In-sample log loss of the fitted model.
# `Xt` was built from `X_train`, so the matching labels are `y_train`;
# the full label vector `y` has a different length and different rows.
y_pred = gs.predict_proba(Xt)[:, 1]

log_loss(y_train, y_pred)

## Test Values

In [None]:
# Reload the held-out features.
# NOTE(review): this reads from 'data/' while most cells above use 'data_heart/' — confirm the path.
X_test = pd.read_csv('data/test_values.csv', index_col='patient_id')

In [None]:
# Hard class predictions from the reduced-feature logistic regression fitted earlier in the notebook.
y_ans = simple_lr.predict(X_test[reduced_cols])