In [None]:
# for data analysis
import pandas as pd 
import numpy as np

# for data visuals
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline 

# for machine learning 
from sklearn.ensemble import RandomForestClassifier as RFclassifier


### Step 1: Reading data


In [None]:
# Load the train and test CSV files into two DataFrames.
trainData, testData = (
    pd.read_csv(csv_path)
    for csv_path in (
        'processed_cleveland_data_train.csv',
        'processed_cleveland_data_test.csv',
    )
)

# Collapse every positive diagnosis level to 1 because this is a binary classification problem
def replace_predict(df, column='num'):
    """Binarise the diagnosis column of ``df`` in place.

    Every value strictly greater than 0 in ``df[column]`` is replaced by 1;
    zeros, negative values and missing values are left untouched. Using a
    threshold instead of an explicit list of levels means no positive level
    can slip through unconverted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column. It is modified in place.
    column : str, default 'num'
        Name of the target column to binarise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience; existing callers
        that ignore the return value keep working.
    """
    # mask() only overwrites rows where the condition is True, so NaN rows
    # (for which the comparison is False) are preserved as they are.
    df[column] = df[column].mask(df[column] > 0, 1)
    return df


       
# Binarise the target column of both splits (replace_predict modifies its argument).
for split_frame in (trainData, testData):
    replace_predict(split_frame)



In [None]:

# Bar height is seaborn's default estimate (the mean) of 'num' for each distinct 'chol' value
sns.barplot(
    data=trainData,
    x='chol',
    y='num',
    palette="rocket",
)


#### The train and test data sets are further separated into their features and the predicted diagnosis

In [None]:

# Features are every column except the target; labels are the 'num' column alone.
Xtrain, Ytrain = trainData.drop(columns=['num']), trainData['num']
Xtest, Ytest = testData.drop(columns=['num']), testData['num']

# Peek at the first few training labels
Ytrain.head()

### Step 2: Define the model
 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

# Create model
# A fixed random_state makes the forest, and therefore the grid-search winner,
# reproducible from one run of the notebook to the next.
model = RFclassifier(random_state=42)

# Choose Parameters: the grid of hyper-parameter values the search will try
parameters = {'n_estimators': [4, 6, 9, 13, 18],
              # 'auto' is no longer listed: it was an alias of 'sqrt' for classifiers and
              # newer scikit-learn releases reject it, which makes those fits fail.
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              # the duplicated 32 was removed so no combination is evaluated twice
              'max_depth': [1, 16, 26, 32],
              'min_samples_split': [2, 3, 5, 8, 12],
              'min_samples_leaf': [1, 2, 8, 10, 15]}



### Step 3: Compile the Model and GridSearch

In [None]:


# Score every parameter combination by plain classification accuracy
acc_scorer = make_scorer(accuracy_score)

# Exhaustive grid search: evaluates each combination in `parameters` on the
# training split and records which one scored best.
grid_obj = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=acc_scorer,
).fit(Xtrain, Ytrain)

# Continue with the estimator built from the winning combination
model = grid_obj.best_estimator_




### Step 4: Fit the Model

In [None]:
# Fit the selected estimator on the full training split.
# NOTE(review): grid_obj is created without a refit argument, and GridSearchCV's
# default (refit=True) already leaves best_estimator_ fitted on Xtrain/Ytrain, so this
# call looks like a redundant re-training with the same hyper-parameters — confirm
# before removing. It does not choose any parameters; the grid search did that.
model.fit(Xtrain, Ytrain)

### Step 5: Evaluate the Model

In [None]:
# Score the fitted model on the held-out test split
predictions = model.predict(Xtest)
accuracy = accuracy_score(y_true=Ytest, y_pred=predictions)
print(accuracy)



### Step 6: Cross Validation

In [None]:

# Cross Validation with KFold
from sklearn.model_selection import KFold

# Load the full dataset file for cross-validation and binarise its target column
alldata = pd.read_csv('processed_cleveland_data.csv')
replace_predict(alldata)

# Same feature/label separation as for the train and test splits
Xall, Yall = alldata.drop(columns=['num']), alldata['num']

def run_kfold(model, n_splits=5):
    """Print and return the k-fold cross-validated accuracy of ``model``.

    The folds are taken from the module-level ``Xall`` / ``Yall`` data. For
    every fold an unfitted clone of ``model`` is trained, so the estimator
    passed in is never re-fitted: the caller can keep using (or pickling) it
    afterwards and still have the model that was trained on the training split.

    Parameters
    ----------
    model : scikit-learn estimator
        Estimator whose hyper-parameters are to be evaluated.
    n_splits : int, default 5
        Number of consecutive, unshuffled folds handed to ``KFold``.

    Returns
    -------
    float
        Mean accuracy over all folds. The per-fold accuracies and the mean
        are also printed.
    """
    # Local import so this function does not depend on edits to other cells.
    from sklearn.base import clone

    kf = KFold(n_splits=n_splits)
    features, labels = Xall.values, Yall.values
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        # Same hyper-parameters, fresh state: never touch the caller's model.
        fold_model = clone(model)
        fold_model.fit(features[train_index], labels[train_index])
        predictions = fold_model.predict(features[test_index])
        accuracy = accuracy_score(labels[test_index], predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))

    # The mean is only meaningful once every fold has been scored.
    mean_outcome = float(np.mean(outcomes))
    print("Mean Accuracy: {0}".format(mean_outcome))
    return mean_outcome
        
# Cross-validate the tuned estimator on the full dataset (Xall / Yall).
run_kfold(model)
    

### Step 7: Save Model

In [None]:
import pickle
# NOTE(review): absolute, user-specific path — this only works on a machine that has
# this directory; consider a path relative to the notebook or a configurable location.
filename = '/Users/Sahithi/HeartModels/HeartDiseaseML.pickle'
# The context manager guarantees the file is flushed and closed even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)