In [None]:
# for data analysis
import pandas as pd 
import numpy as np

# for data visuals
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline 

# for machine learning 
from sklearn.ensemble import RandomForestClassifier as RFclassifier
from sklearn.ensemble import AdaBoostClassifier as ADclassifier
from sklearn.ensemble import BaggingClassifier as BGclassifier
from sklearn.ensemble import ExtraTreesClassifier as ETclassifier
from sklearn.ensemble import GradientBoostingClassifier as GBclassifier
#from sklearn.ensemble import IsolationForest as IFclassifier
#from sklearn.ensemble import RandomTreesEmbedding as RTclassifier
#from sklearn.ensemble import VotingClassifier as VOclassifier

### Step 1: Reading data


In [None]:
# Load the pre-split train / test CSV files in a single tuple assignment.
trainData, testData = map(
    pd.read_csv,
    ('processed_cleveland_data_train.csv', 'processed_cleveland_data_test.csv'),
)


def replace_predict(df):
    """Collapse the `num` target column of `df` into a binary label, in place.

    Every value from 1 through 6 is replaced with 1 because this is treated
    as a binary classification problem; any other value is left untouched.
    The frame is modified in place and nothing is returned.
    """
    positive_grades = list(range(1, 7))
    binarised = df['num'].replace(to_replace=positive_grades, value=1)
    df['num'] = binarised

       
# Binarise the `num` target of both splits (each frame is modified in place).
for split_frame in (trainData, testData):
    replace_predict(split_frame)



In [None]:

#sns.barplot(x='chol', y='num', palette="rocket", data=trainData)


#### The train and test data sets are further separated into their features and the predicted diagnosis

In [None]:

# Target vectors: the `num` column of each split.
Ytrain, Ytest = trainData['num'], testData['num']

# Feature matrices: every remaining column.
Xtrain = trainData.drop(columns='num')
Xtest = testData.drop(columns='num')

# Preview the first training labels as the cell output.
Ytrain.head()

### Step 2: Define the model
 

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

# Create all the models.
# Every ensemble below is stochastic (bootstrapping / random feature or split
# selection), so one shared seed is passed to each of them. Without it the
# fitted models, and therefore the reported scores, change on every
# Restart & Run All.
RANDOM_STATE = 42

rfModel = RFclassifier(random_state=RANDOM_STATE)
adModel = ADclassifier(random_state=RANDOM_STATE)
bgModel = BGclassifier(random_state=RANDOM_STATE)
etModel = ETclassifier(random_state=RANDOM_STATE)
gbModel = GBclassifier(random_state=RANDOM_STATE)
#ifModel=IFclassifier()
# rtModel=RTclassifier()
#voModel= VOclassifier(50)


# Choose parameters for all model
# rfparameters = {'n_estimators': [4,6,9,13,18],
#              'max_features': ['log2', 'sqrt', 'auto'],
#              'criterion': ['entropy', 'gini'],
#              'max_depth': [1,16,32,32,26],
#              'min_samples_split': [2, 3, 5, 8,12],
#              'min_samples_leaf': [1, 2, 8, 10, 15]}
# # adparameters = {'n_estimators': [10,20,50,60,70],'base_estimator':None, 'learning_rate':[.7,1.,1.5],
#                'algorithm'=’SAMME.R’, 'random_state'=None}
# bgparameters = {'n_estimators': [10,20,50,60,70,4,6,9,13,18], 
    



### Step 3: Compile the Model

In [None]:


# # Type of scoring to compare parameter combos 
# acc_scorer = make_scorer(accuracy_score)

# # Run grid search 
# # grid search is an algorithm which automatically finds the best parameters for a particular model
# grid_obj = GridSearchCV(rfModel, rfparameters, scoring=acc_scorer)
# grid_obj = grid_obj.fit(Xtrain, Ytrain)

# # Pick the best combination of parameters
# rfModel = grid_obj.best_estimator_



### Step 4: Fit the Model

In [None]:
# Train every ensemble on the training features / labels.
for estimator in (rfModel, adModel, bgModel, etModel):
    estimator.fit(Xtrain, Ytrain)

# The last fit stays a bare expression so the notebook still shows the fitted
# gradient-boosting estimator as the cell output.
gbModel.fit(Xtrain, Ytrain)
#ifModel.fit(Xtrain, Ytrain)


### Step 5: Evaluate the Model using accuracy score

In [None]:
# Score each fitted model on the held-out test split.
# Iterating over `models` directly (instead of `while count < 5`) keeps the
# loop correct when a model is added to or removed from the list.
finalaccu = []
models = [rfModel, adModel, bgModel, etModel, gbModel]
for model in models:
    predictions = model.predict(Xtest)
    print(model)
    print(predictions)
    accuracy = accuracy_score(Ytest, predictions)
    finalaccu.append(accuracy)
print(finalaccu)


In [None]:
def logloss(true_label, predicted_prob, eps=1e-15):
    """Return the binary log loss of a single prediction.

    Parameters
    ----------
    true_label : int
        The actual class; 1 is treated as the positive class, anything else
        as the negative class.
    predicted_prob : float
        Predicted probability of the positive class.
    eps : float, optional
        The probability is clipped to [eps, 1 - eps] before the logarithm is
        taken so that a prediction of exactly 0 or 1 yields a large finite
        loss instead of infinity.

    Returns
    -------
    float
        -log(p) when `true_label` is 1, otherwise -log(1 - p), where p is the
        clipped probability.
    """
    # The original called a bare `log`, which is never imported in this
    # notebook; `np.log` uses the numpy import from the first cell.
    clipped = np.clip(predicted_prob, eps, 1 - eps)
    if true_label == 1:
        return -np.log(clipped)
    return -np.log(1 - clipped)

### Step 5 (continued): Evaluate the Model using confidence and probability (log loss)

In [None]:
from sklearn.metrics import log_loss

# Log loss of every fitted model on the held-out test split.
# `finalconf` used to be created but never filled; each loss is now appended
# so the results are available afterwards, mirroring `finalaccu` in the
# accuracy cell. The loop also iterates over `models` directly instead of
# relying on a hard-coded count of 5.
finalconf = []
models = [rfModel, adModel, bgModel, etModel, gbModel]
for model in models:
    confidence = model.predict_proba(Xtest)
    # Column 1 is the probability of the second entry in model.classes_
    # (label 1 here, assuming both 0 and 1 occur in the training labels).
    probs = confidence[:, 1]
    # calculate log loss
    loss = log_loss(Ytest.values, probs)
    print(loss)
    finalconf.append(loss)
print(finalconf)

### Step 6: Cross Validation

In [None]:

# K-fold cross-validation works on the complete, unsplit data set.
from sklearn.model_selection import KFold

alldata = pd.read_csv('processed_cleveland_data.csv')
replace_predict(alldata)  # maps `num` values 1-6 to 1, in place

# Target vector first, then the feature matrix (everything except `num`).
Yall = alldata['num']
Xall = alldata.drop(columns='num')

def run_kfold(model):
    """Estimate the accuracy of `model` with 5-fold cross-validation.

    The folds are taken from the module-level `Xall` / `Yall`. The estimator
    is refit in place on the training part of every fold, so after the call
    it holds the fit from the last fold.

    Parameters
    ----------
    model : estimator
        Any object exposing `fit(X, y)` and `predict(X)`.

    Returns
    -------
    float
        Mean accuracy over the five folds. The per-fold accuracies and the
        mean are also printed.
    """
    kf = KFold(n_splits=5)
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(Xall), start=1):
        # Fold-local names, so they cannot be confused with the notebook-level
        # Xtrain / Xtest / Ytrain / Ytest split.
        fold_Xtrain, fold_Xtest = Xall.values[train_index], Xall.values[test_index]
        fold_Ytrain, fold_Ytest = Yall.values[train_index], Yall.values[test_index]
        model.fit(fold_Xtrain, fold_Ytrain)
        predictions = model.predict(fold_Xtest)
        accuracy = accuracy_score(fold_Ytest, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    # The mean is reported once, after all folds have been scored; it used to
    # be recomputed and printed inside the loop, yielding partial means.
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))
    return mean_outcome
        
# `model` is not defined in any visible cell of this notebook, so the original
# call raised a NameError on a fresh kernel. Cross-validate each ensemble
# collected in `models` instead.
for model in models:
    print(model)
    run_kfold(model)
    