## Part 1.0: Modelling/Training on our Extracted Feature Data [(From Other Notebook)](https://www.kaggle.com/danielbozinovski/feature-extraction-for-pneumonia)

In [None]:
# Imports
import os
import cv2
import glob
import time
import pydicom
import skimage
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from skimage import feature, filters
%matplotlib inline

from functools import partial
from collections import defaultdict
from joblib import Parallel, delayed
from lightgbm import LGBMClassifier
from tqdm import tqdm

# sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

import warnings

sns.set_style('whitegrid')
# `np.warnings` was only ever the stdlib `warnings` module leaking through the
# NumPy namespace, and newer NumPy releases no longer expose it. Calling the
# stdlib module directly has the same effect and works on every version.
warnings.filterwarnings('ignore')

In [None]:
# Training features: define the CSV location and load it straight away
imageFeaturesPath = "../input/extractedfeatures/dicomImageFeatures.csv"
imageFeatures = pd.read_csv(imageFeaturesPath)

# Test features
testImageFeaturesPath = "../input/extractedfeatures/testImageFeatures.csv"
testImageFeatures = pd.read_csv(testImageFeaturesPath)

# Label table (its `Target` column is used later as `hasPneumonia`)
labelsPath = "../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv"
labels = pd.read_csv(labelsPath)

In [None]:
# Imports
from sklearn.model_selection import train_test_split
import sklearn.metrics as skm

In [None]:
# Preview the first two rows of the loaded feature table
# (each row carries a serialised `features` cell that is expanded further below)
imageFeatures.head(2)

## Part 1.1: Get Features into their own Dataframe

In [None]:
def getFeaturesDF(imgFeatures, labelsDF=None):
    """Expand the serialised ``features`` column into one column per feature.

    Parameters
    ----------
    imgFeatures : pandas.DataFrame
        Frame with a ``features`` column whose cells are strings containing a
        Python literal sequence of 11 values (e.g. ``"(0.1, 0.2, ...)"``).
    labelsDF : pandas.DataFrame, optional
        Frame with a ``Target`` column to attach as ``hasPneumonia``. When
        omitted, the notebook-level ``labels`` frame is used.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``imgFeatures`` (same index), with the 11 named
        feature columns plus ``hasPneumonia``.

    Raises
    ------
    ValueError, SyntaxError
        If a ``features`` cell is not a plain Python literal.
    """
    import ast  # local import keeps this cell self-contained

    if labelsDF is None:
        labelsDF = labels

    # literal_eval only parses literals (numbers, tuples, lists, ...), so the
    # CSV text can never execute arbitrary code the way a bare eval() could.
    features = imgFeatures.features.apply(lambda x: list(ast.literal_eval(x)))

    df = pd.DataFrame(features.values.tolist(),
                      columns=['mean', 'stddev', 'area', 'perimeter', 'irregularity',
                               'equiv_diam', 'hu1', 'hu2', 'hu4', 'hu5', 'hu6'],
                      index=imgFeatures.index)

    # Assignment aligns on index: rows whose index has no counterpart in
    # labelsDF receive NaN in `hasPneumonia`.
    df['hasPneumonia'] = labelsDF['Target']

    return df

In [None]:
# Expand both raw feature tables with the same helper, in train/test order.
# NOTE(review): getFeaturesDF fills `hasPneumonia` from the notebook-level
# `labels` frame for both calls — confirm those labels really apply to the test rows.
trainData, testData = (getFeaturesDF(rawFrame) for rawFrame in (imageFeatures, testImageFeatures))

## Part 1.2: Get Class Weights

In [None]:
# Count rows per class by summing boolean masks over the label column
COUNT_NORMAL = int((trainData['hasPneumonia'] == 0).sum())
COUNT_PNE = int((trainData['hasPneumonia'] == 1).sum())
TRAIN_IMG_COUNT = trainData.shape[0]

# Inverse-frequency weights, scaled by half the total row count
weight_for_0 = (1 / COUNT_NORMAL) * TRAIN_IMG_COUNT / 2.0
weight_for_1 = (1 / COUNT_PNE) * TRAIN_IMG_COUNT / 2.0

classWeight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(classWeight[0]))
print('Weight for class 1: {:.2f}'.format(classWeight[1]))

## Part 1.3: Normalise the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# For Training Data
# dropna() returns a new frame rather than modifying in place, so the result
# must be kept. The index is reset so the rows are numbered 0..n-1 again:
# the K-fold helper later indexes `y` with integer positions.
trainDataClean = trainData.dropna().reset_index(drop=True)

# Split data into x and y
x = trainDataClean.drop(columns=['hasPneumonia'])
y = trainDataClean['hasPneumonia']

# Scale the features (the fitted scaler holds the training mean/std)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)

In [None]:
# For test data
# dropna() returns a new frame, so keep the result instead of discarding it.
testDataClean = testData.dropna().reset_index(drop=True)

# Split data into x and y
x_test_unseen = testDataClean.drop(columns=['hasPneumonia'])
y_test_unseen = testDataClean['hasPneumonia']

# Scale with the scaler already fitted on the training features. Fitting a
# fresh scaler here would standardise the unseen data with its own mean/std,
# so models trained on X_scaled would see differently scaled inputs, and it
# would also overwrite the training scaler.
X_scaled_test_unseen = scaler.transform(x_test_unseen)

## Part 1.4: Split into Training and Testing Data

In [None]:
# Stratified 70/30 hold-out split. The seed is fixed (same value the
# randomised searches below use) so re-running the notebook reproduces the
# same partition and therefore comparable scores.
X_train, X_test, y_train, y_test = train_test_split(
                                        X_scaled,
                                        y,
                                        stratify=y,
                                        shuffle=True,
                                        test_size=0.3,
                                        random_state=0)

In [None]:
# Helper that prints the scoring metrics for the train and test predictions
def printScores(pred_y_test, y_test, pred_y_train, y_train):
    """Print accuracy, precision, recall, MSE and AUC for both data splits.

    Each metric is computed from the true labels and the predicted labels,
    multiplied by 100, rounded to 3 decimals and printed with a ``%`` suffix.
    The training block is printed first, then the testing block.
    """
    # (label shown to the reader, sklearn.metrics function) in print order
    metricTable = (
        ("Accuracy", skm.accuracy_score),
        ("Precision", skm.precision_score),
        ("Recall", skm.recall_score),
        ("MSE", skm.mean_squared_error),
        ("Area Under Curve", skm.roc_auc_score),
    )

    # (heading, true labels, predicted labels) for each split
    splits = (
        ("===== Training Metrics =====", y_train, pred_y_train),
        ("\n===== Testing Metrics =====", y_test, pred_y_test),
    )

    for heading, actual, predicted in splits:
        print(heading)
        for label, metricFn in metricTable:
            print(f"{label}: {round(metricFn(actual, predicted)*100, 3)}%")

In [None]:
from sklearn.model_selection import RandomizedSearchCV

## Part 1.5: Perform RandomisedGridSearch For Optimal Hyper-parameters

### Model 1: Logistic Regression

In [None]:
# Randomised search over regularisation strength and class weighting
cVals = list(range(1, 6))
# scikit-learn only recognises the lowercase string 'balanced'; any other
# spelling makes those candidates fail to fit, so they are never really evaluated.
cWeight = [None, 'balanced']

params = dict(C = cVals, class_weight=cWeight)

logReg = LogisticRegression()
clf = RandomizedSearchCV(logReg, params, random_state=0)

search = clf.fit(X_train, y_train)
search.best_params_ # Return the best hyper-parameters

### Model 2: kNN

In [None]:
# Randomised search over neighbourhood size (10..200 in steps of 10) and vote weighting
kValues = [k for k in range(10, 210, 10)]
weight_options = ["uniform", "distance"]

params = {"n_neighbors": kValues, "weights": weight_options}

kNN = KNeighborsClassifier()
clf = RandomizedSearchCV(estimator=kNN, param_distributions=params, random_state=0)

search = clf.fit(X_train, y_train)
search.best_params_ # Best hyper-parameters found by the search

### Model 3: Naive Bayes

In [None]:
# No randomised search for this model (lack of hyper-parameters to tune),
# so the default GaussianNB is fitted directly; fit() returns the estimator.
gnb = GaussianNB().fit(X_train, y_train)

pred_y_train = gnb.predict(X_train)
pred_y_test = gnb.predict(X_test)

# Report train and test metrics with the shared scoring function
printScores(pred_y_test=pred_y_test,
            y_test=y_test,
            pred_y_train=pred_y_train,
            y_train=y_train)

### Model 4: Random Forest

In [None]:
# Perform Randomised Grid Search
# 'balanced' must be lowercase: scikit-learn rejects any other spelling, which
# makes every candidate using it fail to fit instead of being evaluated.
classWeights = [None, 'balanced']
nEstimatorValues = list(range(300, 800, 100))
maxDepthValues = list(range(6, 10))
minSamplesSplitValues = list(range(2, 5))

params = dict(n_estimators = nEstimatorValues,
              max_depth = maxDepthValues,
              class_weight = classWeights,
              min_samples_split = minSamplesSplitValues)

rfc = RandomForestClassifier(n_jobs=-1)

clf = RandomizedSearchCV(rfc, params, random_state=0)

search = clf.fit(X_train, y_train)
search.best_params_ # Display best hyper-parameters

### Model 5: Support Vector Machine

In [None]:
# Perform Randomised Grid Search
cVals = np.arange(0.5, 1.6, 0.1)
# 'balanced' must be lowercase: scikit-learn rejects any other spelling, which
# makes every candidate using it fail to fit instead of being evaluated.
classWeights = [None, 'balanced']

params = dict(C = cVals, class_weight = classWeights)

svm = SVC()

clf = RandomizedSearchCV(svm, params, random_state=0)
search = clf.fit(X_train, y_train)
search.best_params_ # Return the best hyper-parameters

### Model 6: Gradient Boosted Classifier

In [None]:
# Randomised search for the gradient-boosting hyper-parameters
nEstimatorValues = [300, 400, 500, 600, 700]
maxDepthValues = [6, 7, 8, 9]
minSamplesSplitValues = [2, 3, 4]
lrs = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]

# Candidate values for each tuned hyper-parameter
params = {
    "n_estimators": nEstimatorValues,
    "max_depth": maxDepthValues,
    "learning_rate": lrs,
    "min_samples_split": minSamplesSplitValues,
}

gbc = GradientBoostingClassifier()

clf = RandomizedSearchCV(estimator=gbc, param_distributions=params, random_state=0)
search = clf.fit(X_train, y_train)
search.best_params_

## Part 1.6: Create Function to Perform K-Fold Cross Validation

In [None]:
# Function to perform K-fold cross val
def performCV(model, name, K, random_state=None):
    """Run shuffled K-fold cross-validation of ``model`` on the notebook data.

    The notebook-level ``X_scaled`` (features) and ``y`` (targets) are split
    into ``K`` folds. For each fold the model is refitted on the training part
    and scored on the held-out part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    name : str
        Label printed in the progress banner.
    K : int
        Number of folds.
    random_state : int or None, optional
        Seed forwarded to ``KFold`` for a reproducible shuffle. ``None``
        (the default) gives a different shuffle on every call.

    Returns
    -------
    dict
        Keys ``mean_accuracy``, ``mean_precision``, ``mean_recall``,
        ``mean_mse`` and ``mean_auc``: each metric averaged over the folds.
    """
    print(f"===== Performing CV for {name} =====")
    kfold = KFold(n_splits = K, shuffle = True, random_state = random_state)

    # KFold yields integer *positions*. Indexing a pandas Series with them via
    # y[...] is label-based and only coincides with positions when the index is
    # 0..n-1, so select from a plain array instead.
    targets = np.asarray(y)

    # Metric name -> scoring function, in the order the results are reported
    metricFns = {
        'accuracy': skm.accuracy_score,
        'precision': skm.precision_score,
        'recall': skm.recall_score,
        'mse': skm.mean_squared_error,
        'auc': skm.roc_auc_score,
    }
    scores_per_fold = {metric: [] for metric in metricFns}

    for train_index, test_index in kfold.split(X_scaled):

        X_train, X_test = X_scaled[train_index], X_scaled[test_index] # Split data
        y_train, y_test = targets[train_index], targets[test_index]

        model.fit(X_train, y_train) # Fit data
        pred_y_test = model.predict(X_test) # Make a prediction

        for metric, metricFn in metricFns.items():
            scores_per_fold[metric].append(metricFn(y_test, pred_y_test))

    return {f'mean_{metric}': np.mean(values)
            for metric, values in scores_per_fold.items()}
    

In [None]:
# Instantiate every model with the hyper-parameters chosen for the final comparison
logReg = LogisticRegression(C=1)
kNN = KNeighborsClassifier(n_neighbors=150, weights="distance")
gnb = GaussianNB()
rfc = RandomForestClassifier(
    n_estimators=700,
    max_depth=9,
    min_samples_split=4,
    n_jobs=-1,
)
svm = SVC(C=1.5)
gbc = GradientBoostingClassifier(
    n_estimators=700,
    max_depth=9,
    min_samples_split=4,
    learning_rate=0.005,
)

# (estimator, display name) pairs, evaluated in this order
modelsList = [
    (logReg, "Logistic Regression"),
    (kNN, "K-Nearest Neighbour"),
    (gnb, "Naive Bayes"),
    (rfc, "Random Forest"),
    (svm, "Support Vector Machine"),
    (gbc, "Gradient Boosting Classifier"),
]

# 5-fold cross-validation summary per model, keyed by display name
CVResults = {}
for estimator, displayName in modelsList:
    CVResults[displayName] = performCV(estimator, displayName, 5)

In [None]:
# Note: if these values differ from the ones in the report, the notebook was re-run.
# performCV shuffles its folds without a fixed seed by default, so the scores
# vary slightly between runs.
CVResults # Display the mean cross-validation metrics for each model