In [None]:
# Data wrangling and visualization
import pandas as pd
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns

# For machine learning:
# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split
# Preprocessing
from sklearn.preprocessing import LabelEncoder
# For feature selection
from sklearn.decomposition import PCA
# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Classifier evaluation
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

# Encode dataframe columns
def label_encode(df, columns):
    """Integer-encode the listed DataFrame columns in place.

    Each column is overwritten with the codes produced by its own freshly
    fitted ``LabelEncoder``.

    Args:
        df: DataFrame whose columns are replaced in place.
        columns: Iterable of column names in ``df`` to encode.

    Returns:
        Dict mapping each encoded column name to its fitted
        ``LabelEncoder``, so the codes can later be mapped back with
        ``inverse_transform``. Callers that ignore the return value see
        the same in-place effect as before.
    """
    encoders = {}
    for col in columns:
        le = LabelEncoder()
        # Fitting on the whole column learns the same classes as fitting on
        # its unique values, so a single fit_transform call is equivalent.
        df[col] = le.fit_transform(list(df[col].values))
        encoders[col] = le
    return encoders

# classify() method for model training, cross validation and evaluation report:
def classify(predictors, response, classifier='svm', kern='rbf', neighbors=3,
             kfolds=0, report=False, n_estimators=1000):
    """Train a classifier on an 80/20 split, print its evaluation, return it.

    Args:
        predictors: Feature matrix passed to ``train_test_split``.
        response: Target labels aligned with ``predictors``.
        classifier: Model key: 'nb' (Gaussian naive Bayes), 'svm', 'knn',
            'rf' (random forest) or 'dtree' (decision tree).
        kern: Kernel name for the SVM; only used when ``classifier == 'svm'``.
        neighbors: Number of neighbours; only used when
            ``classifier == 'knn'``.
        kfolds: When greater than 0, also print the mean k-fold
            cross-validated accuracy computed on the full
            ``predictors``/``response`` data, print the AUC and show the
            ROC curve for the held-out split.
        report: When truthy, print the confusion matrix and classification
            report; for 'rf' also plot the feature importances.
        n_estimators: Number of trees; only used when ``classifier == 'rf'``.

    Returns:
        The fitted classifier.

    Raises:
        ValueError: If ``classifier`` is not one of the supported keys.
    """
    # Validate the model key before doing any work so a typo fails fast with
    # a clear message instead of an UnboundLocalError further down.
    cl, msg = _make_classifier(classifier, kern, neighbors, n_estimators)

    # Split X and y into training and testing sets (fixed seed: reproducible).
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, response, test_size=0.20, random_state=1)

    cl.fit(X_train, y_train)
    y_pred_class = cl.predict(X_test)
    # Predictions are already class labels, so no rounding is required.
    print(msg + ' model accuracy score: ',
          metrics.accuracy_score(y_test, y_pred_class))

    if kfolds > 0:
        # Cross-validate on the data passed in by the caller, not on
        # whatever X / y happen to exist at module level.
        scores = cross_val_score(cl, predictors, response, cv=kfolds,
                                 scoring='accuracy')
        print('Cross-validated score:', scores.mean())
        _plot_roc(y_test, y_pred_class)

    if report:
        try:
            print('Confusion matrix')
            print(metrics.confusion_matrix(y_test, y_pred_class))
            print('Classification report')
            print(metrics.classification_report(y_test, y_pred_class))
        except ValueError as err:
            # Keep the best-effort behaviour, but say what went wrong.
            print("ValueError:", err)
        if classifier == 'rf':
            _plot_feature_importance(cl, predictors)

    print('______________________________')

    return cl


def _make_classifier(classifier, kern, neighbors, n_estimators):
    """Return ``(unfitted estimator, display name)`` for a classifier key."""
    if classifier == 'nb':
        return GaussianNB(), 'Naive Bayes'
    if classifier == 'svm':
        return svm.SVC(kernel=kern), 'SVM with ' + kern + ' kernel'
    if classifier == 'knn':
        return (KNeighborsClassifier(n_neighbors=neighbors),
                'KNN with k=' + str(neighbors))
    if classifier == 'rf':
        # The message reports the number of trees actually used.
        return (RandomForestClassifier(n_estimators=n_estimators,
                                       random_state=42),
                'Random Forest with ' + str(n_estimators) + ' decision trees')
    if classifier == 'dtree':
        return (DecisionTreeClassifier(min_samples_split=20, random_state=99),
                'Decision tree')
    raise ValueError(
        "Unknown classifier {!r}; expected one of "
        "'nb', 'svm', 'knn', 'rf', 'dtree'".format(classifier))


def _plot_roc(y_test, y_pred_class):
    """Print the AUC and show the ROC curve for the test-split predictions."""
    # NOTE(review): the curve is built from hard class predictions rather
    # than decision scores, so it has a single operating point; switching to
    # scores would change the reported AUC - confirm before changing.
    print("AUC score: ", metrics.roc_auc_score(y_test, y_pred_class))
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_class)
    roc_auc = metrics.auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=1,
             label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()


def _plot_feature_importance(forest, predictors):
    """Show a bar plot of a fitted forest's feature importances."""
    importances = forest.feature_importances_
    # Label bars with column names when predictors carries them (DataFrame);
    # otherwise fall back to positional names.
    names = getattr(predictors, 'columns', None)
    if names is None:
        names = ['feature ' + str(i) for i in range(len(importances))]
    feature_imp = pd.Series(importances, index=list(names))
    feature_imp = feature_imp.sort_values(ascending=False)

    # The figure must be created before drawing; creating it afterwards
    # opens a second, empty figure and leaves the plot at the default size.
    plt.figure(figsize=(18, 12))
    sns.barplot(x=feature_imp, y=feature_imp.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title("OSM Geographic Feature Importance")
    plt.show()