In [1]:
# Import Packages
# import kaggle
import numpy as np
import pandas as pd
import re 

# Learning libs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Producing Decision Tree diagrams
from IPython.display import Image, display
import pydotplus
from subprocess import call

# For other 
import copy
from textwrap import wrap

np.random.seed(0)

In [2]:
# Load CSVs
# Import Train and Dev data from processed data
train_data = pd.read_csv('../data/processed/train_data.csv').set_index('Id')
train_labels = pd.read_csv('../data/processed/train_labels.csv').set_index('Id')
dev_data = pd.read_csv('../data/processed/dev_data.csv').set_index('Id')
dev_labels = pd.read_csv('../data/processed/dev_labels.csv').set_index('Id')

In [3]:
#train_data.head()

In [4]:
#train_labels.head()

In [5]:
#dev_data.head()
#dev_data.columns

In [6]:
#dev_labels.head()

In [7]:
# Removing final column that includes CoverType
train = train_data.copy()
dev = dev_data.copy()

# Total_Distance_to_Hydrology
# ------------------------------------------------------------------------------
# Create Total_Distance_to_Hydrology based on Euclidean distance
train['Total_Distance_To_Hydrology'] = np.sqrt(train_data["Horizontal_Distance_To_Hydrology"]**2 + train_data['Vertical_Distance_To_Hydrology']**2)
dev['Total_Distance_To_Hydrology'] = np.sqrt(dev_data["Horizontal_Distance_To_Hydrology"]**2 + dev_data['Vertical_Distance_To_Hydrology']**2)
train[["Total_Distance_To_Hydrology", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology"]].head(10)

Unnamed: 0_level_0,Total_Distance_To_Hydrology,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4429,0.0,0,0
12400,0.0,0,0
4648,215.564376,192,98
5954,0.0,0,0
2947,0.0,0,0
9253,205.548048,201,43
3472,384.480169,324,207
3079,313.180459,309,51
10572,296.325834,295,28
11892,212.084889,212,6


In [8]:
# Print all columns
print(dev.columns)
dev.iloc[:,10:14]

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

Unnamed: 0_level_0,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5415,0,0,0,1
6658,0,0,1,0
3551,0,0,0,1
5155,0,0,0,1
10748,0,0,1,0
...,...,...,...,...
13124,0,0,0,1
3265,0,0,0,1
9846,0,0,1,0
10800,0,0,1,0


# Model Development

### Normalizing continuous features (GaussianNB)

First, let's look at a Confusion Matrix after normalizing and using all the features. We want to differentiate between those that get confused most often.

First, I normalized all columns that have numerical data. Columns with binary data stayed as is.

In [9]:
# ColumnTransformer only transforms select columns
# StandardScalar normalizes the data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Features to normalize
feature_cols = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Roadways",
                "Horizontal_Distance_To_Fire_Points", "Total_Distance_To_Hydrology",
                "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]

ct = ColumnTransformer([
        ('features', StandardScaler(), feature_cols)
    ], remainder='passthrough')

# Transform features
nX_train = ct.fit_transform(train)
nX_dev = ct.transform(dev)

# Capture labels
y_train = train_labels.Cover_Type
y_dev = dev_labels.Cover_Type

# Sanity Check!
pd.DataFrame(nX_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,-1.269012,-1.260394,-0.535135,-0.672762,-1.059503,-1.096632,-0.048712,-0.173892,0.101598,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.463055,-0.343459,-0.771915,-0.593370,-0.787843,-1.096632,0.796328,0.571621,-0.225332,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.096529,-0.434245,1.122330,-0.636469,-0.073480,-0.095667,1.316353,-0.787845,-1.685622,192.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.969562,-0.588580,-0.416745,-0.951770,-0.885714,-1.096632,0.893833,0.045376,-0.574058,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.920614,1.726454,-0.416745,-1.162726,-0.870164,-1.096632,-0.601239,-0.042331,0.581096,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12091,0.072523,-0.143730,-0.061574,0.751007,0.443313,1.433698,0.958836,0.659329,-0.421491,525.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12092,0.388742,-1.169608,0.056817,0.187700,0.610699,-0.654624,-0.048712,-0.743991,-0.268923,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12093,-1.618769,0.537162,0.530378,-0.491294,-1.074138,-1.096632,-0.601239,1.536404,1.169571,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12094,-0.792288,-0.697523,0.411988,-0.799034,-0.176838,1.186007,0.958836,-0.831699,-1.249715,484.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# This model using all features, with continuous vars normalized

# Define model
model = GaussianNB(var_smoothing=0.001)
model.fit(nX_train, y_train)

# Predict the class labels for the provided data
pred_labels = model.predict(nX_dev)

# Confusion Matrix
cm = confusion_matrix(y_dev, pred_labels)
print('Confusion Matrix: \n\n', cm, "\n")
print("Accuracy: %3.4f" %accuracy_score(y_dev, pred_labels))

Confusion Matrix: 

 [[ 55  23  29 109   1 145  74]
 [ 69  31  54  55   0 158  67]
 [ 12  11 103  68   1 190  38]
 [ 14   2  77 264   0  77  21]
 [ 16  11  82 104   0 176  36]
 [  6  11  84 109   0 194  24]
 [ 38  20  26  86   0 124 129]] 

Accuracy: 0.2566


### Dropping all binary features (GaussianNB)

The accuracy of the previous model using all features performed very poorly. Let's first drop all binary features, namely Soil Types and Wilderness Areas, as well as Horizontal and Vertical Distance to Hydrology (since we now have the Total Distance to Hydrology).

In [11]:
# Gaussian Naive Bayes - Continuous Data Only
# -----------------------------------------------------------------------------
def naive_bayes_con():
    # naive bayes cleaning/training/testing
    
    # Features to be used in model
#     feature_cols = ["Elevation", "Aspect", "Slope", "Total_Distance_To_Hydrology",
#                   "Horizontal_Distance_To_Roadways",
#                   "Horizontal_Distance_To_Fire_Points", "Hillshade_9am",
#                   "Hillshade_Noon", "Hillshade_3pm"]
    
    # Drop features
    drop = ['Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
           'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
           'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
           'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
           'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
           'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
           'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
           'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
           'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
           'Soil_Type39', 'Soil_Type40', "Horizontal_Distance_To_Hydrology",
            "Vertical_Distance_To_Hydrology", 'Wilderness_Area1', 'Wilderness_Area2',
           'Wilderness_Area3', 'Wilderness_Area4']

    nX_train = train.drop(drop, axis = 1)
    nX_dev = dev.drop(drop, axis = 1)
    
    # Using StandardScalar() to normalize data
    scaler = StandardScaler()
    nX_train = scaler.fit_transform(nX_train)
    nX_dev = scaler.transform(nX_dev)

    # Create new dataframes with normalized values of data
    y_train = train_labels.Cover_Type
    y_dev = dev_labels.Cover_Type

    # Generating initial model
    #smoothing_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    #for num in smoothing_list:

    model = GaussianNB(var_smoothing=0.00001)
    model.fit(nX_train, y_train)
    pred_y_train = model.predict(nX_train)
    pred_y_dev = model.predict(nX_dev)

    # Print the results
    print('\nInitial Train Accuracy:', model.score(nX_train, y_train))
    print('Initial Train F1 score:', metrics.f1_score(y_train, pred_y_train, average = 'weighted'), '\n')
    print('Initial Dev Accuracy:', model.score(nX_dev, y_dev))
    print('Initial Dev F1 score:', metrics.f1_score(y_dev, pred_y_dev, average = 'weighted'), '\n')
    
    # Confusion Matrix
    cm = confusion_matrix(dev_labels, pred_y_dev)
    print('Confusion Matrix: \n\n', cm, "\n")
    
    # Adjusting sigma to test results
    print("Mean sigma:", np.mean(model.sigma_), '\n')

    sigma = 0.79
    model.sigma_ = np.full((7,nX_train.shape[1]), sigma)
    train_accuracy = model.score(nX_train, y_train)
    dev_accuracy = model.score(nX_dev, y_dev)
    pred_y_train = model.predict(nX_train)
    pred_y_dev = model.predict(nX_dev)
    train_f1_score = metrics.f1_score(y_train, pred_y_train, average = 'weighted')
    dev_f1_score = metrics.f1_score(y_dev, pred_y_dev, average = 'weighted')

    # Print the results
    print('Train Accuracy for constant sigma =', sigma, ':', train_accuracy)
    print('Train F1 score for constant sigma = ', sigma, ':', train_f1_score, '\n')
    print('Dev Accuracy for constant sigma =', sigma, ':', dev_accuracy)
    print('Dev F1 score for constant sigma = ', sigma, ':', dev_f1_score, '\n\n')
    
    # Confusion Matrix
    cm = confusion_matrix(dev_labels, pred_y_dev)
    print('Confusion Matrix: \n\n', cm, "\n")


    
naive_bayes_con()


Initial Train Accuracy: 0.6025132275132276
Initial Train F1 score: 0.5911853094372181 

Initial Dev Accuracy: 0.6025132275132276
Initial Dev F1 score: 0.5909465797445023 

Confusion Matrix: 

 [[239  54   0   0  37   4 102]
 [119 164  13   1  95  17  25]
 [  0   6 164  93  54 106   0]
 [  0   0  28 378   0  49   0]
 [  3  80   6   0 305  31   0]
 [  0  14 101  65  43 205   0]
 [ 52   1   0   0   3   0 367]] 

Mean sigma: 0.786993279128291 

Train Accuracy for constant sigma = 0.79 : 0.4582506613756614
Train F1 score for constant sigma =  0.79 : 0.44551342430003094 

Dev Accuracy for constant sigma = 0.79 : 0.44775132275132273
Dev F1 score for constant sigma =  0.79 : 0.431144389067487 


Confusion Matrix: 

 [[151  64   3   0  96  15 107]
 [ 91 128  10   5 112  23  65]
 [  0  23  66 122  74 138   0]
 [  0   3  31 348   6  67   0]
 [ 30  86   9  36 235  26   3]
 [  0  29  39  96  53 211   0]
 [148   2   0   0  56   2 215]] 



It seems that using only the continuous data worked a lot better than before. However, I am sure there is a way to transform the binary data so that including the transformed columns will improve the model. Or, as I will attempt below, we may want to use the output of two models as input to a final model. 

Before this, however, let's drop the continuous features and model only the binary features.

### Dropping the continuous features (GaussianNB)

Now let's try dropping the continuous features. This way, the confusion matrix may help us determine specifically which classes are being mixed up based on soil type and wilderness area alone.

In [12]:
# Gaussian Naive Bayes - Binary Data Only using GAUSSIANNB
# -----------------------------------------------------------------------------
def naive_bayes_bin():
    # naive bayes cleaning/training/testing
    
    # Drop features
    drop = ["Elevation", "Aspect", "Slope", "Total_Distance_To_Hydrology",
            "Horizontal_Distance_To_Roadways", "Horizontal_Distance_To_Hydrology",
            "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Fire_Points",
            "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]

    nX_train = train.drop(drop, axis = 1)
    nX_dev = dev.drop(drop, axis = 1)
    print(nX_train.columns, "\n")

    # Create new dataframes with normalized values of data
    y_train = train_labels.Cover_Type
    y_dev = dev_labels.Cover_Type

    # Generating initial model
    #smoothing_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    #for num in smoothing_list:

    model = GaussianNB(var_smoothing=0.00001)
    model.fit(nX_train, y_train)
    pred_y_train = model.predict(nX_train)
    pred_y_dev = model.predict(nX_dev)

    # Print the results
    print('\nInitial Train Accuracy:', model.score(nX_train, y_train))
    print('Initial Train F1 score:', metrics.f1_score(y_train, pred_y_train, average = 'weighted'), '\n')
    print('Initial Dev Accuracy:', model.score(nX_dev, y_dev))
    print('Initial Dev F1 score:', metrics.f1_score(y_dev, pred_y_dev, average = 'weighted'), '\n')
    
    # Confusion Matrix
    cm = confusion_matrix(y_dev, pred_y_dev)
    print('Confusion Matrix for Dev: \n\n', cm, "\n")
    
    # Adjusting sigma to test results
    print("Mean sigma:", np.mean(model.sigma_), '\n')

    sigma = 0.028
    model.sigma_ = np.full((7,nX_train.shape[1]), sigma)
    train_accuracy = model.score(nX_train, y_train)
    dev_accuracy = model.score(nX_dev, y_dev)
    pred_y_train = model.predict(nX_train)
    pred_y_dev = model.predict(nX_dev)
    train_f1_score = metrics.f1_score(y_train, pred_y_train, average = 'weighted')
    dev_f1_score = metrics.f1_score(y_dev, pred_y_dev, average = 'weighted')
    

    # Print the results
    print('Train Accuracy for constant sigma =', sigma, ':', train_accuracy)
    print('Train F1 score for constant sigma = ', sigma, ':', train_f1_score, '\n')
    print('Dev Accuracy for constant sigma =', sigma, ':', dev_accuracy)
    print('Dev F1 score for constant sigma = ', sigma, ':', dev_f1_score, '\n\n')
    
    # Confusion Matrix
    cm = confusion_matrix(y_dev, pred_y_dev)
    print('Confusion Matrix for Dev: \n\n', cm, "\n")


    
naive_bayes_bin()

Index(['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40'],
      dtype='object') 


Initial Train Accuracy: 0.45849867724867727
Initial Train F1 score: 0.38043065166553397 

Initial Dev Accuracy: 0.4679232804232804
Initial Dev F1 score: 0.38680087268104096 

Confusion Matrix for Dev: 

 [[ 65   7   4   0 197 

### Dropping the continuous features (BernoulliNB)

Evidently, using a BernoulliNB model would be more appropriate in this case.

In [13]:
# Naive Bayes - Binary Data Only using BERNOULLINB
# -----------------------------------------------------------------------------
def naive_bayes_bern():
    # naive bayes cleaning/training/testing
    
    # Drop features
    drop = ["Elevation", "Aspect", "Slope", "Total_Distance_To_Hydrology",
            "Horizontal_Distance_To_Roadways", "Horizontal_Distance_To_Hydrology",
            "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Fire_Points",
            "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]

    nX_train = train.drop(drop, axis = 1)
    nX_dev = dev.drop(drop, axis = 1)
    print(nX_train.columns, "\n")

    # Create new dataframes with normalized values of data
    y_train = train_labels.Cover_Type
    y_dev = dev_labels.Cover_Type

    # Generating initial model
    #smoothing_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    #for num in smoothing_list:

    model = BernoulliNB(alpha=0.00001)
    model.fit(nX_train, y_train)
    pred_y_train = model.predict(nX_train)
    pred_y_dev = model.predict(nX_dev)

    # Print the results
    print('\nInitial Train Accuracy:', model.score(nX_train, y_train))
    print('Initial Train F1 score:', metrics.f1_score(y_train, pred_y_train, average = 'weighted'), '\n')
    print('Initial Dev Accuracy:', model.score(nX_dev, y_dev))
    print('Initial Dev F1 score:', metrics.f1_score(y_dev, pred_y_dev, average = 'weighted'), '\n')
    
    # Confusion Matrix
    cm = confusion_matrix(y_dev, pred_y_dev)
    print('Confusion Matrix for Dev: \n\n', cm, "\n")


    
naive_bayes_bern()

Index(['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40'],
      dtype='object') 


Initial Train Accuracy: 0.5794477513227513
Initial Train F1 score: 0.5748263952129261 

Initial Dev Accuracy: 0.5846560846560847
Initial Dev F1 score: 0.5806767674313877 

Confusion Matrix for Dev: 

 [[172 132   3   0  79   0

## Combining Bernoulli and Gaussian models

Let's look at creating two separate models -- one Bernoulli and one Gaussian. Then, we will use the probabilites from those models as input to a final Gaussian model.

In [14]:
# Creating new dataframes with original columns + Total_Distance_To_Hydrology
train2 = train_data.copy()
dev2 = dev_data.copy()

# Total_Distance_to_Hydrology
# ------------------------------------------------------------------------------
# Create Total_Distance_to_Hydrology based on Euclidean distance
train2['Total_Distance_To_Hydrology'] = np.sqrt(train_data["Horizontal_Distance_To_Hydrology"]**2 + train_data['Vertical_Distance_To_Hydrology']**2)
dev2['Total_Distance_To_Hydrology'] = np.sqrt(dev_data["Horizontal_Distance_To_Hydrology"]**2 + dev_data['Vertical_Distance_To_Hydrology']**2)
train2[["Total_Distance_To_Hydrology", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology"]].head(10)

Unnamed: 0_level_0,Total_Distance_To_Hydrology,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4429,0.0,0,0
12400,0.0,0,0
4648,215.564376,192,98
5954,0.0,0,0
2947,0.0,0,0
9253,205.548048,201,43
3472,384.480169,324,207
3079,313.180459,309,51
10572,296.325834,295,28
11892,212.084889,212,6


In [15]:
# Splitting the data into two different dataframes:

# Continuous Data Columns
continuous_cols =  ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Roadways",
                   "Horizontal_Distance_To_Fire_Points", "Total_Distance_To_Hydrology",
                   "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]

# Binary / Categorical Data Columns
categorical_cols = ['Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
                   'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
                   'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
                   'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
                   'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
                   'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
                   'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
                   'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
                   'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
                   'Soil_Type39', 'Soil_Type40', 'Wilderness_Area1', 'Wilderness_Area2',
                   'Wilderness_Area3', 'Wilderness_Area4']

# Defining new train and dev data sets split by above lists
# Continuous dataframes
Xtrain_G = train2[continuous_cols]
Xdev_G = dev2[continuous_cols]
# Categorical dataframes
Xtrain_C = train2[categorical_cols]
Xdev_C = dev2[categorical_cols]

In [16]:
# Sanity Check
Xdev_C.head()

Unnamed: 0_level_0,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,...,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5415,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
6658,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3551,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10748,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
def two_models_combined():
    
    # Make copies of the categorical dataframes and rename
    cX_train = Xtrain_C.copy()
    cX_dev = Xdev_C.copy()

    # Normalize continuous variables and create continuous dataframes
    scaler = StandardScaler()
    nX_train = scaler.fit_transform(Xtrain_G)
    nX_dev = scaler.transform(Xdev_G)
    
    # Define labels
    y_train = train_labels.Cover_Type
    y_dev = dev_labels.Cover_Type

    # Create a Categorical Model and a Gaussian Model
    model_G = GaussianNB()
    model_G.fit(nX_train, y_train)
    print("Gaussian Model Dev Accuracy: %3.4f" %model_G.score(nX_dev, y_dev))
    model_C = BernoulliNB()
    model_C.fit(cX_train, y_train)
    print("Bernoulli Model Dev Accuracy: %3.4f" %model_C.score(cX_dev, y_dev), '\n')

    # Get probability predictions for each model
    # On training data
    G_train_probs = model_G.predict_proba(nX_train)
    C_train_probs = model_C.predict_proba(cX_train)

    # On dev data
    G_dev_probs = model_G.predict_proba(nX_dev)
    C_dev_probs = model_C.predict_proba(cX_dev)

    # Combine probability prediction for class=1 from both models into a 2D array
    # np.c_ translates slice objects to concatenation along the second axis.
    X_new_train = np.c_[(G_train_probs[:,1], C_train_probs[:,1])] # Train
    X_new_dev = np.c_[(G_dev_probs[:,1], C_dev_probs[:,1])] # Dev

    # Fit final Gaussian model
    # Using the probabilities from the last two modes as input
    model = GaussianNB()
    clf = model.fit(X_new_train, y_train)

    # Predict class labels on dev data
    pred_labels_train = model.predict(X_new_train)
    pred_labels_dev = model.predict(X_new_dev)

    # Print results
    train_score = model.score(X_new_train, y_train)
    dev_score = model.score(X_new_dev, y_dev)
    print('Combined Model Train Accuracy: %3.4f' %train_score)
    print('Combined Model Dev Accuracy: %3.4f' %dev_score)
    
two_models_combined()

Gaussian Model Dev Accuracy: 0.6025
Bernoulli Model Dev Accuracy: 0.5827 

Combined Model Train Accuracy: 0.3357
Combined Model Dev Accuracy: 0.3528


While that worked better than simply throwing all the data into one model (as was done in the beginning), it still isn't better than when we removed the binary data columns.

### Creating a Multinomial Model (MultinomialNB)

Now, I'll take the continuous features and transform them into multinomial data.

In [2]:
# Creating thresholds based on quantiles

def define_thresholds(data, num_divisions):
    thresholds = np.zeros([len(data.columns), num_divisions-1])
    step = round(1/num_divisions,2)
    prev = 0
    bins = []
    for i in range(num_divisions-1):
        num = prev
        bins.append(num+step)
        prev = num+step
    print("Quantile Cutoffs:", bins)
    
    i = 0
    for item in Xtrain_G.columns:
        for j in range(len(bins)):
            thresholds[i][j] = int(data[item].quantile(bins[j]))
        i+=1
    return thresholds


# threshold = NxM array, N = number of columns, M = number of bins
# inputs: dataframe, threshold values
# outputs: new dataframe
def multifeature(data, thresholds):
    # capture column names
    features = list(data.columns)
    
    # initiate a new dataframe 
    new_df = data.copy()

    i=0
    # bin the data
    for feature in features:
        new_df[feature] = np.digitize(np.array(data[feature]), thresholds[i])
        i+=1
        
    return new_df

num_divisions = 4
thresholds = define_thresholds(Xtrain_G, num_divisions)
print("Thresholds for Train Data:\n", thresholds, '\n')
train_df = multifeature(Xtrain_G, thresholds)
dev_df = multifeature(Xdev_G, thresholds)

#thresholds = define_thresholds(Xdev_G, num_divisions)
#print("Thresholds for Dev Data:\n", thresholds)

# This will merge our newly created multinomial features with the original binary features
Xtrain_multi = pd.merge(train_df, Xtrain_C, left_on='Id', right_on='Id', how='left')
Xtrain_multi.shape

Xdev_multi = pd.merge(dev_df, Xdev_C, left_on='Id', right_on='Id', how='left')
Xdev_multi.shape

NameError: name 'Xtrain_G' is not defined

In [19]:
# Naive Bayes - using MultinomialNB
# -----------------------------------------------------------------------------
def naive_bayes_multi():

    # Create new dataframes with normalized values of data
    y_train = train_labels.Cover_Type
    y_dev = dev_labels.Cover_Type

    # Generating initial model
    smoothing_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    for num in smoothing_list:

        model = MultinomialNB(alpha=num)
        model.fit(Xtrain_multi, y_train)
        pred_y_train = model.predict(Xtrain_multi)
        pred_y_dev = model.predict(Xdev_multi)

        # Print the results
        print('\nInitial Train Accuracy:', model.score(Xtrain_multi, y_train))
        print('Initial Train F1 score:', metrics.f1_score(y_train, pred_y_train, average = 'weighted'), '\n')
        print('Initial Dev Accuracy:', model.score(Xdev_multi, y_dev))
        print('Initial Dev F1 score:', metrics.f1_score(y_dev, pred_y_dev, average = 'weighted'), '\n')

        # Confusion Matrix
        cm = confusion_matrix(y_dev, pred_y_dev)
        print('Confusion Matrix for Dev: \n\n', cm, "\n")

naive_bayes_multi()


Initial Train Accuracy: 0.6271494708994709
Initial Train F1 score: 0.626000443375661 

Initial Dev Accuracy: 0.6253306878306878
Initial Dev F1 score: 0.6257925845137267 

Confusion Matrix for Dev: 

 [[228 106   3   0  48   1  50]
 [ 89 245   7   0  77  13   3]
 [  0   2 194  53  38 136   0]
 [  0   0  51 380   0  24   0]
 [ 67  57  25   0 262  14   0]
 [  5  11  99  27  42 244   0]
 [ 39  35   0   0  11   0 338]] 


Initial Train Accuracy: 0.6271494708994709
Initial Train F1 score: 0.626000443375661 

Initial Dev Accuracy: 0.6253306878306878
Initial Dev F1 score: 0.6257925845137267 

Confusion Matrix for Dev: 

 [[228 106   3   0  48   1  50]
 [ 89 245   7   0  77  13   3]
 [  0   2 194  53  38 136   0]
 [  0   0  51 380   0  24   0]
 [ 67  57  25   0 262  14   0]
 [  5  11  99  27  42 244   0]
 [ 39  35   0   0  11   0 338]] 


Initial Train Accuracy: 0.6271494708994709
Initial Train F1 score: 0.626000443375661 

Initial Dev Accuracy: 0.6253306878306878
Initial Dev F1 score: 0.62579

## Transforming variables for testing

In [20]:
# Copy data to new variable
train3 = train_data.copy()
dev3 = dev_data.copy()

# Total_Distance_to_Hydrology
# ------------------------------------------------------------------------------
# Create Total_Distance_to_Hydrology based on Euclidean distance
train3['Total_Distance_To_Hydrology'] = np.sqrt(train_data["Horizontal_Distance_To_Hydrology"]**2 + train_data['Vertical_Distance_To_Hydrology']**2)
dev3['Total_Distance_To_Hydrology'] = np.sqrt(dev_data["Horizontal_Distance_To_Hydrology"]**2 + dev_data['Vertical_Distance_To_Hydrology']**2)


# Plotting features again
# Drop features (ONLY RUN ONCE)
drop = ['Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40', "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology", 'Wilderness_Area1', 'Wilderness_Area2',
       'Wilderness_Area3', 'Wilderness_Area4']

train4 = train3.drop(drop, axis = 1)
dev4 = dev3.drop(drop, axis = 1)

In [21]:
# PCA Analysis
from sklearn.decomposition import PCA

def pca_NB():
    # Normalize data
    # Using StandardScalar() to normalize data
    scaler = StandardScaler()
    nX_train = scaler.fit_transform(train4)
    nX_dev = scaler.transform(dev4)
    
    # Define labels
    y_train = train_labels.Cover_Type
    y_dev = dev_labels.Cover_Type

    pca = PCA(n_components='mle', svd_solver='full')

    # Create 2D projected data
    data2D = pca.fit_transform(nX_train)
    dev2D = pca.transform(nX_dev)
    print(data2D.shape)

    model = GaussianNB(var_smoothing=0.00001)
    model.fit(nX_train, y_train)
    pred_y_train = model.predict(nX_train)
    pred_y_dev = model.predict(nX_dev)

    # Print the results
    print('\nInitial Train Accuracy:', model.score(nX_train, y_train))
    print('Initial Train F1 score:', metrics.f1_score(y_train, pred_y_train, average = 'weighted'), '\n')
    print('Initial Dev Accuracy:', model.score(nX_dev, y_dev))
    print('Initial Dev F1 score:', metrics.f1_score(y_dev, pred_y_dev, average = 'weighted'), '\n')

    # Confusion Matrix
    cm = confusion_matrix(dev_labels, pred_y_dev)
    print('Confusion Matrix: \n\n', cm, "\n")

pca_NB()

(12096, 8)

Initial Train Accuracy: 0.6025132275132276
Initial Train F1 score: 0.5911853094372181 

Initial Dev Accuracy: 0.6025132275132276
Initial Dev F1 score: 0.5909465797445023 

Confusion Matrix: 

 [[239  54   0   0  37   4 102]
 [119 164  13   1  95  17  25]
 [  0   6 164  93  54 106   0]
 [  0   0  28 378   0  49   0]
 [  3  80   6   0 305  31   0]
 [  0  14 101  65  43 205   0]
 [ 52   1   0   0   3   0 367]] 

