In [None]:
#Load starting packages

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


**Forest Covertype Prediction Strategy**

1. Perform brief exploratory analysis of all predictor variables, look for missing data, distribution of response, correlations. 
2. Fit a baseline random forest model based on the raw data. Analysis goal will be to better this model. 
3. Measure feature importances based on baseline model. Further examine data correlations and relationships between predictors. Use this information to perform basic feature engineering such as predictor transformations, combinations, and the creation of additional predictors. 
4. Split training data into a training and validation set. Fit multiple multivariate classification models based on modified data, evaluate model performance and tune parameters based on results from validation data. 
5. Select the best fitting model and use it to predict on the test data. 


In [None]:
# Read the competition train/test CSVs (both indexed by 'Id') and keep the
# response column of the training frame as `target`.
train_raw, test_all = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '../input/forest-cover-type-prediction/train.csv',
        '../input/forest-cover-type-prediction/test.csv',
    )
)

target = train_raw['Cover_Type']

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the training frame.
train_raw.describe()

Soil_Type7 and Soil_Type15 never occur in the training data, so they are dropped from both the training and test sets. The good news is that there appear to be no missing values. 

In [None]:
# Remove the two soil-type indicator columns from both frames, in place,
# so train and test keep the same set of predictors.
for _frame in (train_raw, test_all):
    _frame.drop(columns=['Soil_Type7', 'Soil_Type15'], inplace=True)

In [None]:
# Column dtypes and non-null counts of the training frame after the drop above.
train_raw.info()

After loading the data and dropping the response from the training data, split the training data into training and validation sets (I use a 90/10 split). Make sure to stratify on the target in case the distribution of the response is skewed. 

Plot the distribution of cover types in the training data.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many training rows belong to each cover type.
plt.figure(figsize=(10,8))
y_target = np.array(target)
# Count the class labels themselves. The previous version rescaled the labels
# (label / n_rows * 100) before counting, which has no meaning for a count
# plot, and passed the array positionally, which newer seaborn releases
# interpret as `data=` rather than `x=`.
chart = sns.countplot(x=y_target,
    palette=('Set3')
)
# Cover types are coded 1..7; countplot orders numeric categories ascending,
# so the labels below line up with codes 1 through 7.
chart.set_xticklabels(["Spruce/Fir (1)","Lodgepole Pine (2)","Ponderosa Pine (3)","Cottonwood/Willow (4)","Aspen (5)",
                       "Douglas-fir (6)","Krummholtz (7)"],rotation=45)
chart.set_xlabel("Cover type")
chart.set_ylabel("Number of training rows")
chart.set_title("Distribution of cover types in the training data")
plt.show()

It looks like there is a perfectly even distribution of cover_types. I wonder if this distribution is maintained in the test data? 

Next, look at variable correlations. 

In [None]:
# Rank every pair of columns by the absolute value of their Pearson correlation.
corrMatrix = train_raw.corr()

# Keep only the strict upper triangle so each unordered pair appears exactly
# once and the trivial self-correlations on the diagonal are excluded.
# (De-duplicating on the correlation *value* instead would also discard
# distinct pairs that happen to share the same correlation, and would keep
# one diagonal entry of 1.0.)
upper_mask = np.triu(np.ones(corrMatrix.shape, dtype=bool), k=1)
corrAbs = corrMatrix.abs().where(upper_mask).stack().dropna()
corrSorted = corrAbs.sort_values(ascending = False)

corrSorted.head(50)

There are some high correlations among the Hillshade variables and among the distance-to-hydrology variables. This makes sense, since these variables seem interrelated. 

Before doing anything else, split the training data into training and validation sets, and evaluate performance on the untouched data. 

In [None]:
# Baseline: random forest on the raw predictors, scored on a stratified 10% hold-out.
train_all = train_raw.drop(columns=['Cover_Type'])

X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_all,
    target,
    train_size=0.9,
    test_size=0.1,
    random_state=5,
    stratify=target,
)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    max_features=0.5,
    bootstrap=True,
    n_jobs=-1,
    random_state=20,
)
rfc.fit(X_train, Y_train)

pred_valid_rf = rfc.predict(X_valid)

from sklearn import metrics
# Fraction of validation rows whose predicted cover type matches the true one.
print(metrics.accuracy_score(Y_valid, pred_valid_rf))

The raw data already performs extremely well on the validation data, with an accuracy of 89%. This is much higher than the scores people are reporting for test data accuracy, which leads me to suspect that the training data may not be representative of the test data (at least in covertype distribution). We also know that the test data is much larger than the training data. 

Next step: establish baseline performance on test data: 

In [None]:
# Predict cover types for the test frame with the baseline forest; the only
# visible consumer of this array is the commented-out submission cell that follows.
pred_test_rf = rfc.predict(test_all)

In [None]:
#output = pd.DataFrame({'Id': test_all.index,
#                       'Cover_Type': pred_test_rf})
#output.head()
#output.to_csv('submission.csv', index=False)

The test data performs much worse than the validation data, with results showing 74.7% accuracy. Given the roughly 14-percentage-point gap in accuracy between the test data and the validation data, will improving the training model (and validation score) actually lead to improvements in the test score? We'll find out. 

Next step: evaluate feature importance. 

In [None]:
# Feature importances of the baseline forest, one bar per predictor.
importance = rfc.feature_importances_

from matplotlib import pyplot
positions = list(range(len(importance)))
pyplot.figure(figsize=(18, 6))
pyplot.bar(positions, importance)
# Label each bar with the predictor it belongs to. `rfc` was fitted on
# X_train, so its importances follow X_train's column order.
pyplot.xticks(positions, X_train.columns, rotation=90)
pyplot.xlabel('Predictor')
pyplot.ylabel('Importance')
pyplot.title('Baseline random forest feature importances')
pyplot.show()

In [None]:
# Predictor names in column order, to map the bar positions in the importance plot back to features.
train_all.columns

Elevation has the largest share of variable importance, and the majority of the Soil Type variables contribute little to the model. 

Next, let's take a different look at variable correlations. 

In [None]:
from scipy import stats
from scipy.cluster import hierarchy as hc

# Hierarchically cluster the predictors using 1 - (Spearman rank correlation)
# as the distance, and draw the result as a left-oriented dendrogram.
spearman_rho = stats.spearmanr(train_all).correlation
corr = np.round(spearman_rho, 2)

# squareform turns the square distance matrix into the condensed vector
# that linkage takes as input.
condensed_dist = hc.distance.squareform(1 - corr)
cluster_tree = hc.linkage(condensed_dist, method='average')

plt.figure(figsize=(20,20))
hc.dendrogram(
    cluster_tree,
    labels=train_all.columns,
    orientation='left',
    leaf_font_size=14,
)
plt.show()

Again, we see strong relationships between Hillshade predictors, distance to Hydrology Predictors, Elevation/Distance to roadways and fire points, and certain Wilderness areas with some soil types. 

Scatter plots will be used to investigate some of these relationships and determine if variable transformation or combination would be appropriate.

In [None]:
from matplotlib import pyplot
# Elevation vs. horizontal distance to roadways, one point per training row,
# coloured by cover type. `colors` is reused by the scatter plots that follow.
colors = train_raw['Cover_Type']
pyplot.scatter(
    x=train_raw['Elevation'],
    y=train_raw['Horizontal_Distance_To_Roadways'],
    c=colors,
    cmap='Paired',
)

In [None]:
# Elevation vs. horizontal distance to fire points, coloured by cover type.
pyplot.scatter(
    x=train_raw['Elevation'],
    y=train_raw['Horizontal_Distance_To_Fire_Points'],
    c=colors,
    cmap='Paired',
)

In [None]:
# Distance to roadways vs. distance to fire points, coloured by cover type.
pyplot.scatter(
    x=train_raw['Horizontal_Distance_To_Roadways'],
    y=train_raw['Horizontal_Distance_To_Fire_Points'],
    c=colors,
    cmap='Paired',
)

In [None]:
# Sum of the three hillshade readings vs. elevation, coloured by cover type.
pyplot.scatter(
    x=(
        train_raw['Hillshade_3pm']
        + train_raw['Hillshade_Noon']
        + train_raw['Hillshade_9am']
    ),
    y=train_raw['Elevation'],
    c=colors,
    cmap='Paired',
)

In [None]:
# Vertical vs. horizontal distance to hydrology, coloured by cover type.
pyplot.scatter(
    x=train_raw['Vertical_Distance_To_Hydrology'],
    y=train_raw['Horizontal_Distance_To_Hydrology'],
    c=colors,
    cmap='Paired',
)

More notes on variables. Elevation shows the most distinct differences between covertype groups, indicating that a binned elevation predictor may be useful. Also since the test data is much larger than the training data, and performs differently, I'm not going to remove any predictors. Instead I'm going to focus on creating new ones that highlight similarities in the data. I also want to combine elevation with other predictors.

Note: I did not add all these new predictors at once. Instead I added a few at a time and then checked the results on the validation data. For most of the models, and especially the extra trees model, accuracy kept increasing as predictors were added. 

In [None]:
import math
# Original predictor columns (no response); the row-wise summary metrics at
# the end of the loop are computed over exactly these columns.
cols = list(train_all.columns)

# Add the same engineered predictors, in the same order, to the training and
# test frames (both are modified in place).
#
# Notes on the column names, kept unchanged so results stay reproducible:
#   * 'Road_Fire' is a sum and 'Road-Fire' a difference, but for the
#     elevation combos it is the other way round ('Ele-X' = sum,
#     'Ele_X' = difference).
#   * 'Elevation_Fire_Points' and 'Ele-Fire' are computed from the same
#     expression and therefore hold identical values.
for frame in [train_raw, test_all]:
    frame['Hillshade'] = frame['Hillshade_9am'] + frame['Hillshade_3pm'] + frame['Hillshade_Noon']
    # 50-unit elevation bins. Vectorised floor division gives the same integer
    # bin as math.floor(v / 50.0) for each row, without a Python-level loop
    # (the test frame is described in this notebook as much larger than train).
    frame['binned_elev'] = (frame['Elevation'] // 50).astype(int)
    frame['Elevation_Fire_Points'] = frame['Elevation']+frame['Horizontal_Distance_To_Fire_Points']
    frame['Road_Fire'] = frame['Horizontal_Distance_To_Roadways'] + frame['Horizontal_Distance_To_Fire_Points']
    frame['Road-Fire'] = frame['Horizontal_Distance_To_Roadways'] - frame['Horizontal_Distance_To_Fire_Points']
    frame['Ele_Road_Fire_Hydro'] = frame['Elevation'] + frame['Horizontal_Distance_To_Roadways']  + frame['Horizontal_Distance_To_Fire_Points'] + frame['Horizontal_Distance_To_Hydrology']
    frame['Ele-Road'] = frame['Elevation'] + frame['Horizontal_Distance_To_Roadways']
    frame['Ele_Road'] = frame['Elevation'] - frame['Horizontal_Distance_To_Roadways']
    frame['Ele-Fire'] = frame['Elevation'] + frame['Horizontal_Distance_To_Fire_Points']
    frame['Ele_Fire'] = frame['Elevation'] - frame['Horizontal_Distance_To_Fire_Points']
    frame['Ele_Hillshade'] = frame['Elevation'] - frame['Hillshade']
    frame['Ele-Hillshade'] = frame['Elevation'] + frame['Hillshade']
    # Combinations that do not involve elevation:
    frame['Soil_W1'] = frame['Soil_Type29'] + frame['Wilderness_Area1']
    frame['Soil_W4'] = frame['Wilderness_Area4'] + frame['Soil_Type3']
    frame['Hydrology_Total'] = abs(frame["Horizontal_Distance_To_Hydrology"])+abs(frame['Vertical_Distance_To_Hydrology'])
    # Row-wise summary metrics over the original predictors only (`cols`),
    # so the engineered columns above do not feed into them.
    frame["mean"] = frame[cols].mean(axis=1)
    frame["min"] = frame[cols].min(axis=1)
    frame["max"] = frame[cols].max(axis=1)
    frame["std"] = frame[cols].std(axis=1)

In [None]:
# Same forest and same split settings as the baseline, now fitted on the
# training frame that includes the engineered predictors.
train_all = train_raw.drop(columns=['Cover_Type'])

X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_all,
    target,
    train_size=0.9,
    test_size=0.1,
    random_state=5,
    stratify=target,
)

from sklearn.ensemble import RandomForestClassifier
rfc2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    max_features=0.5,
    bootstrap=True,
    n_jobs=-1,
    random_state=20,
)
rfc2.fit(X_train, Y_train)

pred_valid_rf2 = rfc2.predict(X_valid)

from sklearn import metrics
# Validation accuracy, directly comparable with the baseline forest's score.
print(metrics.accuracy_score(Y_valid, pred_valid_rf2))


Try combining classifiers

In [None]:
from catboost import CatBoostClassifier
# Gradient-boosted trees (CatBoost) on the engineered training split.
cbc = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=3,
    od_wait=1000,  # NOTE(review): no eval_set is passed to fit(); confirm this setting has any effect
    eval_metric='Accuracy',
    random_state=20,
    verbose=1000,
)
cbc.fit(X_train, Y_train)

# Score the hold-out predictions.
pred_valid_cbc = cbc.predict(X_valid)
print(metrics.accuracy_score(Y_valid, pred_valid_cbc))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees classifier on the engineered training split. Its test-set
# predictions are the ones written to the submission file.
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is deprecated and no
# longer accepted by recent scikit-learn releases.
etc = ExtraTreesClassifier(random_state = 20, n_jobs = -1, max_features = 'sqrt')

etc.fit(X_train, Y_train)
pred_valid_etc = etc.predict(X_valid)
pred_test_etc = etc.predict(test_all)

# Validation accuracy of the extra-trees model.
print(metrics.accuracy_score(Y_valid, pred_valid_etc))

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees (XGBoost) on the engineered training split.
# `verbosity` is the constructor-level logging switch; `verbose` is not a
# constructor parameter and was only forwarded as an unknown keyword.
xgb = XGBClassifier(random_state=20, learning_rate = 0.4, objective ='multi:softprob', num_class = 7, eval_metric= 'merror',
                   verbosity = 0)

# Cover types are coded 1..7, but recent XGBoost releases require class labels
# 0..n_classes-1 and raise on anything else. Shift down for fitting and shift
# the predictions back up so they stay comparable with Y_valid.
xgb.fit(X_train, Y_train - 1)
pred_valid_xgb = xgb.predict(X_valid) + 1
print(metrics.accuracy_score(Y_valid, pred_valid_xgb))

In [None]:
import lightgbm as lgb

# Gradient-boosted trees (LightGBM) on the engineered training split.
# NOTE(review): max_depth=-5 is negative, which presumably means "no depth limit"; confirm intent.
lb = lgb.LGBMClassifier(
    learning_rate=0.09,
    max_depth=-5,
    random_state=42,
    application='multiclass',
)

lb.fit(X_train, Y_train)

# Score the hold-out predictions.
pred_valid_lb = lb.predict(X_valid)
print(metrics.accuracy_score(Y_valid, pred_valid_lb))

In [None]:
# Build the submission table (one row per test Id) from the extra-trees
# predictions and write it to submission.csv.
output = pd.DataFrame({'Id': test_all.index,
                       'Cover_Type': pred_test_etc})
output.to_csv('submission.csv', index=False)
# Keep the preview as the last expression so the notebook actually displays it;
# placed before to_csv it was evaluated and silently discarded.
output.head()