In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the names of the entries found in ../input, the directory the CSV
# files are read from in the next code cell.
print(os.listdir("../input"))

Load the 'train.csv' and 'test.csv' files

In [None]:
# Read the training and test tables from the input directory.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Last expression of the cell: display the first rows of the training frame.
train_df.head()

# Numerical features

### Get a quick description of the columns containing numerical features

In [None]:
# Summary statistics for the columns from Elevation through
# Horizontal_Distance_To_Fire_Points (label-based, inclusive slice) of the train set.
train_numerical_cols = train_df.loc[:, 'Elevation':'Horizontal_Distance_To_Fire_Points']
train_numerical_cols.describe()

In [None]:
# Same summary statistics as for the train set, computed on the test set so the
# two tables can be compared column by column.
test_numerical_cols = test_df.loc[:, 'Elevation':'Horizontal_Distance_To_Fire_Points']
test_numerical_cols.describe()

**The train set is not a perfect sample of the test set.** For example the Horizontal_Distance_To_Hydrology has mean ~227 in the train set and mean ~270 in the test set. The Elevation has std ~417 in the train set and ~273 in the test set.

Check the distribution of numerical features in the train and test set.

In [None]:
# One panel per column in train_df.columns[1:11]; each panel shows the train
# distribution (left violin) next to the test distribution (right violin).
plt.figure(figsize=(20, 30))

# Face colours in plotting order: train first, test second.
violin_colors = ('#004488', '#FF4400')

for panel, feature in enumerate(train_df.columns[1:11], start=1):
    plt.subplot(5, 2, panel)
    violins = plt.violinplot([train_df[feature], test_df[feature]])

    # Apply the same opaque, black-edged styling to both violin bodies.
    for body, face_color in zip(violins['bodies'], violin_colors):
        body.set_facecolor(face_color)
        body.set_edgecolor('black')
        body.set_alpha(1)

    plt.ylabel(feature, fontsize=12)
    plt.yticks(fontsize=12)
    plt.xticks([1, 2], ['train', 'test'], fontsize=12)

plt.show()

From the plots there is another confirmation that Elevation, Horizontal_Distance_To_Roadways and Horizontal_Distance_To_Fire_Points have **strongly different distributions in the train and test set**.

### Check the distribution of features for different cover types

In [None]:
# Drop the columns at positions 11 up to (but excluding) the last one, then
# summarise the remaining columns separately for every Cover_Type value.
train_df.drop(columns=train_df.columns[11:-1]).groupby('Cover_Type').describe()

In [None]:
# One panel per column in train_df.columns[1:11]: distribution of that column
# within each Cover_Type of the train set.
plt.figure(figsize=(20, 30))

for panel, feature in enumerate(train_df.columns[1:11], start=1):
    ax = plt.subplot(5, 2, panel)
    sns.violinplot(x='Cover_Type', y=feature, data=train_df, ax=ax)

    # Use the same font size for tick labels and axis labels.
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    for axis in (ax.xaxis, ax.yaxis):
        axis.label.set_size(12)

plt.show()

The elevation and the distances from hydrology, roads, and firepoints show clear **distinctions between classes**.

Note that the global distribution of the Elevation in the train set has a peak at ~2250, where significant contributions can come only from Cover_Type 4, 3 and (to a lesser extent) 6. Since the global distribution of the Elevation in the test set does not have a peak at ~2250, **this could mean that there are very few test entries with Cover_Type 4 or 3, or that such classes have a very different distribution in the test set**.

As a further consideration, the train set has exactly 2160 entries for each Cover_Type. **This could imply that the train set has been artificially balanced** and that the proportions of the different Cover_Types don't reflect the proportions in the test set.

### Visualization through PCA

Let's visualize clusters in 2D using PCA on all continuous features. First we need to normalize the columns, because they have very different scales.

In [None]:
# Standardise the columns at positions 1..10 of the train set: subtract each
# column's mean, then divide by the standard deviation of the centred column.
numerical_raw = train_df.iloc[:, 1:11]
centered = numerical_raw - numerical_raw.mean()
train_numerical_feat_df = centered / centered.std()

# Re-attach the class label (left unscaled) so later cells can colour by it.
train_numerical_feat_df['Cover_Type'] = train_df['Cover_Type']

train_numerical_feat_df.describe()

Now do PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the first ten columns of the standardised frame
# (the Cover_Type label in the last column is excluded).
n_comp = 2
standardised_features = train_numerical_feat_df.iloc[:, 0:10]
pca = PCA(n_components=n_comp)
pca.fit(standardised_features)

# Fraction of the total variance captured by the retained components.
explained_total = sum(pca.explained_variance_ratio_)
print('Total variance explained by the first %d principal components = %f ' % (n_comp, explained_total))

2 principal components are explaining only half of the variability in the data.

Let's try nevertheless to plot data according to the first 2 principal components.

In [None]:
from matplotlib.colors import ListedColormap

# Project the standardised numerical features onto the two fitted principal components.
feat2D = pca.transform(train_numerical_feat_df.iloc[:, 0:10])

# Discrete colormap built from the first 7 colours of the "Set1" palette,
# one colour per cover type.
old_cmap = plt.get_cmap("Set1")
my_cmap = ListedColormap(old_cmap.colors[:7])

colors = train_numerical_feat_df['Cover_Type']

plt.figure(figsize=(15, 10))
# vmin/vmax of 0.5/7.5 split the colour range into 7 unit-wide bands, so each
# integer label from 1 to 7 falls in the middle of its own colour band.
plt.scatter(x=feat2D[:, 0], y=feat2D[:, 1], c=colors, cmap=my_cmap, vmin=0.5, vmax=7.5)

# Put one colorbar tick on each class label and say what the colours encode.
cbar = plt.colorbar(ticks=list(range(1, 8)))
cbar.set_label('Cover_Type', fontsize=12)

plt.xlabel('First principal component', fontsize=12)
plt.ylabel('Second principal component', fontsize=12)
plt.show()

There is some clustering of the classes (e.g. brown in the bottom-left, green in the top-right, ...) but there is still considerable overlap.

How many components are needed to account for 90% of the total variability?

In [None]:
# Fit a PCA that keeps ten components on the ten standardised numerical columns,
# so the explained-variance ratios can be accumulated component by component.
n_comp = 10
pca10 = PCA(n_components=n_comp)
pca10.fit(train_numerical_feat_df.iloc[:, 0:10])

print('Explained variance ratio for each component')
print(pca10.explained_variance_ratio_)

# Share of the total variance that the retained components should explain.
variance_threshold = 0.9

x = range(1, len(pca10.explained_variance_ratio_) + 1)
cumulative_ratios = np.cumsum(pca10.explained_variance_ratio_)

# The cumulative ratios are non-decreasing, so searchsorted returns the index of
# the first entry >= threshold; +1 converts that index to a component count.
n_needed = int(np.searchsorted(cumulative_ratios, variance_threshold)) + 1
print('Components needed to explain %d%% of the variance: %d' % (round(100 * variance_threshold), n_needed))

plt.figure(figsize=(15, 10))
plt.plot(x, cumulative_ratios)
# Dashed guides: the threshold (horizontal) and the first component count that
# reaches it (vertical), derived from the data instead of being hard-coded.
plt.axhline(variance_threshold, linestyle='dashed', color='C1')
plt.axvline(n_needed, linestyle='dashed', color='C2')
plt.xlabel('Number of principal components N', fontsize=12)
plt.ylabel('Explained variance for the first N components', fontsize=12)
plt.xticks(x, fontsize=12)
plt.yticks(fontsize=12)
plt.show()

At least 6 components are needed to account for 90% of the total variability.

What are the most important combinations? (this can be useful for feature extraction)

In [None]:
# Print the coefficient vectors of the first two principal components, one per line.
for component in pca10.components_[:2]:
    print(component)

Do some feature engineering

Possible features can be the total distance from water (not only horizontal or vertical), the average distance from disturbing roads/fire, the average hillshade, ...

In [None]:
# The helpers below only use elementwise operations, so `row` may be either a
# single row (e.g. from DataFrame.apply(axis=1)) or a whole DataFrame; in the
# latter case each helper returns one value per row.

def dist_hyd(row):
    """Straight-line distance to hydrology from its horizontal and vertical parts."""
    return np.sqrt(row['Horizontal_Distance_To_Hydrology']**2 + row['Vertical_Distance_To_Hydrology']**2)

def avg_dist_bad(row):
    """Mean of the horizontal distances to fire points and to roadways."""
    return 0.5*(row['Horizontal_Distance_To_Fire_Points'] + row['Horizontal_Distance_To_Roadways'])

def min_dist_bad(row):
    """Smaller of the horizontal distances to fire points and to roadways."""
    return np.minimum(row['Horizontal_Distance_To_Fire_Points'], row['Horizontal_Distance_To_Roadways'])

def avg_shade(row):
    """Mean of the three hillshade readings (9am, noon, 3pm)."""
    return (row['Hillshade_9am'] + row['Hillshade_Noon'] + row['Hillshade_3pm'])/3.0

def min_shade(row):
    """Smallest of the three hillshade readings (9am, noon, 3pm)."""
    return np.minimum(np.minimum(row['Hillshade_9am'], row['Hillshade_Noon']), row['Hillshade_3pm'])

def max_shade(row):
    """Largest of the three hillshade readings (9am, noon, 3pm)."""
    return np.maximum(np.maximum(row['Hillshade_9am'], row['Hillshade_Noon']), row['Hillshade_3pm'])

# assign() returns a new frame (train_df is left untouched) and calls each helper
# once on the whole frame, replacing six row-by-row apply(axis=1) passes with
# vectorised column arithmetic. Columns are appended in the order listed here.
# Note: the min/max columns keep the dtype of their input columns instead of
# being upcast to float as the row-wise version did; the values are the same.
ext_train_df = train_df.assign(
    Distance_To_Hydrology=dist_hyd,
    Average_Distance_To_Bad_Points=avg_dist_bad,
    Min_Distance_To_Bad_Points=min_dist_bad,
    Average_Shade=avg_shade,
    Min_Shade=min_shade,
    Max_Shade=max_shade,
)

ext_train_df.head()

In [None]:
# Columns added by the feature-engineering cell, in plotting order.
engineered_features = (
    'Distance_To_Hydrology',
    'Average_Distance_To_Bad_Points',
    'Min_Distance_To_Bad_Points',
    'Average_Shade',
    'Min_Shade',
    'Max_Shade',
)

# One panel per engineered feature: its distribution within each Cover_Type.
plt.figure(figsize=(20, 18))

for panel, feature in enumerate(engineered_features, start=1):
    ax = plt.subplot(3, 2, panel)
    sns.violinplot(x='Cover_Type', y=feature, data=ext_train_df, ax=ax)

    # Use the same font size for tick labels and axis labels.
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    for axis in (ax.xaxis, ax.yaxis):
        axis.label.set_size(12)

plt.show()

# Categorical features

### Get a quick description
Compute the frequency of different wilderness areas or soil types

In [None]:
# Column totals from Wilderness_Area1 through Soil_Type40 divided by the number
# of train rows, shown as a single-row table.
train_category_totals = train_df.loc[:, 'Wilderness_Area1':'Soil_Type40'].sum()
(train_category_totals / len(train_df)).to_frame().T

In [None]:
# Column totals from Wilderness_Area1 through Soil_Type40 divided by the number
# of test rows, shown as a single-row table for comparison with the train set.
test_category_totals = test_df.loc[:, 'Wilderness_Area1':'Soil_Type40'].sum()
(test_category_totals / len(test_df)).to_frame().T

**observations**:
* Soil_Type7 and Soil_Type15 never appear in the training set -> discard them for training (note that these two soil types also appear very rarely in the test set)
* Other soil types (e.g. Soil_Type8, Soil_Type9, ...) occur with very low frequency

Check the frequency of different categories in each class

In [None]:
# Per-class frequency of every category column. The group mean is sum / group
# size, so it gives the same numbers as dividing the per-class sums by 2160
# while the classes hold 2160 rows each, and stays correct if they do not.
train_df.loc[:, 'Wilderness_Area1':'Cover_Type'].groupby('Cover_Type').mean()

In [None]:
# Category columns (wilderness areas and soil types) of the train set.
category_cols = train_df.loc[:, 'Wilderness_Area1':'Soil_Type40']

# Per-class frequencies: group mean = per-class sum / per-class row count, so no
# class size has to be hard-coded.
per_class_freq = category_cols.groupby(train_df['Cover_Type']).mean()

# Frequencies over the whole train set, as a single row labelled 'all'.
overall_freq = category_cols.mean().to_frame(name='all').T

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead.
frequencies = pd.concat([per_class_freq, overall_freq])

frequencies

Plot the distributions

In [None]:
# Stacked horizontal bars for the first four columns of the frequency table
# (presumably the Wilderness_Area indicators), one bar per row of the table.
ax = frequencies.iloc[:, 0:4].plot.barh(figsize=(15, 10), stacked=True)
# Anchor the legend just outside the right edge so it does not cover the bars.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

In [None]:
# Stacked horizontal bars for the remaining columns of the frequency table
# (presumably the Soil_Type indicators), one bar per row of the table.
ax = frequencies.iloc[:, 4:].plot.barh(figsize=(20, 10), stacked=True)
# Anchor the legend just outside the right edge so it does not cover the bars.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()