In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents:
     - Exploratory Analysis
     - Data Cleaning
     - Outlier Detection
     - Feature Engineering
     - Modeling

# Imports

In [None]:
import numpy as np
import pandas as pd
import zipfile
import seaborn as sb
import matplotlib.pyplot as plt
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

# visualization configurations
%matplotlib inline
# First colour of the current seaborn palette; passed as `color` to some plots below.
base_color = sb.color_palette()[0]
# Two figure sizes (width, height) handed to `figsize` by the plotting cells.
standard = [14.70, 8.27]
panorama = [20,8.27]
# Default font size for all matplotlib text.
plt.rcParams.update({'font.size': 12})


## Extracting data files

In [None]:
# Unpack every zip archive found under the Kaggle input directory into a
# local folder, from which the CSV files are read later.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        # Skip files that are not zip archives (e.g. plain CSVs); opening them
        # with zipfile.ZipFile would raise BadZipFile and stop the notebook.
        if not zipfile.is_zipfile(archive_path):
            continue
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall('./kaggle/input')


In [None]:
# Folder the archives were extracted into, and the CSV file names read from it.
data_base_path = './kaggle/input/'
train_path = 'train.csv'
test_path = 'test.csv'

## Loading data

In [None]:
# Read the training CSV, report its (rows, columns) shape and preview the first rows.
train_data = pd.read_csv(os.path.join(data_base_path,train_path))
print(train_data.shape)
train_data.head(5)

The training data set has 15,120 rows, with 55 features and 1 label.

Let's start the exploratory analysis to find out more about the data set.

# Exploratory Analysis

Printing the feature names and checking whether any values are missing

In [None]:
train_data.info()

In [None]:
# Total number of null cells across every column of the training frame.
missing_values = train_data.isna().to_numpy().sum()
print("There are {} missing values".format(missing_values))

 - Apparently there are no missing values.
 - Id is just an index, we can drop it.
 - It appears that all the features are just integers. But from reading the data description we know that:
     - Soil_Types --> binary features (0 = absence or 1 = presence)
     - Wilderness_Areas  --> binary feature (0 = absence or 1 = presence)
     - (Elevation, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Horizontal_Distance_To_Fire_Points) --> integers represent distances
     - Slope, Aspect --> integer represent angle in degrees.
     - Hillshade_ --> Hillshade index (0 to 255 index).
     
 

#### Note:
Hillshade definition: "The hillshade function produces a grayscale 3D representation of the terrain surface, with the sun's relative position taken into account for shading the image. Hillshading is a technique for visualizing terrain determined by a light source and the slope and aspect of the elevation surface. It is a qualitative method for visualizing topography and does not give absolute elevation values." More on hillshade [here](https://pro.arcgis.com/en/pro-app/2.7/help/analysis/raster-functions/hillshade-function.htm)

## correlation matrix

The fastest way to get some insight is to plot the correlation matrix, as long as the number of features allows it. In our case, it is easy to plot a 56x56 matrix.

In [None]:
# Number of columns in the frame, i.e. the side length of the correlation matrix.
feature_size = train_data.shape[1]
f = plt.figure(figsize=panorama)
# Draw the pairwise correlation matrix into the figure created above.
plt.matshow(train_data.corr(), fignum=f.number)
# Ticks are labelled with column positions (0..feature_size-1), not column names.
plt.xticks(range(feature_size), range(feature_size), fontsize=8, rotation=90)
plt.yticks(range(feature_size), range(feature_size), fontsize=8)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

We can see that there are 3 features that have zero correlation with all other features. They seem to have a constant value.

### Univariate & Bivariate Exploration



#### 1 - Binary features

Since their number is high, we can get intuition about their distribution from numbers faster than visualizing histograms. 

In [None]:
def get_common_prefix_columns(data, prefix):
    """
    Collect the column names of `data` that match `prefix`.

    A column matches when every character of `prefix` occurs somewhere in its
    name; the order and position of the characters are not checked.
    NOTE(review): this is looser than a real prefix/substring test - confirm
    that is intended before using it with short or generic search terms.
    """
    wanted_chars = set(prefix)
    matches = []
    for column in data.columns:
        if wanted_chars <= set(column):
            matches.append(column)
    return matches


def get_binary_stats(data, prefix):
    """
    Print, for every column of `data` selected by `prefix`, the share of rows
    taking each distinct value.

    Intended for scanning the distribution of binary (0/1) features. One line
    is printed per column: the column name followed by a {value: fraction} dict.
    """
    for column in get_common_prefix_columns(data, prefix):
        shares = data[column].value_counts(normalize = True)
        print(column, shares.to_dict())

In [None]:
get_binary_stats(train_data, prefix = 'Wilderness_Area')

 - Wilderness_Area2 is the least common area.
 - Area3 and Area4 are the two most common areas.

In [None]:
get_binary_stats(train_data, prefix = 'Soil_Type')

 - Soil types 7 and 15 are never present. Therefore we can ignore these 2 features.
 - A lot of Soil features are zero in 99+ % of the rows. These features may be useless for predicting the Cover Type. 

#### 2 - Hillshade features.

Since they can take any value between 0 and 255, it is better to visualize them with histograms.

In [None]:
# One 25-bin histogram per Hillshade column.
train_data.hist(get_common_prefix_columns(train_data, prefix = 'Hillshade'), bins = 25);
plt.tight_layout()

 - we can see how the mean value is nearly the same in 9am and Noon, but shifted at 3pm. Let's see the absolute numbers.

In [None]:
train_data[get_common_prefix_columns(train_data, prefix = 'Hillshade')].describe()

It will be interesting to see if the shift happens in all cover types (our label)?

Let's print the mean for each Hillshade and for cover type. 

In [None]:
# Mean of each Hillshade column per cover type (select the columns first, then aggregate).
train_data.groupby('Cover_Type')[get_common_prefix_columns(train_data, prefix = 'Hillshade')].mean()

- We can see that the Hillshade mean shifts for all the Cover Types.

Let's see the features' distributions and their summary statistics (mean, IQR) in a violin plot. We can build a function and call it for each feature.

In [None]:
def draw_box_plot_per_label(data, feature, label):
    """
    Draw a violin plot of `feature` for every value of `label`, with the
    per-label mean overlaid as a black dot.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    feature : str
        Column plotted on the y axis.
    label : str
        Column whose values define the groups on the x axis.
    """
    plt.figure(figsize = standard)
    Means = data.groupby(label)[feature].mean()
    # Take the grouping column from `data` as well; it used to be read from the
    # global train_data, which mixed two frames whenever another frame was passed.
    sb.violinplot(y = data[feature], x = data[label], inner='quartile');
    # assumes the violins are drawn in the same (sorted) order as the groupby result - TODO confirm
    plt.scatter(x=range(len(Means)),y=Means,c="k")

In [None]:
draw_box_plot_per_label(train_data, 'Hillshade_9am', 'Cover_Type')

In [None]:
draw_box_plot_per_label(train_data, 'Hillshade_Noon', 'Cover_Type')

In [None]:
draw_box_plot_per_label(train_data, 'Hillshade_3pm', 'Cover_Type')

    - The distribution per Cover Type do not tell much.

### 3 - Distances

In [None]:
get_common_prefix_columns(train_data, prefix = 'Distance')

From the feature names, we can see that the first two are related; they are probably better visualized as 2D points.

In [None]:
# Horizontal vs. vertical distance to hydrology, one semi-transparent point per row.
plt.figure(figsize = standard)
sb.scatterplot(x = train_data['Horizontal_Distance_To_Hydrology'], y=train_data['Vertical_Distance_To_Hydrology'], alpha = 0.5);

    - distances in the range of [0,400] appear more than other values in the data-set
    - vertical and horizontal distances seem to be directly proportional. But let's add more flavour to the plot and see how these 2 features interact with the Cover Type.

In [None]:
# Same scatter as before, with each point coloured by its Cover_Type.
plt.figure(figsize = standard)
sb.scatterplot(x = train_data['Horizontal_Distance_To_Hydrology'], y=train_data['Vertical_Distance_To_Hydrology'],
                hue = train_data['Cover_Type'], palette = 'tab10');

    - Almost all of type 3 clustered in x = [0,400] and y =[0,300] --> smaller distances than other types.
    - Types [1,2,7] appear over the whole x values.
    - There are 3 instances of type 2 that have larger y values than the rest of the data-set. This may need further investigation.

Now, we still have horizontal distance to roadways, fire points. Since they are horizontal let's add horizontal distance to hydrology to the comparison.

In [None]:
Horizontal_distances = get_common_prefix_columns(train_data, prefix = 'Horizontal')

In [None]:
train_data[Horizontal_distances].describe()

    - In general, Hydrology is the nearest of these three distances. 
    - The maximum distances indicate that there may be some outliers. --> let's investigate with the histogram and box plot of each distance.

In [None]:
def draw_hist_box_distances(distance, data=None):
    """
    Plot a histogram and a box plot of one column side by side.

    Parameters
    ----------
    distance : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        `train_data`, which is the frame this function always used before.
    """
    if data is None:
        data = train_data
    fig, ax =plt.subplots(1,2, figsize = standard)
    sb.histplot(x = data[distance], ax = ax[0]);
    sb.boxplot(x = data[distance], ax = ax[1]);
    plt.tight_layout()

In [None]:
draw_hist_box_distances('Horizontal_Distance_To_Hydrology')

In [None]:
draw_hist_box_distances('Horizontal_Distance_To_Fire_Points')

In [None]:
draw_hist_box_distances('Horizontal_Distance_To_Roadways')

    - The three distances are right skewed (most values are small, with a long tail of large distances), regardless of their order of magnitude.
    - The three distances have outliers. --> we can make sure later by the features Z-score.

After exploring the horizontal distances individually (univariate), let's explore their interaction with Cover Type (bivariate).

Since we need to visualize the distribution of each distance for each cover type, we will use violin plots. We can draw an additional point at the mean value for each cover type to make the plot more informative.

In [None]:
draw_box_plot_per_label(train_data, 'Horizontal_Distance_To_Hydrology', 'Cover_Type')

     - Cover Type 4 looks the most unique and the most probable to be if the value is between [0,100]

In [None]:
draw_box_plot_per_label(train_data, 'Horizontal_Distance_To_Roadways', 'Cover_Type')

    - This feature can help us eliminate the Cover Types 3,4,6 if the distance to roadways exceeds 4000.

In [None]:
draw_box_plot_per_label(train_data, 'Horizontal_Distance_To_Fire_Points', 'Cover_Type')

    - These three plots gives the intuition that mean distance of the three distances could be a useful feature to be engineered.

### 4- Elevation

We will start with the feature statistics and a histogram.

In [None]:
train_data['Elevation'].describe()

In [None]:
# Histogram of Elevation over the whole training set.
plt.figure(figsize = standard)
sb.histplot(x = train_data['Elevation'])

    - The histogram looks like following tri-modal distribution. Yet, visualizing the distribution per each Cover Type may help more.

In [None]:
# Kernel density estimate of Elevation, one curve per Cover_Type.
plt.figure(figsize = standard)
sb.kdeplot(x = train_data['Elevation'], hue = train_data['Cover_Type'], palette = 'tab10');

    - Elevation seems like the perfect feature to differentiate between Cover Types 4,5,7, as their kernel density estimates almost do not overlap.
    - It now appears that the distributions of Cover Types 4,5,7 are the reason for the trimodal distribution seen in the histogram above.

### 5- Slope, Aspect

In [None]:
degree_features = ['Slope','Aspect']

In [None]:
train_data[degree_features].describe()

     - The five-number summary describes each feature on its own. We can see they have different scales: Slope is in the order of tens, while Aspect is in the order of hundreds.
     - We can go further and plot their distributions to investigate how each feature's distribution looks. We will use kernel density estimation, as it makes it easier to show more than one density in a single plot, so we can plot the distribution of each feature for each Cover Type.

In [None]:
# Kernel density estimate of Slope, one curve per Cover_Type.
plt.figure(figsize = standard)
sb.kdeplot(x = train_data['Slope'], color = base_color, hue = train_data['Cover_Type'], palette ='tab10');

     - We can see different distribution for each Cover Type. Some are left skewed bell curves and others are bimodal.

In [None]:
# Kernel density estimate of Aspect, one curve per Cover_Type.
plt.figure(figsize = standard)
sb.kdeplot(x = train_data['Aspect'], color = base_color, hue = train_data['Cover_Type'], palette ='tab10');

     - The Aspect follow bimodal distribution for all Cover Types.

# Data cleaning

There is not much to do in cleaning the data; it is already clean. But maybe:
        - remove the data index and the features that have zero correlation with the label.

In [None]:
def drop_features(data, features):
    """
    Return `data` without the columns listed in `features`.

    Names that are not columns of `data` are ignored. When none of the names
    match, the input frame itself is returned (no copy is made).
    """
    present = [name for name in dict.fromkeys(features) if name in data.columns]
    if not present:
        return data
    return data.drop(columns = present)

In [None]:
# Drop the row identifier and three soil-type columns.
# NOTE(review): the analysis above only reports Soil_Type7 and Soil_Type15 as never
# present - confirm why Soil_Type8 is dropped as well.
train_data = drop_features(train_data, ['Id','Soil_Type7','Soil_Type8','Soil_Type15'])

# Outlier Detection

Using Z-score with the continuous features to detect outliers. We will keep any data point that lies between +/-3 standard deviations from the mean value.

3 standard deviations from the mean means that --> about 99.7% of the data points lie within +/- 3 standard deviations (for normally distributed data)

In [None]:
# Continuous columns to screen for outliers: every distance column plus the
# terrain measurements. The search term is 'Distance' (singular), the text that
# actually appears in the column names; 'Distances' only worked because the
# helper compares sets of characters rather than the text itself.
features = get_common_prefix_columns(train_data, 'Distance')
others = ['Elevation','Slope','Aspect']

features.extend(others)

features

In [None]:
# Build a frame holding each screened feature next to its z-score
# (population standard deviation, ddof=0).
# `features[:-1]` leaves out the last entry of the list.
# NOTE(review): with the list built above that is 'Aspect' - confirm the exclusion is deliberate.
zscore_parts = {}
for feature in features[:-1]:
    column = train_data[feature]
    zscore_parts[feature] = column
    zscore_parts[feature+"_zscore"] = (column - column.mean()) / column.std(ddof=0)
data_zscore = pd.DataFrame(zscore_parts)

In [None]:
# Flag (1/0) every value whose z-score magnitude is greater than 3.
for feature in features[:-1]:
    data_zscore[feature+"_outlier"] = data_zscore[feature+"_zscore"].abs().gt(3).astype(int)

In [None]:
data_zscore.columns

In [None]:
# Index labels of every row flagged as an outlier in at least one "*outlier*" column.
outlier_columns = [col for col in data_zscore.columns if "outlier" in col]
flagged_rows = data_zscore[outlier_columns].eq(1).any(axis=1)
indicies_set = set(data_zscore.index[flagged_rows].to_list())

In [None]:
len(list(indicies_set)) / train_data.shape[0]

Nearly 6% of the data is marked as having at least one outlier in the mentioned features, using the Z-score with 3 standard deviations from the mean.

These data-points could be treated by replacing the values by the mode value in their corresponding label, but for simplicity we will just drop them in this notebook.

In [None]:
# Remove every row flagged as an outlier. errors="ignore" keeps the cell safe to
# re-run: on a second execution the labels are already gone, and a plain drop
# would raise KeyError.
train_data = train_data.drop(index=list(indicies_set), errors="ignore")

In [None]:
train_data.shape

# Feature Engineering

There are some new features that could be engineered from the features we explored. Let's define them mathematically for now:

    - Diagonal distance to Hydrology.
         - Just by intuition, if we have the horizontal and vertical distances, let's calculate their diagonal distance
    - Mean distance from water, fire, and road.
        - Since water, fire  and road are the main sources of services in the data-set, maybe the average distance to all three can reflect a useful number.
    
    
    
There are some other ideas, but with the lack of description for the data-set it is hard to be more creative about engineering more features.
    - I can see that Vertical distance to Hydrology has something to do with Elevation, but can't get a relation between them.

In [None]:
def diagonal_distance(data,x,y):
    """
    Return the Euclidean length sqrt(data[x]^2 + data[y]^2), computed element-wise.

    `x` and `y` are the keys (column names) of the two perpendicular components.
    """
    squared_sum = np.square(data[x]) + np.square(data[y])
    return np.sqrt(squared_sum)


def mean_value(data,feature_list):
    """
    Return the element-wise arithmetic mean of `data[f]` over every `f` in `feature_list`.

    Raises ZeroDivisionError when `feature_list` is empty.
    """
    total = sum(data[name] for name in feature_list)
    return total / len(feature_list)

In [None]:
# Straight-line distance to hydrology from its two components.
# assumes the first two matched columns are the horizontal and vertical distances (depends on column order) - TODO confirm
Hydrology_distances = get_common_prefix_columns(train_data, 'Hydrology')
train_data['Diagonal_Distance_To_Hydrology'] = diagonal_distance(train_data, Hydrology_distances[0], Hydrology_distances[1])

In [None]:
# Average of all columns matched by 'Horizontal', stored as a new column.
Horizontal_distances = get_common_prefix_columns(train_data, 'Horizontal')
train_data['Mean_Horizontal_Distance'] = mean_value(train_data,Horizontal_distances)

# Modeling

## Data preparation

In [None]:
train_data.shape

In [None]:
# Separate the predictors (X) from the target column (Y).
X = train_data.drop(columns='Cover_Type')
Y = train_data['Cover_Type']
print("train data is of shape {} with {} labels".format(X.shape,Y.shape))

Now, since we agreed to the previous data cleaning, and feature engineering steps, we can build a function to do that transformation and use it over the test data.

In [None]:
def transform_features(data):
    """
    Apply the notebook's cleaning and feature-engineering steps to `data`.

    Drops the identifier / unused soil columns, then (re)builds the two
    engineered columns from the remaining distance columns. An engineered
    column that is already present is discarded first, so that:
      * transforming an already-transformed frame gives the same result (an
        existing mean column is never averaged into itself), and
      * the engineered columns always end up last and in the same order,
        whichever frame is passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw or previously transformed feature frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the cleaned columns plus `Mean_Horizontal_Distance`
        and `Diagonal_Distance_To_Hydrology`.
    """
    engineered = ['Mean_Horizontal_Distance', 'Diagonal_Distance_To_Hydrology']
    # Copy because drop_features hands back its input object when nothing is
    # dropped, and the assignments below must not touch the caller's frame.
    data = drop_features(data, ['Id','Soil_Type7','Soil_Type8','Soil_Type15'] + engineered).copy()

    # Resolve both column lists before adding anything, so the new columns can
    # never be picked up by the name matching.
    Horizontal_distances = get_common_prefix_columns(data, 'Horizontal')
    Hydrology_distances = get_common_prefix_columns(data, 'Hydrology')

    data['Mean_Horizontal_Distance'] = mean_value(data,Horizontal_distances)
    # assumes the first two matches are the horizontal and vertical distance columns (depends on column order) - TODO confirm
    data['Diagonal_Distance_To_Hydrology'] = diagonal_distance(data, Hydrology_distances[0], Hydrology_distances[1])

    return data

## Model Selection

We will use Random Forest as baseline model for this notebook.

In [None]:
def train_predict(clf, X_train, X_valid, y_train):
    """
    Fit `clf` on (X_train, y_train) and return its predictions for X_valid.

    `clf` is fitted in place; any object exposing `fit(X, y)` and `predict(X)` works.
    """
    clf.fit(X_train, y_train)
    return clf.predict(X_valid)

def evaluate(y_true, y_pred):
    """Return the score of `y_pred` against `y_true` as computed by `accuracy_score`."""
    return accuracy_score(y_true, y_pred)

In [None]:
clf = RandomForestClassifier(random_state = 0)
strtfdKFold = StratifiedKFold(n_splits=10)
scores = []

# Apply the shared cleaning / feature-engineering step first, so the folds are
# drawn from the exact frame the model is trained on.
X = transform_features(X)
kfold = strtfdKFold.split(X, Y)

for k, (train, valid) in enumerate(kfold):
    # `train` / `valid` are used as positional row indices, hence .iloc
    y_pred = train_predict(clf, X.iloc[train, :], X.iloc[valid, :], Y.iloc[train])
    score = evaluate(Y.iloc[valid], y_pred)
    scores.append(score)
    print('Fold: %2d,  Accuracy: %.3f' % (k+1, score))

print('\n\nCross-Validation accuracy: %.3f +/- %.3f' %(np.mean(scores), np.std(scores)))

# The loop leaves `clf` fitted on the last fold's training split only. Refit on
# every row so the model used for the test-set predictions has seen all the data.
clf.fit(X, Y);

Using 10 stratified folds to ensure the robustness of the model.

In [None]:
# Read the test CSV, report its (rows, columns) shape and preview the first rows.
test_data = pd.read_csv(os.path.join(data_base_path,test_path))
print(test_data.shape)
test_data.head(5)

## construct submission file

In [None]:
# Keep the test Ids in their own frame before `Id` is dropped from the features.
test_df = test_data[['Id']].copy()
test_df.shape

In [None]:
# Apply the same cleaning / feature-engineering function that was applied to the training features.
test_data = transform_features(test_data)
test_data.shape

In [None]:
y_test = clf.predict(test_data)

In [None]:
# Attach the predictions to the saved Ids and write the submission file without the index column.
test_df['Cover_Type'] = y_test
test_df.to_csv('submission.csv', index = False)