In [None]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas, seaborn
# and matplotlib - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = "../input/forest-cover-type-prediction/train.csv"
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Remove the 'Id' column from the training frame.
# Reassigning the result (rather than mutating in place) together with
# errors='ignore' keeps this cell idempotent: running it a second time is a
# no-op instead of a KeyError for the already-missing column.
train = train.drop(columns='Id', errors='ignore')
train.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [None]:
# Data type of every column in the training frame.
train.dtypes

### Data Exploration 
#### Feature Statistics
* Part 1 : Describe numerical features
* Part 2 : Describe binary/categorical features

In [None]:
# Split the columns of `train` by position:
#   * the first ten columns are treated as the numerical features;
#   * everything after them, except the final column, is treated as the
#     binary features.
# NOTE(review): the positional split assumes the 'Id' column was already
# dropped and that the last column is the target - confirm against train.dtypes.
n_numeric_columns = 10
num_features = train.iloc[:, :n_numeric_columns]
cat_features = train.iloc[:, n_numeric_columns:-1]

In [None]:
# Summary statistics for the numerical feature columns.
num_features.describe()

In [None]:
# Summary statistics for the binary feature columns.
cat_features.describe()

In [None]:
# Per-column skewness of the training frame, kept both as a Series (`skew`)
# and as a one-column DataFrame (`skew_df`) for plotting.
skew = train.skew()
skew_df = skew.to_frame(name='Skewness')

In [None]:
# Plain-text dump of the per-column skewness values.
print(skew)

In [None]:
# Bar chart of the skewness of every column; column names go on the x axis.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=skew_df, x=skew_df.index, y='Skewness', ax=ax)
# Rotate the column names so they do not overlap; the return value is bound
# to a name only to keep the tick objects out of the cell output.
var = plt.xticks(rotation=90)

In [None]:
# Number of rows per Cover_Type value.
train.groupby('Cover_Type').size()

In [None]:
# Horizontal box plots: one box per numerical feature, showing its spread.

sns.set_style('whitegrid')

plt.subplots(figsize=(15, 7))
color = sns.color_palette('pastel')
sns.boxplot(data=num_features, orient='h', palette=color)
plt.title('Spread of Data in Numerical Features', size=18)
# With orient='h' the x axis carries the feature values themselves, not a
# count of observations, so it is labelled accordingly.
plt.xlabel('Feature Value', size=16)
plt.ylabel('Features', size=16)

plt.xticks(size=12)
plt.yticks(size=12)

sns.despine()
plt.show()

In [None]:
# Split the binary features by position: the first four columns go to
# `wild_data`, the remaining ones to `soil_data`.
wild_data, soil_data = cat_features.iloc[:, :4], cat_features.iloc[:, 4:]

# The rc override must use matplotlib's dotted key name ('grid.color');
# keys that are not style parameters are ignored by seaborn.
sns.set_style('darkgrid', {'grid.color': '1'})
flatui = ['#e74c3c', '#34495e', '#2ecc71', '#3498db']

# NOTE(review): `platette` is not used in this cell (name kept in case a
# later cell refers to it).
platette = sns.color_palette(flatui)

# Column sums of the 0/1 indicators give the number of observations per area.
wild_data.sum().plot(kind='bar', figsize=(15, 7), color='#34a028')
# Call plt.title(); assigning to it would replace the pyplot function with a
# string and break every later plt.title(...) call in the session.
plt.title('# of Observations of Wilderness Areas')
plt.xlabel('Wilderness Area', size=16)
plt.ylabel('# of Observations', size=16)

plt.xticks(rotation='horizontal', size=12)
plt.yticks(size=12)

plt.show()

In [None]:
# Column sums of the wilderness-area columns.
wild_data.sum()

In [None]:
# Bar chart of the column sums of the soil-type columns.
sns.set_style('darkgrid', {'grid.color': '.1'})

soil_ax = soil_data.sum().plot(kind='bar', figsize=(20, 12), color='#a87539')
# Title and labels are set on the Axes returned by pandas, so this cell does
# not depend on the state of the pyplot module-level helpers.
soil_ax.set_title('# of Observations of Soil Types')
soil_ax.set_xlabel('Soil Types', size=16)
soil_ax.set_ylabel('# of Observations', size=16)
plt.xticks(rotation=90, size=14)
plt.yticks(size=14)

sns.despine()
plt.show()

In [None]:
# Summary statistics for the single column 'Soil_Type10'.
soil_data['Soil_Type10'].describe()

In [None]:
# Horizontal bar chart of the soil-type column sums, largest first.
# The rc override must use matplotlib's dotted key name ('grid.color');
# keys that are not style parameters are ignored by seaborn.
sns.set_style('darkgrid', {'grid.color': '.1'})

# DataFrame.sum() already returns a Series; sort it without mutating in place.
soil_sum = soil_data.sum().sort_values(ascending=False)

soil_sum_ax = soil_sum.plot(kind='barh', figsize=(22, 12), color="#a87539")
# barh draws the first entry at the bottom; flip the axis so the largest
# count appears at the top.
soil_sum_ax.invert_yaxis()
soil_sum_ax.set_title('# of Observations of Soil Types', size=18)
soil_sum_ax.set_xlabel('# of Observations', size=16)
soil_sum_ax.set_ylabel('Soil Types', size=16)

# 'rotation' (not 'roation') is the Text property; the misspelled keyword
# is rejected by matplotlib and aborts the cell.
plt.xticks(rotation='horizontal', size=12)
plt.yticks(size=12)

sns.despine()
plt.show()

In [None]:
# Violin plots: one figure per numerical feature, with the feature on the
# y axis and Cover_Type on the x axis.

# The rc override must use matplotlib's dotted key name ('grid.color');
# keys that are not style parameters are ignored by seaborn.
sns.set_style('darkgrid', {'grid.color': '.1'})

target = train['Cover_Type']

features = num_features.columns

for feature in features:
    plt.subplots(figsize=(15, 7))
    sns.violinplot(data=num_features, x=target, y=feature)
    plt.xticks(size=14)
    plt.yticks(size=14)
    plt.xlabel('Forest Cover Types', size=16)
    plt.ylabel(feature, size=16)

    plt.show()
    

#### Violin Plot 4.2 Wilderness Area Inferences:
* Wilderness_Area1 belongs to forest Cover_Type1, Cover_Type2, and Cover_Type5
* Wilderness_Area3 belongs to all classes except Cover_Type4.
* Wilderness_Area2 and Wilderness_Area4 have the fewest observations; their density at 1 is lower across all classes than that of Wilderness_Area1 and Wilderness_Area3.

In [None]:
# Violin plots: one figure per wilderness-area column, with the column on
# the y axis and Cover_Type on the x axis.

# The rc override must use matplotlib's dotted key name ('grid.color');
# keys that are not style parameters are ignored by seaborn.
sns.set_style('darkgrid', {'grid.color': '.1'})

# set target variable
target = train['Cover_Type']

# features to be compared to the target variable
features = wild_data.columns

for feature in features:

    plt.subplots(figsize=(13, 9))
    sns.violinplot(data=wild_data, x=target, y=feature)
    plt.xticks(size=14)
    plt.yticks(size=16)
    plt.xlabel('Forest Cover Types', size=16)
    plt.ylabel(feature, size=16)

    plt.show()


In [None]:
# Violin plots: one figure per soil-type column, with the column on the
# y axis and Cover_Type on the x axis.

# The rc override must use matplotlib's dotted key name ('grid.color');
# keys that are not style parameters are ignored by seaborn.
sns.set_style('darkgrid', {'grid.color': '.1'})

target = train['Cover_Type']

features = soil_data.columns

for feature in features:

    plt.subplots(figsize=(13, 9))
    sns.violinplot(data=soil_data, x=target, y=feature)
    # Same label size as the other violin-plot cells in this notebook.
    plt.xlabel('Forest Cover Types', size=16)
    plt.ylabel(feature, size=16)
    plt.xticks(size=14)
    plt.yticks(size=14)

    plt.show()

#### Feature Correlation
Part of our data is binary. A correlation matrix requires continuous data, so we will exclude binary data.

* Features with little or no correlation are colored black.
* Features with positive correlation are colored orange.
* Features with negative correlation are colored blue.

In [None]:
# Heatmap of the pairwise correlations between the numerical features.
plt.subplots(figsize=(15, 10))

num_features_corr = num_features.corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric. The builtin `bool` is used as the dtype: the `np.bool`
# alias no longer exists in current NumPy releases.
mask = np.zeros_like(num_features_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(
    num_features_corr,
    mask=mask,
    center=0,          # diverging colors centered on zero correlation
    square=True,
    annot=True,
    annot_kws={'size': 15},
    cbar_kws={'shrink': .8},
)
plt.xticks(size=12)
plt.yticks(size=12)

plt.show()

#### Scatterplots of Features with Correlation Greater Than 0.5

Let's look at the feature pairs whose correlation is greater than 0.5. These are the pairs with a strong positive correlation.

In [None]:
# Scatter plots of selected feature pairs, colored by Cover_Type.

# The rc override must use matplotlib's dotted key name ('grid.color');
# keys that are not style parameters are ignored by seaborn.
sns.set_style('darkgrid', {'grid.color': '.1'})

# paired features with positive correlation
list_data_corr = [['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'],
                  ['Elevation', 'Horizontal_Distance_To_Roadways'],
                  ['Aspect', 'Hillshade_3pm'],
                  ['Hillshade_3pm', 'Hillshade_Noon']]

for x_feature, y_feature in list_data_corr:
    plt.subplots(figsize=(15, 12))
    sns.scatterplot(data=train, x=x_feature, y=y_feature,
                    hue='Cover_Type', legend='full', palette='rainbow_r')
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.xlabel(x_feature, size=12)
    plt.ylabel(y_feature, size=12)

    plt.show()
              
                   