In [None]:
#This Python 3 environment comes with many helpful analytics libraries installed
#It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
#For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


#Input data files are available in the "../input/" directory.
#For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

#Any results you write to the current directory are saved as output.

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/forest-cover-type-prediction/train.csv',
        '../input/forest-cover-type-prediction/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
#df_test1.head()

In [None]:
# Show the dtype pandas inferred for every training column.
df_train.dtypes

In [None]:
# Lift the column-display limit so that describe() shows every column,
# then print summary statistics for the training frame.
pd.options.display.max_columns = None
df_train.describe()

Soil Type 7 and 15 are constant and to be dropped. There are no missing data.
One-hot encoded fields such as Wilderness_Area and Soil_Type can be collapsed into single categorical variables for later analysis.
Rescaling and standardisation needed for some fields.


In [None]:
# Remove the two soil-type indicators reported as constant above, from both frames.
constant_soil_cols = ['Soil_Type7', 'Soil_Type15']
df_train = df_train.drop(columns=constant_soil_cols)
df_test = df_test.drop(columns=constant_soil_cols)



In [None]:
# Drop the row identifier so that the first ten columns are the continuous features.
# Dropping by name with errors='ignore' keeps the cell idempotent: the previous
# positional slice (iloc[:, 1:]) removed one more real feature every time the
# cell was re-run.
# NOTE(review): assumes the leading column of both CSVs is named 'Id' (the
# submission cell reads `Id` from the raw test file) - confirm against the data.
df_train = df_train.drop(columns=['Id'], errors='ignore')
df_test = df_test.drop(columns=['Id'], errors='ignore')

## Correlation Formula
* $r = \dfrac{\sum_i (x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_i (x_i-\bar{x})^2 \; \sum_i (y_i-\bar{y})^2}}$

In [None]:
# Heat map of the pairwise correlations among the first `size` columns.
size = 10
f, ax = plt.subplots(figsize=(10, 8))
corrmat = df_train.iloc[:, :size].corr()
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)

Correlation Values

In [None]:
# Pairs whose correlation magnitude reaches this threshold are collected into corr_list.
threshold = 0.5
corr_list = []

# Pearson correlation matrix of the first `size` columns.
data = df_train.iloc[:, :size]
cols = data.columns
data_corr = data.corr(method='pearson')

In [None]:
# Display the full correlation matrix computed in the previous cell.
data_corr

In [None]:
# Scan the upper triangle of the correlation matrix and keep [r, i, j] for every
# pair that is strongly positive (threshold <= r < 1) or strongly negative
# (r <= -threshold). Appends to corr_list, so re-running the cell adds duplicates.
for i in range(size):
    for j in range(i + 1, size):
        r_ij = data_corr.iloc[i, j]
        strong_positive = r_ij >= threshold and r_ij < 1
        strong_negative = r_ij < 0 and r_ij <= -threshold
        if strong_positive or strong_negative:
            corr_list.append([r_ij, i, j])

In [None]:
# Order the collected pairs by correlation magnitude, strongest first.
s_corr_list = sorted(corr_list, key=lambda entry: abs(entry[0]), reverse=True)

# Report each pair by column name together with its coefficient.
for coef, first, second in s_corr_list:
    print("%s and %s = %.2f" % (cols[first], cols[second], coef))

**SKEWNESS**

In [None]:
# Skewness of each of the first ten columns of the training frame.
df_train.iloc[:, :10].skew()

**Data Visualization**

In [None]:
# One scatter plot per strongly correlated feature pair, coloured by cover type.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and later
# removed; x_vars / y_vars are passed as lists, the documented form.
for v, i, j in s_corr_list:
    sns.pairplot(data=df_train, hue='Cover_Type', height=6, x_vars=[cols[i]], y_vars=[cols[j]])
    plt.show()

In [None]:
# Violin plots (box plot + kernel density) of every feature against the target,
# one figure per feature.
cols = df_train.columns
size = len(cols) - 1  # number of feature columns; the last column is the target
x = cols[size]        # target column goes on the x-axis to separate the classes
y = cols[0:size]      # all feature columns

for feature in y:
    sns.violinplot(data=df_train, x=x, y=feature)
    plt.show()

* Elevation has a separate distribution for each class, hence it is an important attribute for prediction
* The Aspect plot contains a couple of normal distributions for several classes
* Horizontal distance to hydrology and roadways is quite similar
* Hillshade 9am and 12pm display left skew (long tail towards left)
* Wilderness_Area3 gives no class distinction. As values are not present, others give some scope to distinguish
* Soil_Type 1, 5, 8, 9, 12, 14, 18-22, 25-30 and 35-40 offer class distinction as values are not present for many classes
* From the violin graph, Soil_Type8 is present only at Cover_Type 2 and is too negligible to consider.
* Soil_Type25 is present only at Cover_Type 2 and is too negligible to consider.

In [None]:
# Count how many rows carry each value of the Wilderness_Area2 indicator.
df_train.Wilderness_Area2.value_counts()

Attributes like this one, where most values are zero, show class distinction.

In [None]:
### Group one-hot encoded variables of a category into one single variable
def _one_hot_to_id(frame, prefix):
    """Collapse the one-hot columns named ``<prefix><number>`` into one id per row.

    Returns an integer array holding, for each row of ``frame``, the number
    taken from the name of the first column whose value is 1, or 0 when no
    column of the group is set.
    """
    names = [name for name in frame.columns if name.startswith(prefix)]
    if not names:
        return np.zeros(len(frame), dtype=int)
    ids = np.array([int(name[len(prefix):]) for name in names])
    hits = frame[names].to_numpy() == 1
    # argmax gives the first set column; rows with no hit are masked to 0.
    return np.where(hits.any(axis=1), ids[hits.argmax(axis=1)], 0)


cols = df_train.columns
r, c = df_train.shape

# One row per sample: wilderness-area id, soil-type id, and the target (last column).
# The ids are read from the column names rather than computed from column
# positions: Soil_Type7 and Soil_Type15 were dropped earlier, so a positional
# offset mislabels every soil type from 8 upwards. Cast to object to keep the
# dtype the frame had when it was filled row by row.
new_data = pd.DataFrame({
    'Wilderness_Area': _one_hot_to_id(df_train, 'Wilderness_Area'),
    'Soil_Type': _one_hot_to_id(df_train, 'Soil_Type'),
    'Cover_Type': df_train.iloc[:, c - 1].to_numpy(),
}).astype(object)

# plot for category1
sns.countplot(x='Wilderness_Area', hue='Cover_Type', data=new_data)
plt.show()

# Plot for category2
# Note: plt.rc changes the default figure size for every later figure as well.
plt.rc("figure", figsize=(25, 10))
sns.countplot(x='Soil_Type', hue='Cover_Type', data=new_data)
plt.show()

Wilderness_Area4 has lot of presence of cover_type 4, good class distinction
SoilType 1-6,9-13,15, 20-22, 27-31,35,36-38 offer lot of class distinction as counts for some are very high

**Data Preparation**
* Check for Data Transformation

Some of the soil_types are present in very few cover_types.

In [None]:
# Print the value counts of every column from position 10 up to (but excluding)
# the last one, i.e. the one-hot wilderness-area and soil-type indicators.
for col_name in df_train.columns[10:df_train.shape[1] - 1]:
    print (df_train[col_name].value_counts())

In [None]:
# Drop the two soil types judged negligible in the violin plots.
negligible_soil_cols = ['Soil_Type8', 'Soil_Type25']
df_train = df_train.drop(columns=negligible_soil_cols)
df_test = df_test.drop(columns=negligible_soil_cols)

# Second names for the frames used by algorithms such as SVM that need
# normalisation and StandardScaler.
# NOTE(review): plain assignment does not copy. df_train1/df_test1 are the same
# objects as df_train/df_test, so every in-place column transform applied through
# one name is also visible through the other.
df_train1 = df_train
df_test1 = df_test

**Normality**
* (Needed only for few ML algorithms like SVM)

In [None]:
# Checking for data transformation (take only non-categorical values):
# skewness of the first ten columns, to decide which need transforming.
df_train.iloc[:,:10].skew()

Data transformation is needed for the skewed features examined in the cells below.

In [None]:
from scipy import stats

# Histogram with a fitted normal curve, followed by a probability plot
# (normal by default) for Horizontal_Distance_To_Hydrology.
column_values = df_train1['Horizontal_Distance_To_Hydrology']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
fig = plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

Positive Skewness. Square root or log transformation is required.

In [None]:
# Square-root transform to reduce the positive skew. The column is overwritten
# in place, so re-running this cell applies the square root a second time.
df_train1['Horizontal_Distance_To_Hydrology'] = np.sqrt(df_train1['Horizontal_Distance_To_Hydrology'])

In [None]:
# Same two plots again, now on the square-root-transformed
# Horizontal_Distance_To_Hydrology.
column_values = df_train1['Horizontal_Distance_To_Hydrology']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
fig = plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

In [None]:
# Vertical_Distance_To_Hydrology: histogram with fitted normal curve, then a
# probability plot.
column_values = df_train1['Vertical_Distance_To_Hydrology']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
fig = plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

Positive skewness

In [None]:
# Horizontal_Distance_To_Roadways: histogram with fitted normal curve, then a
# probability plot.
column_values = df_train1['Horizontal_Distance_To_Roadways']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
fig = plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

Shows Positive skewness

In [None]:
# Square-root transform to reduce the positive skew. The column is overwritten
# in place, so re-running this cell applies the square root a second time.
df_train1['Horizontal_Distance_To_Roadways'] = np.sqrt(df_train1['Horizontal_Distance_To_Roadways'])

In [None]:
# Horizontal_Distance_To_Roadways after the square-root transform: histogram
# with fitted normal curve, then a probability plot.
column_values = df_train1['Horizontal_Distance_To_Roadways']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
fig = plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

In [None]:
# Hillshade_9am: histogram with fitted normal curve, then a probability plot.
column_values = df_train1['Hillshade_9am']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

Negative Skewness detected.
Performing square transform.

In [None]:
# Square transform to reduce the negative skew. The result is written to
# df_train1, like every other transform cell, instead of df_train. The column is
# overwritten in place, so re-running this cell squares it a second time.
df_train1['Hillshade_9am'] = np.square(df_train1['Hillshade_9am'])

In [None]:
# Hillshade_9am after the square transform: histogram with fitted normal curve,
# then a probability plot. Both plots now read from df_train1; previously the
# histogram read df_train while the probability plot read df_train1.
plt.figure(figsize = (8,6))
sns.distplot(df_train1['Hillshade_9am'], fit = stats.norm)
fig = plt.figure(figsize = (8,6))
res = stats.probplot(df_train1['Hillshade_9am'], plot = plt)

Reasonable improvement is seen

In [None]:
# Hillshade_Noon: histogram with fitted normal curve, then a probability plot.
column_values = df_train1['Hillshade_Noon']
fig = plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
fig = plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

Negative skewness present

In [None]:
# Square transform to reduce the negative skew. The column is overwritten in
# place, so re-running this cell squares it a second time.
df_train1['Hillshade_Noon'] = np.square(df_train1['Hillshade_Noon'])

In [None]:
# Hillshade_Noon after the square transform: histogram with fitted normal curve,
# then a probability plot.
column_values = df_train1['Hillshade_Noon']
fig = plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
fig = plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)


In [None]:
# Horizontal_Distance_To_Fire_Points: histogram with fitted normal curve, then a
# probability plot.
column_values = df_train1['Horizontal_Distance_To_Fire_Points']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

Shows positive skewness

In [None]:
# Square-root transform to reduce the positive skew. The column is overwritten
# in place, so re-running this cell applies the square root a second time.
df_train1['Horizontal_Distance_To_Fire_Points'] = np.sqrt(df_train1['Horizontal_Distance_To_Fire_Points'])

In [None]:
# Horizontal_Distance_To_Fire_Points after the square-root transform: histogram
# with fitted normal curve, then a probability plot.
column_values = df_train1['Horizontal_Distance_To_Fire_Points']
plt.figure(figsize=(8, 6))
sns.distplot(column_values, fit=stats.norm)
plt.figure(figsize=(8, 6))
res = stats.probplot(column_values, plot=plt)

Determined significant improvement in the plot.

In [None]:
# Apply to the test frame the same square-root transform that the three
# distance columns received in training (used for algorithms like SVM).
sqrt_columns = [
    'Horizontal_Distance_To_Hydrology',
    'Horizontal_Distance_To_Fire_Points',
    'Horizontal_Distance_To_Roadways',
]
df_test1[sqrt_columns] = np.sqrt(df_test1[sqrt_columns])

In [None]:
# Apply to the test frame the same square transform that the two hillshade
# columns received in training (used for algorithms like SVM).
square_columns = ['Hillshade_9am', 'Hillshade_Noon']
df_test1[square_columns] = np.square(df_test1[square_columns])


## **Test and Train Data**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Non-categorical variables only: the first `Size` columns of each frame.
Size = 10
X_train_temp = df_train.iloc[:,:Size]
X_test_temp = df_test.iloc[:,:Size]
X_train_temp1 = df_train1.iloc[:,:Size]
X_test_temp1 = df_test1.iloc[:,:Size]

# Standardise the SVM inputs. The scaler is fitted on the training data only and
# then applied to both sets, so train and test share the same mean/variance.
# Fitting a second scaler on the test set (as before) puts the two sets on
# different scales and leaks test statistics into preprocessing.
scaler = StandardScaler().fit(X_train_temp1)
X_train_temp1 = scaler.transform(X_train_temp1)
X_test_temp1 = scaler.transform(X_test_temp1)

In [None]:
# Display the whole transformed training frame (every row and column selected).
df_train1.iloc[:,:]

In [None]:
# Assemble the design matrices: continuous block followed by the indicator
# columns (everything from `Size` up to, but excluding, the final target column).
r, c = df_train.shape
indicator_block = df_train.iloc[:, Size:c - 1]
indicator_block1 = df_train1.iloc[:, Size:c - 1]

X_train = np.concatenate((X_train_temp, indicator_block), axis=1)
X_train1 = np.concatenate((X_train_temp1, indicator_block1), axis=1)  # scaled variant, to be used for SVM
y_train = df_train['Cover_Type'].values

## ML Algorithm



## Support Vector Machine
* SVM can be used for both regression and Classification problems
* SVM creates a hyperplane (or a line) that separates the data by class.
* It then determines on which side of that boundary each test point falls.

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
#In the new version these are in the model_selection module. Use this: from sklearn.model_selection import learning_curve, GridSearchCV.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Hold out 20% of the scaled training data; the grid search below runs on the rest.
x_data, x_test_data, y_data, y_test_data = train_test_split(X_train1,y_train,test_size=0.2, random_state=123)
# RBF kernel only; candidate values for the regularisation parameter C.
# The list previously contained 100 twice, so that candidate was fitted and
# cross-validated twice for no benefit.
# NOTE(review): 1000 may have been the intended fourth value - confirm.
svm_para = [{'kernel': ['rbf'], 'C': [1, 10, 100]}]

* We tried two kernels: polynomial and radial basis function. Result obtained with RBF got us a score of 0.49844 in Kaggle, while linear got us 0.4718. Flipping a coin has a better result...
* RBF, or radial basis function, is the Gaussian kernel

In [None]:
# 3-fold grid search over svm_para with a default SVC; verbose=2 logs every fit.
classifier = GridSearchCV(estimator=svm.SVC(), param_grid=svm_para, cv=3, verbose=2)
classifier.fit(x_data, y_data)
# Show the winning parameter combination (classifier.best_score_ holds its CV score).
classifier.best_params_

In [None]:
# Re-run the whole grid search on the full scaled training set, then report the
# accuracy on that same data (a training score, not a held-out estimate).
# An earlier alternative, kept here for reference, fitted a single
# svm.SVC(C=10, kernel='rbf') on X_train1/y_train instead.
classifier.fit(X_train1, y_train).score(X_train1, y_train)

* For data that is almost linearly separable, SVMs can still be made to work well by choosing the right value for C.
* SVMs work well with linearly separable data.
* Data that is not linearly separable can be projected into a higher-dimensional space where it becomes linearly separable (e.g. from 1 dimension to 3 or 4).

In [None]:
# Best mean cross-validation score from the most recent grid-search fit
# (the refit on the full training set in the cell above).
classifier.best_score_

In [None]:
# Per-candidate cross-validation details from the most recent grid-search fit.
classifier.cv_results_

Clearly, if we are using a linear classifier, we are never going to be able to perfectly separate the labels. We also don’t want to discard the linear classifier altogether because it does seem like a good fit for the problem except for a few errant points.
## How do SVMs deal with this? They allow you to specify how many errors you are willing to accept.

You can provide a parameter called “C” to your SVM; this allows you to dictate the tradeoff between:
Having a wide margin.
Correctly classifying training data. A higher value of C implies you want fewer errors on the training data.
It bears repeating that this is a tradeoff. You get better classification of training data at the expense of a wide margin.
## How SVM finds a Classifier?
* Case 1: If there are n points in our dataset,the SVM needs only the dotproduct of each pair of points to find a classifier.
* Case 2: If we want to project data to higher dimensions, SVM takes two points and gives the dot products in a projected space.

## So I project the data first and then run the SVM?
No. To make the above example easy to grasp I made it sound like we need to project the data first. The fact is you ask the SVM to do the projection for you. 

## Kernels
Kernels do the above calculation with less computation.
* Instead of first calculating the projection and then the dot product, which needs 13 steps in the case of 2 data points, the kernel first calculates the dot product and then squares it, $K(x_1, x_2) = (x_1 \cdot x_2)^2$, which is done in 4 steps.

### Linear Kernel
* When we don't use a projection, but compute only a dot product.
### Polynomial Kernel
### RBF Kernel
* Radial Basis Function (RBF) kernel
* When you want to project to infinite dimensions
* think about how we compute sums of infinite series. This is similar. There are infinite terms in the dot product, but there happens to exist a formula to calculate their sum.

## To summarize:
* We typically don’t define a specific projection for our data. Instead, we pick from available kernels, tweaking them in some cases, to find one best suited to the data.
* Of course, nothing stops us from defining our own kernels, or performing the projection ourselves, but in many cases we don’t need to. Or we at least start by trying out what’s already available.
* If there is a kernel available for the projection we want, we prefer to use the kernel, because that’s often faster.
* RBF kernels can project points to infinite dimensions.


In [None]:
#y_pred = classifier.predict(df_test1)

In [None]:
# Re-read the raw test CSV: the submission cell takes the `Id` column from this
# frame, because the first column of df_test was sliced off earlier.
# Note the capital T: df_Test1 is a different object from df_test1.
df_Test1 = pd.read_csv('../input/forest-cover-type-prediction/test.csv')

In [None]:
#solution = pd.DataFrame({'Id':df_Test1.Id, 'Cover_Type':y_pred}, columns = ['Id','Cover_Type'])
#solution.to_csv('SVMcover_sol.csv', index=False)

* The higher the gamma value, the more closely the model tries to fit the training data set exactly.


## Extra Tree Classifier

In an Extra Trees classifier, the features and splits are selected at random; hence, “Extremely Randomized Tree”. Since splits are chosen at random for each feature in the Extra Trees Classifier, it’s less computationally expensive than a Random Forest.

It is very similar to a Random Forest Classifier and only differs from it in the manner of construction of the decision trees in the forest.

Each Decision Tree in the Extra Trees Forest is constructed from the original training sample. Then, at each test node, each tree is provided with a random sample of k features from the feature-set, from which each decision tree must select the best feature to split the data based on some mathematical criteria (typically the Gini Index). This random sample of features leads to the creation of multiple de-correlated decision trees.




In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
# Search space for the Extra Trees grid search.
# sklearn defaults for reference: max_features is sqrt(n), min_samples_leaf is 1.
etc_para = [{
    'n_estimators': [20, 30, 100],
    'max_depth': [5, 10, 15],
    'max_features': [0.1, 0.2, 0.3],
}]

# Hold out 30% of the unscaled training data for the classification report.
x_data, x_test_data, y_data, y_test_data = train_test_split(
    X_train, y_train, test_size=0.3, random_state=0
)

In [None]:
# 10-fold grid search over etc_para on all available cores.
# random_state fixes the forest's randomness so that the selected parameters and
# scores are reproducible from run to run (0 matches the split above).
ETC = GridSearchCV(ExtraTreesClassifier(random_state=0), param_grid=etc_para, cv=10, n_jobs=-1)
ETC.fit(x_data, y_data)
# Only the last expression of a cell is displayed, so the bare
# `ETC.best_params_` line that used to sit here was a no-op and has been removed.
ETC.best_score_

In [None]:
# Report the best cross-validation score and the parameters that achieved it,
# one tab-indented "name:value" line per parameter.
summary_lines = ['Best accuracy obtained: {}'.format(ETC.best_score_), 'Parameters:']
summary_lines.extend(
    '\t{}:{}'.format(name, setting) for name, setting in ETC.best_params_.items()
)
print('\n'.join(summary_lines))

In [None]:
# Per-class precision/recall/F1 of the tuned model on the held-out 30% split.
target = ['class1', 'class2','class3','class4','class5','class6','class7' ]
Y_pred = ETC.predict(x_test_data)
report_text = classification_report(y_test_data, Y_pred, target_names=target)
print (report_text)

# Learning Curve

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(model,title, X, y,n_jobs = 1, ylim = None, cv = None,train_sizes = np.linspace(0.1, 1, 5)):
    """Draw training and cross-validation score curves for ``model``.

    Parameters
    ----------
    model : estimator passed straight to ``learning_curve``.
    title : figure title.
    X, y : feature matrix and targets passed to ``learning_curve``.
    n_jobs : parallelism forwarded to ``learning_curve``.
    ylim : optional ``(low, high)`` pair applied to the y-axis.
    cv : cross-validation strategy forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, with the new figure as the current figure.
    """
    # Figure parameters
    plt.figure(figsize=(10, 8))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training Examples')
    plt.ylabel('Score')

    train_sizes, train_score, test_score = learning_curve(
        model, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    # (mean, std, colour, legend label) for each curve; statistics are taken
    # across the CV splits (axis 1).
    curves = [
        (np.mean(train_score, axis=1), np.std(train_score, axis=1), 'r', "Training score"),
        (np.mean(test_score, axis=1), np.std(test_score, axis=1), 'g', "Cross-validation score"),
    ]

    plt.grid()
    # Shaded +/- one standard deviation bands first, then the mean lines on top.
    for mean, std, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
# Final Extra Trees model, fitted on the full unscaled training set.
# Grid-search result noted by the author:
# 'max_features': 0.3, 'n_estimators': 100, 'max_depth': 15, 'min_samples_leaf: 1'
# NOTE(review): the note above says max_depth 15 but the model below uses 10 -
# confirm which is intended.
# random_state is fixed so that the fitted forest, its score and the submission
# file are reproducible from run to run.
etc = ExtraTreesClassifier(bootstrap=True, oob_score=True, n_estimators=100, max_depth=10, max_features=0.3, \
                           min_samples_leaf=1, random_state=0)

etc.fit(X_train, y_train)
# Accuracy on the training data itself (optimistic; the out-of-bag estimate is
# available as etc.oob_score_).
etc.score(X_train, y_train)

In [None]:
# Build the test design matrix (continuous block + all remaining columns),
# predict with the final Extra Trees model and write the submission file.
r, c = df_test.shape
test_indicator_block = df_test.iloc[:, Size:c]
X_test = np.concatenate((X_test_temp, test_indicator_block), axis=1)

yy_pred = etc.predict(X_test)

# Ids come from the re-read raw test file, since df_test no longer has them.
solution = pd.DataFrame(
    {'Id': df_Test1.Id, 'Cover_Type': yy_pred},
    columns=['Id', 'Cover_Type'],
)
solution.to_csv('ETCcover_sol.csv', index=False)

In [None]:
# Learning curve for the final Extra Trees model.
# 50 random 80/20 splits give a smoother curve than a handful of folds.
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)
title = 'Learning Curve (ExtraTreeClassifier)'
model = etc
plot_learning_curve(model, title, X_train, y_train, ylim=None, cv=cv, n_jobs=-1)
plt.show()