First attempt at classifying tree types

**Data Input**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files live in the "../input/" directory.
# Shell out to `ls` for the training CSV and echo the listing to the cell output;
# check_output raises if the command exits with a non-zero status.

from subprocess import check_output

train_path = "../input/train.csv"
listing = check_output(["ls", train_path])
print(listing.decode("utf8"))

## Read the training set into a DataFrame
## Columns should all be int64
train = pd.read_csv(train_path)

# Any results you write to the current directory are saved as output.

**QC of input data** 

 - Confirm number of rows/columns
 - Confirm data types (int64)

In [None]:
# Report (rows, columns) first, then the dtype of every column, one item per line
print(train.shape, train.dtypes, sep="\n")

**Statistical Exploration** 

 - Count
 - Mean
 - Std
 - Min
 - 25%
 - 50%
 - 75%
 - Max

In [None]:
## Lift the column display limit so the summary is not truncated
pd.options.display.max_columns = None
## count / mean / std / min / quartiles / max for every column
summary_stats = train.describe()
print(summary_stats)

**Soil Type 7 and 15 have no values, and can be removed**

In [None]:
# Remove the row identifier and the two soil-type indicators flagged above.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally
# (the old `drop('Id', 1)` form raises TypeError there).
# NOTE: re-running this cell raises KeyError because the columns are already gone.
train = train.drop(['Id', 'Soil_Type7', 'Soil_Type15'], axis=1)

In [None]:
# Shape after the column drop, followed by the remaining column names
print(train.shape)
print(train.columns.tolist())

**Basic Statistics 2**

 - Correlation

In [None]:
# Number of leading features to examine
size = 10
# Restrict the frame to those leading columns and remember their names
data = train.iloc[:, :size]
cols = data.columns
# Pairwise Pearson correlation matrix
data_corr = data.corr()
# Minimum absolute correlation for a pair to be reported
threshold = 0.5

# Walk the upper triangle (j > i) so each pair is visited once.
# A pair qualifies when its correlation is in [threshold, 1) or is <= -threshold;
# values of exactly 1 are skipped, as in the original condition.
corr_list = []
for i in range(size):
    for j in range(i + 1, size):
        coef = data_corr.iloc[i, j]
        strongly_positive = threshold <= coef < 1
        strongly_negative = coef < 0 and coef <= -threshold
        if strongly_positive or strongly_negative:
            # keep the coefficient together with both column indexes
            corr_list.append([coef, i, j])

# Strongest relationships (by absolute value) first
s_corr_list = sorted(corr_list, key=lambda entry: abs(entry[0]), reverse=True)

# Report each pair by column name
for v, i, j in s_corr_list:
    print("%s and %s = %.2f" % (cols[i], cols[j], v))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of only the highly correlated pairs, coloured by cover type.
# `height` is the facet-size keyword since seaborn 0.9; it replaced `size`,
# which newer seaborn releases reject as an unexpected keyword argument.
for v, i, j in s_corr_list:
    sns.pairplot(train, hue="Cover_Type", height=5, x_vars=[cols[i]], y_vars=[cols[j]])
    plt.show()

**Box/Density Plot Examples**

In [None]:
cols = train.columns
# Every column except the last one is an attribute; the last is the target
size = len(cols) - 1
# Target column goes on the x-axis so each class gets its own violin
x = cols[-1]
# Attribute columns supply the y-axis, one figure per attribute
y = cols[:size]
for attribute in y:
    sns.violinplot(data=train, x=x, y=attribute)
    plt.show()

Elevation shows some variance between land cover types. 
Soil types 1, 5, 8, 9, 12, 14, 18-22, 25-30 and 35-40 offer class distinction, as they are not present for many of the cover types.

**Data Prep**
 - StandardScaler 
 - MinMaxScaler 
 - Normalizer

In [None]:
import warnings
# Silence library warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn
    from sklearn.cross_validation import train_test_split
# Imputer is not used below (no data is missing). It was removed from
# sklearn.preprocessing in scikit-learn 0.22, so the import must not be fatal.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    pass
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

# Number of rows and columns
r, c = train.shape
# Column labels (the last one is the target)
cols = train.columns
# Positional indexes of the feature columns (everything except the last column)
i_cols = list(range(c - 1))
# Importance rank of all features (left empty in this cell)
ranks = []
# Work on the raw values
array = train.values
# Y is the target column, X has the rest
X = array[:, 0:(c - 1)]
Y = array[:, (c - 1)]
# Fraction of rows held out for validation
val_size = 0.1
# Use a common seed in all experiments so that the same chunk is used for validation
seed = 0
# Split the data into training and validation chunks
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=val_size, random_state=seed)

# Every prepared version of the data, one list entry per transformation:
# [transform label, subset label, X_train, X_val, fraction, feature names,
#  dropped columns, ranks, feature indexes, dropped indexes]
# NOTE: all entries share the same rem / ranks / i_cols / i_rem list objects.
X_all = []
# Additionally we will make a list of subsets
X_all_add = []
# Columns to be dropped
rem = []
# Indexes of columns to be dropped
i_rem = []
# List of combinations
comb = []
comb.append("All+1.0")
# Untransformed data
X_all.append(['Orig', 'All', X_train, X_val, 1.0, cols[:c-1], rem, ranks, i_cols, i_rem])

# Point where categorical data begins: the first `size` columns are transformed,
# the remaining columns are passed through unchanged.
size = 10
for label, transformer in (('StdSca', StandardScaler()),
                           ('MinMax', MinMaxScaler()),
                           ('Norm', Normalizer())):
    # Learn the transform on the training rows only, then apply that same
    # fitted transform to the validation rows so both are on one scale.
    X_temp = transformer.fit_transform(X_train[:, 0:size])
    X_val_temp = transformer.transform(X_val[:, 0:size])
    # Concatenate transformed non-categorical data with the untouched categorical data
    X_con = np.concatenate((X_temp, X_train[:, size:]), axis=1)
    X_val_con = np.concatenate((X_val_temp, X_val[:, size:]), axis=1)
    # Feature names exclude the target, matching the 'Orig' entry
    X_all.append([label, 'All', X_con, X_val_con, 1.0, cols[:c-1], rem, ranks, i_cols, i_rem])

# Impute
# Imputer is not used as no data is missing

# Labels of the transformations, in the order they were added.
# Built without tuple-unpacking into X / X_val so those names keep
# referring to the full feature matrix and the validation split.
trans_list = [entry[0] for entry in X_all]