In [1]:
# Third-party packages used by the notebook
import matplotlib.pylab as plt
import numpy as np
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier  

import pandas as pd
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably intended to make the split and the model fits
# repeatable -- confirm the estimators actually draw from the global NumPy RNG.
np.random.seed(42)



In [2]:
# Load the training and test sets, using the 'Id' column as the row index
traincsv = pd.read_csv('train.csv', index_col='Id')
testcsv = pd.read_csv('test.csv', index_col='Id')

# Sanity-check what was loaded. Each print is a single-argument call, so the
# line is valid on both Python 2 and Python 3 and prints the same text.
print('%s %s' % (traincsv.shape, testcsv.shape))
print('%s %s' % (type(traincsv), type(testcsv)))

(15120, 55) (565892, 54)
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [3]:
# Hold out 20% of the labelled rows for validation. The features are every
# column except the last and the target is the last column; both are handed
# to the splitter as NumPy arrays rather than DataFrames.
features = traincsv.iloc[:, :-1].values
target = traincsv.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

# Single-argument print calls: valid on Python 2 and Python 3, same output.
print('%s %s %s %s' % (X_train.shape, X_test.shape, y_train.shape, y_test.shape))
print('%s %s %s %s' % (type(X_train), type(X_test), type(y_train), type(y_test)))

(12096, 54) (3024, 54) (12096,) (3024,)
<type 'numpy.ndarray'> <type 'numpy.ndarray'> <type 'numpy.ndarray'> <type 'numpy.ndarray'>


In [4]:
# Base models: four tree-ensemble families, each fitted on the training split
# with the same three (max_depth, n_estimators) settings:
# (6, 800), (14, 1400) and (10, 1100).
# fit() hands back the fitted estimator, so construction and fitting are chained.

# Random forests
RFC1 = RandomForestClassifier(n_estimators=800, max_depth=6).fit(X_train, y_train)
RFC2 = RandomForestClassifier(n_estimators=1400, max_depth=14).fit(X_train, y_train)
RFC3 = RandomForestClassifier(n_estimators=1100, max_depth=10).fit(X_train, y_train)

# Extremely randomised trees
ETC1 = ExtraTreesClassifier(n_estimators=800, max_depth=6).fit(X_train, y_train)
ETC2 = ExtraTreesClassifier(n_estimators=1400, max_depth=14).fit(X_train, y_train)
ETC3 = ExtraTreesClassifier(n_estimators=1100, max_depth=10).fit(X_train, y_train)

# scikit-learn gradient boosting
GBC1 = GradientBoostingClassifier(n_estimators=800, max_depth=6).fit(X_train, y_train)
GBC2 = GradientBoostingClassifier(n_estimators=1400, max_depth=14).fit(X_train, y_train)
GBC3 = GradientBoostingClassifier(n_estimators=1100, max_depth=10).fit(X_train, y_train)

# XGBoost
XGB1 = XGBClassifier(n_estimators=800, max_depth=6).fit(X_train, y_train)
XGB2 = XGBClassifier(n_estimators=1400, max_depth=14).fit(X_train, y_train)
XGB3 = XGBClassifier(n_estimators=1100, max_depth=10).fit(X_train, y_train)

# Leave the last fitted model as the cell's displayed value
XGB3

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=1100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [5]:
# Data Stacking: every fitted base model contributes its predicted class
# labels for each of the three data sets (training split, hold-out split and
# the competition test file).
estimators = [RFC1, RFC2, RFC3, ETC1, ETC2, ETC3, GBC1, GBC2, GBC3, XGB1, XGB2, XGB3]

# NOTE(review): the base models were fitted on X_train, so their predictions
# on X_train are in-sample, unlike the ones made for X_test and test.csv.
# Out-of-fold predictions are the usual remedy -- confirm whether that matters
# for the model trained on these lists.

# Convert the competition test frame to an array once, not once per estimator
testcsv_features = testcsv.values

# One entry per estimator is appended to each list, in `estimators` order
X_blend_train = []
X_blend_test = []
X_blend_testcsv = []

for est in estimators:
    X_blend_train.append(est.predict(X_train))
    X_blend_test.append(est.predict(X_test))
    X_blend_testcsv.append(est.predict(testcsv_features))

In [None]:
def _as_feature_matrix(preds):
    """Turn a list of per-estimator prediction vectors into a 2-D array.

    A list is stacked and transposed so that each row is one sample and each
    column is one estimator. Anything that is not a list is taken to be
    already converted and is returned unchanged, so running this cell a second
    time does not transpose the arrays back.
    """
    if isinstance(preds, list):
        return np.array(preds).T
    return preds


# State before conversion. Single-argument print calls keep every line valid
# on both Python 2 and Python 3 with identical output.
print('%d %d %d' % (len(X_blend_train), len(X_blend_test), len(X_blend_testcsv)))
print('%s %s %s' % (type(X_blend_train), type(X_blend_test), type(X_blend_testcsv)))

print(X_blend_train)
print(X_blend_test)
print(X_blend_testcsv)

# Transforming the lists which hold the classifier predictions
X_blend_train = _as_feature_matrix(X_blend_train)
X_blend_test = _as_feature_matrix(X_blend_test)
X_blend_testcsv = _as_feature_matrix(X_blend_testcsv)

# State after conversion: len() now counts rows (samples)
print('%d %d %d' % (len(X_blend_train), len(X_blend_test), len(X_blend_testcsv)))
print('%s %s %s' % (X_blend_train.shape, X_blend_test.shape, X_blend_testcsv.shape))
print('%s %s %s' % (type(X_blend_train), type(X_blend_test), type(X_blend_testcsv)))

print('---------------------------------')
print(X_blend_train)
print('---------------------------------')
print(X_blend_test)
print('---------------------------------')
print(X_blend_testcsv)
print('---------------------------------')

12 12 12
<type 'list'> <type 'list'> <type 'list'>
[array([3, 5, 2, ..., 1, 2, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 5, 1, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 2, 1, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 2, 5, 7]), array([3, 5, 2, ..., 2, 5, 7])]
[array([3, 6, 7, ..., 5, 4, 4]), array([6, 6, 1, ..., 5, 4, 4]), array([3, 6, 1, ..., 5, 4, 4]), array([4, 6, 7, ..., 5, 4, 4]), array([4, 6, 1, ..., 5, 4, 4]), array([4, 6, 1, ..., 5, 4, 4]), array([6, 6, 1, ..., 5, 4, 4]), array([6, 6, 1, ..., 5, 4, 4]), array([6, 6, 1, ..., 5, 4, 4]), array([6, 3, 1, ..., 5, 4, 4]), array([6, 3, 1, ..., 5, 4, 4]), array([6, 3, 1, ..., 5, 4, 4])]
[array([2, 2, 2, ..., 3, 3, 3]), array([2, 2, 2, ..., 3, 3, 3]), array([2, 2, 2, ..., 3, 3, 3]), array([2, 2, 2, ..., 3, 3, 3]), array([2, 2, 2, ..., 3, 3, 3]), array([2, 2, 2, ..

In [None]:
# Second-level model: trained on the base models' predictions for the
# training split. It gets its own name instead of rebinding XGB2, because XGB2
# already names one of the base models collected in `estimators`; overwriting
# it would put this model into that list if the stacking cell were run again.
blender = XGBClassifier(max_depth=10, n_estimators=1000).fit(X_blend_train, y_train)

# Score the blend on the 20% hold-out split, which is otherwise never evaluated
holdout_accuracy = metrics.accuracy_score(y_test, blender.predict(X_blend_test))
print('Hold-out accuracy: %.4f' % holdout_accuracy)

# Run predictions on test.csv. The result is built directly as a Series that
# reuses the test file's index, rather than copying the whole test frame just
# to add and then select one column.
test_data = pd.Series(blender.predict(X_blend_testcsv), index=testcsv.index, name='Cover_Type')

# Create the submission csv file
test_data.to_csv('SampleSubmission20', header=True)
print('Complete')