In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# train_test_split lives in sklearn.model_selection; sklearn.cross_validation
# is only kept as a fallback for very old scikit-learn installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import keras
from keras.layers import *
%matplotlib inline

In [None]:
# Read the training file into `df` and the competition test file into `testset`.
df, testset = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Preview the first rows of the test frame.
testset.head()

Save the ID column of the testset, which is used at submission.

In [None]:
# Keep the test-set Id values; the submission frame built at the end needs them.
ids = testset.loc[:, 'Id']

Remove Id column as it has no significance.

In [None]:
# Remove the Id column from both frames so it is not fed to the models as a feature.
df = df.drop(labels=['Id'], axis='columns')
testset = testset.drop(labels=['Id'], axis='columns')

In [None]:
# Print the column summary of the training frame.
df.info()

Check for null values. Looks like we have no null values.

In [None]:
# Count missing values per column in the training frame.
df.isnull().sum()

In [None]:
# Count missing values per column in the test frame.
testset.isnull().sum()

Let's check whether any column sums to zero, i.e. is never set and therefore carries no information.

In [None]:
# Per-column totals of the training frame; a total of 0 means the column is never set.
df.sum()

In [None]:
# Per-column totals of the test frame, for comparison with the training totals.
testset.sum()

So, soil types 7 and 15 have no values in them: none of the 7 cover types has a soil type of 7 or 15. Let's drop these two columns from the list, as they have no real use.

In [None]:
# Drop Soil_Type7 and Soil_Type15 from the training and test frames alike,
# so both keep the same set of feature columns.
df, testset = (
    frame.drop(labels=['Soil_Type7', 'Soil_Type15'], axis='columns')
    for frame in (df, testset)
)


Soil types 8 and 25 have only one observation each, so let's check what those are.

In [None]:
# Show the training rows where Soil_Type8 is set.
df[df['Soil_Type8'] == 1]

In [None]:
# Show the training rows where Soil_Type25 is set.
df[df['Soil_Type25'] == 1]

So, soil types 8 and 25 both occur only with cover type 2, and I think we could remove these columns as well. But first, let's run our model with these two columns still included.

I want to try neural networks on this data, since we have a lot of numerical data.
We need to scale the data to (0,1) for the better results in neural networks

In [None]:
# Min-max scaler targeting the (0, 1) range; it is only constructed here and
# is fitted on the feature matrix further down.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))

Lets create our labels

In [None]:
# The prediction target is the Cover_Type column of the training frame.
labels = df['Cover_Type']

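Neural networks usually take their labels in one-hot format.
There are many ways to do the conversion; `pd.get_dummies` is one of them. Note that the cell below is commented out, so the labels actually stay as integer class ids.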

In [None]:
# One-hot encoding is left disabled: the labels stay a single column of
# integer class ids for the rest of the notebook.
#labels = pd.get_dummies(labels)

Convert to numpy array, since NN's accept numpy arrays only

In [None]:
# Unwrap the pandas labels into a plain numpy array.
labels = np.asarray(labels)

Lets create our features matrix

In [None]:
# Everything except the target column is used as model input.
features = df.drop(labels=['Cover_Type'], axis='columns')

In [None]:
# (rows, columns) of the training feature frame.
features.shape

In [None]:
testset.shape # the test set comes without a Cover_Type column, so nothing had to be dropped for it

In [None]:
# Convert the feature frame to a numpy array (column names are lost from here on).
features = features.values

In [None]:
# Convert the test frame to a numpy array as well.
testset = testset.values

Here's a snapshot of the features.

In [None]:
# First training row before scaling.
features[0]

In [None]:
# First test row before scaling.
testset[0]

As we discussed, NN's work better on the data which is scaled. So lets scale our data.

In [None]:
# Fit the scaler on the training features and rescale them in the same step.
features = scaler.fit_transform(features)

In [None]:
# Rescale the test set with the scaler fitted on the training features (no refit).
# NOTE(review): re-running this cell would rescale already-scaled data.
testset = scaler.transform(testset)

Here's how it looks after normalization/scaling.

In [None]:
# First training row after scaling.
features[0]

In [None]:
# First test row after scaling.
testset[0]

In [None]:
# Show the container type of the labels, the features and the test set, one per line.
for array in (labels, features, testset):
    print(type(array))


Let's shift the labels down by one so the classes start at 0, and then split our dataset into a train set and a hold-out test set.

In [None]:
# Shift the class ids down by one so they start at 0.
# The value is derived from the untouched Cover_Type column instead of from
# `labels` itself, so re-running this cell cannot shift the labels twice.
labels = df.Cover_Type.values - 1

In [None]:
# Hold out part of the labelled data for validation.
# A fixed random_state keeps the split, and therefore every accuracy reported
# below, reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(features, labels, random_state=42)

In [None]:
# Shapes of the four arrays produced by the split, printed on one line.
split_shapes = [part.shape for part in (train_x, train_y, test_x, test_y)]
print(*split_shapes)

In [None]:
# Inspect the training labels of the split.
train_y

# KERAS

model = keras.models.Sequential()  # dense network evaluated on the hold-out split
model.add(Dense(300, input_dim=train_x.shape[1], activation='relu'))  # input width follows the feature matrix instead of a hard-coded count
model.add(Dense(700, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(7, activation='softmax'))  # one probability per cover type
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])  # targets are integer class ids, not one-hot rows, so the sparse loss/metric are needed
model.fit(train_x, train_y, epochs=60, shuffle=True, verbose=1)

print("The Accuracy on the sampled test set is", model.evaluate(test_x, test_y)[1])  # evaluate() returns [loss, metric]; index 1 is the accuracy

So, lets now run the same keras model on the whole train set (on whole features) and predict the testset.

modelmain = keras.models.Sequential()  # same layer layout as the hold-out model, trained on every labelled row
modelmain.add(Dense(300, input_dim=features.shape[1], activation='relu'))  # input width follows the feature matrix instead of a hard-coded count
modelmain.add(Dense(700, activation='relu'))
modelmain.add(Dense(200, activation='relu'))
modelmain.add(Dense(7, activation='softmax'))  # one probability per cover type
modelmain.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])  # targets are integer class ids, not one-hot rows, so the sparse loss/metric are needed
modelmain.fit(features, labels, epochs=120, shuffle=True, verbose=0) #verbose 0 to display no logs

pred = modelmain.predict(testset)  # one row of 7 class probabilities per test sample

covertype = np.argmax(pred, axis=1) + 1  # most probable class per row, shifted back by one to the original Cover_Type numbering

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

In [None]:
# Gradient-boosted classifier scored on the hold-out split.
# n_estimators is an upper bound here: early stopping watches the hold-out set
# passed as eval_set and ends training after 100 rounds without improvement.
gbm = lgb.LGBMClassifier(objective="multiclass", n_estimators=10000)
gbm.fit(train_x, train_y, early_stopping_rounds=100, eval_set=[(test_x, test_y)], verbose=300)

In [None]:
# Predicted class ids for the hold-out rows.
ypred1 = gbm.predict(test_x)

In [None]:
# Inspect the hold-out predictions.
ypred1

In [None]:
# Hold-out accuracy of the LightGBM model.
# NOTE(review): the same hold-out set drove early stopping above, so this figure is likely optimistic.
accuracy_score(test_y,ypred1)

In [None]:
# Inspect the full label array that the final model is trained on.
labels

In [None]:
# Final LightGBM model, trained on every labelled row with a fixed number of
# trees (no hold-out set is left for early stopping).
gbm1 = lgb.LGBMClassifier(objective="multiclass", n_estimators=4000)
gbm1.fit(features, labels, verbose=1000)

In [None]:
# Predicted class ids for the competition test set.
finalval = gbm1.predict(testset)

In [None]:
# Shift the predictions back by one: the models were trained on Cover_Type - 1.
covertype = finalval + 1

In [None]:
# Pair every saved test Id with its predicted cover type.
sub = pd.DataFrame(dict(Id=ids, Cover_Type=covertype))

In [None]:
# Put the columns in submission order: Id first, then Cover_Type.
output = sub.loc[:, ['Id', 'Cover_Type']]

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
output.to_csv("output1.csv",index = False)

# PLEASE UPVOTE, IF YOU LIKE IT.