In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.metrics import confusion_matrix

In [None]:
# Read the competition's training and test CSVs into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/forest-cover-type-prediction/train.csv',
        '/kaggle/input/forest-cover-type-prediction/test.csv',
    )
)

In [None]:
# Peek at the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Distinct values taken by the target column.
train['Cover_Type'].unique()

In [None]:
# Number of training rows per target class.
train['Cover_Type'].value_counts()

In [None]:
# Summary statistics, transposed so that each feature is a row.
train.describe().transpose()

In [None]:
# Names of the ten continuous feature columns (the remaining feature columns
# used later in the notebook are Wilderness_Area* / Soil_Type* indicators).
cont_feat = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

In [None]:
# One box plot per continuous feature, split by cover type, laid out on a 5x2 grid.
fig, axes = plt.subplots(5, 2, figsize=(15, 27))
# axes.ravel() walks the grid row by row, i.e. panel k sits at (k // 2, k % 2).
for ax, feat in zip(axes.ravel(), cont_feat):
    sns.boxplot(data=train, x='Cover_Type', y=feat, ax=ax)
    # Title the panel that was just drawn. plt.title() acts on the *current* axes,
    # which after plt.subplots() is always the last panel, so the other nine
    # panels previously ended up without a title.
    ax.set_title(feat)

In [None]:
# Cross-tabulate the target against a handful of the 0/1 indicator columns;
# the cell displays the four tables as one tuple.
tuple(
    pd.crosstab(train['Cover_Type'], train[indicator])
    for indicator in ('Wilderness_Area1', 'Wilderness_Area2', 'Soil_Type1', 'Soil_Type5')
)

In [None]:
# Target is kept as a one-column DataFrame; the features are every column
# except the target and the row identifier.
ytrain = train.loc[:, ['Cover_Type']]
xtrain = train.drop(columns=['Cover_Type', 'Id'])

In [None]:

# Standardise the ten continuous features and re-attach the untouched 0/1
# indicator columns (4 wilderness areas + 40 soil types).
continuous_cols = ['Elevation', 'Aspect', 'Slope',
                   'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
                   'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
                   'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
indicator_cols = (['Wilderness_Area{}'.format(i) for i in range(1, 5)]
                  + ['Soil_Type{}'.format(i) for i in range(1, 41)])

scaler = StandardScaler()
# ndarray holding only the scaled continuous columns.
xtrain_scaled = scaler.fit_transform(xtrain[continuous_cols])
scaled_df = pd.DataFrame(xtrain_scaled, columns=continuous_cols)
# drop=True is required: a bare reset_index() inserts the old index as an extra
# 'index' column, which would then be treated as a feature.
# NOTE(review): the name `xtrain_scalled` (sic) is preserved as in the original;
# nothing visible in this notebook reads it -- confirm whether the t-SNE step
# was meant to use this full frame instead of `xtrain_scaled`.
xtrain_scalled = pd.concat(
    [scaled_df, xtrain[indicator_cols].reset_index(drop=True)],
    axis=1,
)

In [None]:
# 2-D t-SNE embedding of the standardised continuous features, coloured by cover type.
# random_state pins the otherwise stochastic layout so the figure is reproducible
# on a fresh kernel.
X_embedded = TSNE(n_components=2, learning_rate=2000, perplexity=100,
                  random_state=1).fit_transform(xtrain_scaled)
# Build the frame column-wise: stacking the integer labels onto the float
# embedding with np.hstack turned them into floats ("1.0", "2.0", ... in the legend).
data_stack = pd.DataFrame(X_embedded, columns=['first', 'second'])
data_stack['target'] = ytrain['Cover_Type'].values
plt.figure(figsize=(15, 15))
sns.scatterplot(data=data_stack, x='first', y='second', hue='target', palette="deep");

# Decision Tree

In [None]:
# Hold out 30% of the (unscaled) training rows for evaluation; the seed is fixed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    xtrain,
    ytrain,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Wider grid kept for reference (presumably the one explored originally):
#   params_tree = {'max_depth': [6, 10, 15, 20, 25, 30],
#                  'criterion': ['gini', 'entropy'],
#                  'splitter': ['best', 'random'],
#                  'min_samples_leaf': [1, 5, 10, 15]}
# Only a single combination is searched below, so the grid search here just
# cross-validates that one setting.

# splitter='random' makes the tree stochastic; seed it so scores, the text dump
# and the importances are reproducible.
tree_clf = DecisionTreeClassifier(random_state=1)
params_tree = {'max_depth': [30],
               'criterion': ['entropy'],
               'splitter': ['random'],
               'min_samples_leaf': [1]}
search_tree = GridSearchCV(tree_clf, params_tree, cv=5)

In [None]:
# Run the grid search. Labels are flattened to 1-D, matching how the random
# forest and LightGBM searches in this notebook are fitted.
search_tree.fit(X_train, y_train.values.ravel())

In [None]:
# Mean cross-validated score of the best parameter combination.
search_tree.best_score_

In [None]:
# Parameter combination selected by the decision-tree grid search.
search_tree.best_params_
# Recorded result:
#{'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'splitter': 'random'}

In [None]:
# Tree refitted with the best parameters (GridSearchCV is used with its default refit setting).
best_tree = search_tree.best_estimator_

In [None]:
# Plain-text dump of the fitted tree's decision rules. Real column names are
# passed so the rules read e.g. "Elevation <= ..." instead of generic
# "feature_N"; list() is used because export_text expects a plain list of names.
text_representation = tree.export_text(best_tree, feature_names=list(X_train.columns))
print(text_representation)

In [None]:
# Optional graphical rendering of the tree. Disabled because it needs the
# `graphviz` package, which this notebook does not import. It is written as
# real comments (a bare string literal would be echoed as the cell's output),
# and export_graphviz is reached through the already-imported `tree` module.
#
# import graphviz
# dot_graph = tree.export_graphviz(best_tree, class_names=['1','2','3','4','5','6','7'],
#                                  feature_names=X_train.columns, impurity=False, filled=True)
# graphviz.Source(dot_graph)

In [None]:
# Bar chart of the tuned decision tree's feature importances, one bar per input column.
imp = best_tree.feature_importances_
plt.figure(figsize=(15, 5))
plt.bar(x=X_train.columns, height=imp)
# Column names are long; turn them vertical so they do not overlap.
plt.xticks(rotation=90);

In [None]:
# Hold-out accuracy of the tuned decision tree. Arguments follow the
# accuracy_score(y_true, y_pred) signature, as the confusion_matrix call in
# this notebook does.
accuracy_score(y_test, best_tree.predict(X_test))

# Random Forest

In [None]:
# Wider grid kept for reference (presumably the one explored originally):
#   params_forest = {'n_estimators': [100, 300, 500, 1000],
#                    'max_depth': [5, 7, 10, 15, 20],
#                    'min_samples_leaf': [1, 5, 10, 15, 20]}
# Only a single combination is searched below.

# Seeded so that bootstrap sampling / feature sub-sampling, and therefore the
# reported scores and importances, are reproducible.
random_forest = RandomForestClassifier(random_state=1)
params_forest = {'n_estimators': [500],
                 'max_depth': [20],
                 'min_samples_leaf': [1]}

In [None]:
# 5-fold grid search over the random-forest parameter grid; labels are
# flattened to the 1-D array the estimator expects.
grid_forest = GridSearchCV(estimator=random_forest, param_grid=params_forest, cv=5)
grid_forest.fit(X_train, y_train.values.ravel())

In [None]:
# Mean cross-validated score of the best random-forest parameter combination.
grid_forest.best_score_

In [None]:
# Parameter combination selected by the random-forest grid search.
grid_forest.best_params_
# Recorded result:
#{'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 500}

In [None]:
# Forest refitted with the best parameters (GridSearchCV is used with its default refit setting).
best_forest = grid_forest.best_estimator_

In [None]:
# Bar chart of the tuned random forest's feature importances, one bar per input column.
imp = best_forest.feature_importances_
plt.figure(figsize=(15, 5))
plt.bar(x=X_train.columns, height=imp)
# Column names are long; turn them vertical so they do not overlap.
plt.xticks(rotation=90);

In [None]:
# Cross-validated score of a 500-tree forest as a function of max_depth (1..30),
# plotted as mean with a +/- one-standard-deviation error bar across folds.
# NOTE: this fits 30 forests of 500 trees once per CV fold, so it is slow.
depths = range(1, 31)
result_cv = []
result_std = []
for depth in depths:
    # Use a dedicated name: the loop must not overwrite `random_forest`, the
    # base estimator that the grid search above was built from.
    depth_forest = RandomForestClassifier(n_estimators=500, max_depth=depth, random_state=1)
    res = cross_val_score(depth_forest, X_train, y_train.values.ravel())
    result_cv.append(res.mean())
    result_std.append(res.std())
plt.figure(figsize=(10, 5))
plt.errorbar(depths, result_cv, yerr=result_std)
plt.xlabel('max_depth')
plt.ylabel('cross-validated score (mean, std as error bar)')
plt.title('Random forest (500 trees): score vs. max_depth');

In [None]:
# Hold-out accuracy of the tuned random forest. Arguments follow the
# accuracy_score(y_true, y_pred) signature, as the confusion_matrix call in
# this notebook does.
accuracy_score(y_test, best_forest.predict(X_test))

# LightGBM

In [None]:
# Wider grid kept for reference (presumably the one explored originally):
#   param_lgb = {'num_leaves': [25, 40, 70, 100, 130, 160],
#                'n_estimators': [100, 300, 700]}
# Only a single combination is searched below.
lgb_train = lgb.LGBMClassifier(objective='multiclass')
param_lgb = {'num_leaves': [100],
             'n_estimators': [700]}

In [None]:
# 5-fold grid search wrapper around the LightGBM classifier.
search = GridSearchCV(estimator=lgb_train, param_grid=param_lgb, cv=5)

In [None]:
# Run the LightGBM grid search; the one-column label frame is flattened to a 1-D array.
search.fit(X_train, y_train.values.ravel())

In [None]:
# Parameter combination selected by the LightGBM grid search.
search.best_params_
# Recorded results. The first two mention values/keys that are not in the
# one-point grid defined for this search, so they presumably come from
# earlier runs with different grids; only the last matches the current grid.
#{'num_leaves': 150, 'n_estimators': 300, 'learning_rate': 0.1}
#{'max_depth': -1, 'n_estimators': 700, 'num_leaves': 25}
#{'n_estimators': 700, 'num_leaves': 100}

In [None]:
# Mean cross-validated score of the best LightGBM parameter combination.
search.best_score_

In [None]:
# LightGBM model refitted with the best parameters (GridSearchCV is used with its default refit setting).
best_lgbm = search.best_estimator_

In [None]:
# Bar chart of the tuned LightGBM model's feature importances, one bar per input column.
plt.figure(figsize=(15, 5))
plt.bar(x=X_train.columns, height=best_lgbm.feature_importances_)
# Column names are long; turn them vertical so they do not overlap.
plt.xticks(rotation=90);

In [None]:
# Hold-out accuracy of the tuned LightGBM model. Arguments follow the
# accuracy_score(y_true, y_pred) signature, as the confusion_matrix call in
# this notebook does.
accuracy_score(y_test, best_lgbm.predict(X_test))

In [None]:
# Confusion matrix on the hold-out set: rows are true classes, columns are
# predicted classes (scikit-learn's convention).
confusion_matrix(y_test, best_lgbm.predict(X_test))

In [None]:
# Heat map of the hold-out confusion matrix for the tuned LightGBM model.
# The matrix is built over the model's own class labels and the same labels are
# used as tick labels; without them the axes show 0-based positions rather
# than the actual Cover_Type values.
class_labels = best_lgbm.classes_
plt.figure(figsize=(15, 10))
sns.heatmap(confusion_matrix(y_test, best_lgbm.predict(X_test), labels=class_labels),
            fmt='', annot=True, xticklabels=class_labels, yticklabels=class_labels)
# scikit-learn convention: rows are true classes, columns are predictions.
plt.xlabel('Predicted Cover_Type')
plt.ylabel('True Cover_Type');

In [None]:
# Predict the cover type for every test row (Id is an identifier, not a feature)
# and write an Id-indexed, single-column submission file.
test_features = test.drop(columns='Id')
predict = best_lgbm.predict(test_features)
submission = pd.DataFrame({'Id': test['Id'], 'Cover_Type': predict})
submission = submission.set_index('Id')
submission.to_csv('sub.csv')

In [None]:
# Preview the submission frame (indexed by Id, one Cover_Type column).
submission.head()