In [None]:
import numpy as np
import pandas as pd
import joblib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

import sklearn
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingRegressor, VotingClassifier,\
GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit, train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectFromModel
import featuretools as ft
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Load the competition files; the first column (the row ID) is dropped by
# position right after reading.
train = pd.read_csv('../input/forest-cover-type-prediction/train.csv').iloc[:, 1:]
test = pd.read_csv('../input/forest-cover-type-prediction/test.csv').iloc[:, 1:]
train.head()

In [None]:
# Split the training frame by column position: the first 10 columns are
# treated as numerical, columns 10-53 as categorical, and 'Cover_Type' is the
# prediction target.
y = train['Cover_Type']
X_cat = train.iloc[:, 10:54]
X_num = train.iloc[:, :10]

# Standardize the numerical columns. `scaler` keeps the statistics learned
# from the training data.
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

# Reassemble one frame (scaled numerical + categorical) under the original
# column names.
X_train = pd.DataFrame(np.hstack((X_num, X_cat)), columns=train.columns[:54])
print(X_train.shape)

In [None]:
# Split the test frame the same way as the training frame: the first 10
# columns are treated as numerical, columns 10-53 as categorical.
X_cat_test = test.iloc[:, 10:54]
X_num_test = test.iloc[:, 0:10]

# Reuse the scaler that was fitted on the training data. Fitting a second
# scaler on the test set would standardize train and test with different
# means and variances, so the model would be asked to predict on features
# that are shifted relative to the ones it was trained on.
X_num_test = scaler.transform(X_num_test)

# Reassemble one frame (scaled numerical + categorical) under the original
# column names.
X_test = pd.DataFrame(np.hstack((X_num_test, X_cat_test)), columns=test.columns[0:54])
print(X_test.shape)

In [None]:
# Entity set that will hold the numerical columns for deep feature synthesis.
es = ft.EntitySet(id="Forest_Cover")

In [None]:
# Register the first 10 (numerical) training columns in the entity set and
# let featuretools create an 'index' column to identify the rows.
es.add_dataframe(
    dataframe_name='X_numerical',
    dataframe=X_train.iloc[:, :10],
    index='index',
    make_index=True,
)

# Deep feature synthesis restricted to the add / multiply / subtract
# transformation primitives.
X_numerical, features = ft.dfs(
    entityset=es,
    target_dataframe_name='X_numerical',
    trans_primitives=['add_numeric', 'multiply_numeric', 'subtract_numeric'],
)

# Put the categorical columns back next to the synthesized features.
X_train = pd.concat([X_numerical, X_cat], axis=1)
X_train.head()

In [None]:
# Register the first 10 (numerical) test columns and let featuretools create
# an 'index' column to identify the rows.
# NOTE(review): this reuses the dataframe name 'X_numerical' in the entity set
# that already holds the training dataframe -- confirm that featuretools
# replaces the existing dataframe as intended.
es.add_dataframe(dataframe_name = 'X_numerical', dataframe =  X_test.iloc[:, 0:10],
                         make_index = True, index = 'index')

# Run deep feature synthesis with the same transformation primitives that were
# used for the training data.
X_numerical, features = ft.dfs(entityset = es, target_dataframe_name = 'X_numerical',
                                      trans_primitives = ['add_numeric', 'multiply_numeric', 'subtract_numeric'])

# Attach the TEST categorical columns. Concatenating the training frame's
# `X_cat` here would pair each test row with the categorical values of an
# unrelated training row and leave NaN for every test row beyond the length
# of the training set.
X_test = pd.concat([X_numerical, X_cat_test], axis=1)
X_test.head()

In [None]:
# Hand-crafted features built from the elevation and distance columns of the
# training frame. New columns are added to X_train in place.
elevation = X_train['Elevation']
hyd_vert = X_train['Vertical_Distance_To_Hydrology']
hyd_horiz = X_train['Horizontal_Distance_To_Hydrology']
road_horiz = X_train['Horizontal_Distance_To_Roadways']
fire_horiz = X_train['Horizontal_Distance_To_Fire_Points']

# Euclidean combination of the vertical and horizontal hydrology distances.
# Infinite and missing results are set to 0; the NaN handling matches what the
# test-set cell applies to the same column.
X_train['Slope_Hydrology'] = (
    np.sqrt(hyd_vert**2 + hyd_horiz**2)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Vertical / horizontal ratio as a percentage. A zero horizontal distance
# yields +/-inf (or NaN for 0/0); both are mapped to 0.
X_train['Slope_Hydrology_PCT'] = (
    (hyd_vert / hyd_horiz * 100)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Elevation adjusted by weighted distances (weights 0.2, 0.05 and 1).
X_train['Elev_to_Horizontal_Hyd'] = elevation - 0.2 * hyd_horiz
X_train['Elev_to_Horizontal_Road'] = elevation - 0.05 * road_horiz
X_train['Elev_to_Verticle_Hyd'] = elevation - hyd_vert

# Averages of the horizontal distance columns.
X_train['Mean_Horizontal_Dist'] = (fire_horiz + hyd_horiz + road_horiz) / 3
X_train['Mean_Fire_Hydro'] = (fire_horiz + hyd_horiz) / 2


In [None]:
# Hand-crafted features built from the elevation and distance columns of the
# test frame. New columns are added to X_test in place.
elevation = X_test['Elevation']
hyd_vert = X_test['Vertical_Distance_To_Hydrology']
hyd_horiz = X_test['Horizontal_Distance_To_Hydrology']
road_horiz = X_test['Horizontal_Distance_To_Roadways']
fire_horiz = X_test['Horizontal_Distance_To_Fire_Points']

# Euclidean combination of the vertical and horizontal hydrology distances;
# infinite and missing results are set to 0.
X_test['Slope_Hydrology'] = (
    np.sqrt(hyd_vert**2 + hyd_horiz**2)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Vertical / horizontal ratio as a percentage; +/-inf and NaN are set to 0.
X_test['Slope_Hydrology_PCT'] = (
    (hyd_vert / hyd_horiz * 100)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Elevation adjusted by weighted distances (weights 0.2, 0.05 and 1).
X_test['Elev_to_Horizontal_Hyd'] = elevation - 0.2 * hyd_horiz
X_test['Elev_to_Horizontal_Road'] = elevation - 0.05 * road_horiz
X_test['Elev_to_Verticle_Hyd'] = elevation - hyd_vert

# Averages of the horizontal distance columns.
X_test['Mean_Horizontal_Dist'] = (fire_horiz + hyd_horiz + road_horiz) / 3
X_test['Mean_Fire_Hydro'] = (fire_horiz + hyd_horiz) / 2

## Gradient Boosting

In [None]:
# LightGBM classifier with its default hyper-parameters.
LGBM_model = LGBMClassifier()

# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed),
# scored on accuracy with n_jobs=-1.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    estimator=LGBM_model,
    X=X_train,
    y=y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Report the mean accuracy and its standard deviation over all folds.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# Fit the classifier on the full training set.
LGBM_model.fit(X=X_train, y=y)

In [None]:
# Predict a cover type for every row of the engineered test frame.
y_pred = LGBM_model.predict(X=X_test)

In [None]:
# Load the sample submission file to use as the template for the output.
submission = pd.read_csv(
    filepath_or_buffer='../input/forest-cover-type-prediction/sampleSubmission.csv'
)
submission.head()

In [None]:
# Overwrite the 'Cover_Type' column with the model's predictions.
submission = submission.assign(Cover_Type=y_pred)
submission.head()

In [None]:
# Write the predictions to disk without the DataFrame index.
submission.to_csv('submission.csv', index = False, header = True)

# Preview the synthesized feature matrix. This notebook binds the ft.dfs
# output to `X_numerical`; the name `feature_matrix` is never defined and
# raised a NameError when the notebook was run top to bottom.
X_numerical.head()