In [None]:
import numpy as np
import pandas as pd
import joblib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

import sklearn
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingRegressor, VotingClassifier,\
GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectFromModel
import featuretools as ft
from lightgbm import LGBMClassifier

In [None]:
# Read the train and test CSVs in one step, then preview the first rows of train.
train, test = (
    pd.read_csv('../input/forest-cover-type-prediction/train.csv'),
    pd.read_csv('../input/forest-cover-type-prediction/test.csv'),
)
train.head()

In [None]:
# Keep everything from column 1 onward, i.e. discard the leading column of
# each frame (presumably a row identifier -- confirm against the CSV header).
# NOTE: the frames are rebound to their own slices, so running this cell a
# second time would discard one more column.
train, test = train.iloc[:, 1:], test.iloc[:, 1:]

In [None]:
# Feature matrices: the first 54 columns of each frame, selected by position.
# .copy() makes them independent DataFrames. Engineered columns are added to
# X_train / X_test further down; without the copy those writes target a slice
# of train / test, which pandas reports with SettingWithCopyWarning.
X_train = train.iloc[:, 0:54].copy()
X_test = test.iloc[:, 0:54].copy()

### Feature Engineering

In [None]:
from IPython.display import Image

def plota(a1, a2, xlabel=None, ylabel=None):
    """Scatter-plot ``a1`` against ``a2`` with points coloured by cover type.

    The colours are taken from the module-level ``train.Cover_Type``, so
    ``a1`` and ``a2`` must hold one value per row of ``train``.

    Parameters
    ----------
    a1, a2 : pandas.Series
        x and y values of the points.
    xlabel, ylabel : str, optional
        Axis labels. When omitted, the ``name`` of the corresponding Series
        is used. A Series computed from two differently named columns has no
        name; in that case the axis is left unlabelled instead of being
        handed ``None``.
    """
    def _axis_label(explicit, values):
        # Explicit label wins; otherwise fall back to the Series name, if any.
        if explicit is not None:
            return explicit
        name = getattr(values, 'name', None)
        return '' if name is None else str(name)

    _, ax = plt.subplots(figsize=(16, 8))
    ax.scatter(a1, a2, c=train.Cover_Type.values, s=100)
    ax.set_xlabel(_axis_label(xlabel, a1))
    ax.set_ylabel(_axis_label(ylabel, a2))

In [None]:
# Elevation against horizontal distance to hydrology, coloured by cover type.
plota(train['Elevation'], train['Horizontal_Distance_To_Hydrology'])

In [None]:
# Same axes, but elevation is first reduced by 0.2 x the horizontal hydrology distance.
plota(
    train['Elevation'] - 0.2 * train['Horizontal_Distance_To_Hydrology'],
    train['Horizontal_Distance_To_Hydrology'],
)

In [None]:
# Elevation against horizontal distance to roadways, coloured by cover type.
plota(train['Elevation'], train['Horizontal_Distance_To_Roadways'])

In [None]:
# Same axes, but elevation is first reduced by 0.05 x the roadway distance.
plota(
    train['Elevation'] - 0.05 * train['Horizontal_Distance_To_Roadways'],
    train['Horizontal_Distance_To_Roadways'],
)

In [None]:
# Elevation against vertical distance to hydrology, coloured by cover type.
plota(train['Elevation'], train['Vertical_Distance_To_Hydrology'])

In [None]:
# Same axes, but elevation is first reduced by the vertical hydrology distance.
plota(
    train['Elevation'] - train['Vertical_Distance_To_Hydrology'],
    train['Vertical_Distance_To_Hydrology'],
)

In [None]:
# Engineered features for the training matrix.
#
# * Elev_to_*        : elevation minus a scaled distance (the same expressions
#                      that were plotted above).
# * Mean_*           : plain averages of the horizontal distance columns.
#
# The five columns are appended after the existing ones in exactly this order;
# the pre-processing cell picks them up by position (54:59), so do not reorder.
# assign() builds a new DataFrame, so nothing is written into a slice of
# `train`. Re-running the cell simply recomputes the same five columns.
# The spelling 'Elev_to_Verticle_Hyd' is kept so train and test columns match.
X_train = X_train.assign(
    Elev_to_Horizontal_Hyd=X_train['Elevation'] - 0.2 * X_train['Horizontal_Distance_To_Hydrology'],
    Elev_to_Horizontal_Road=X_train['Elevation'] - 0.05 * X_train['Horizontal_Distance_To_Roadways'],
    Elev_to_Verticle_Hyd=X_train['Elevation'] - X_train['Vertical_Distance_To_Hydrology'],
    Mean_Horizontal_Dist=(
        X_train['Horizontal_Distance_To_Fire_Points']
        + X_train['Horizontal_Distance_To_Hydrology']
        + X_train['Horizontal_Distance_To_Roadways']
    ) / 3,
    Mean_Fire_Hydro=(
        X_train['Horizontal_Distance_To_Fire_Points']
        + X_train['Horizontal_Distance_To_Hydrology']
    ) / 2,
)

In [None]:
# Engineered features for the test matrix: the same five columns, with the
# same formulas and in the same order, as are added to the training matrix.
# The pre-processing cell selects them by position (54:59), so the order
# matters. assign() builds a new DataFrame, so nothing is written into a
# slice of `test`. Re-running the cell simply recomputes the columns.
# The spelling 'Elev_to_Verticle_Hyd' is kept so train and test columns match.
X_test = X_test.assign(
    Elev_to_Horizontal_Hyd=X_test['Elevation'] - 0.2 * X_test['Horizontal_Distance_To_Hydrology'],
    Elev_to_Horizontal_Road=X_test['Elevation'] - 0.05 * X_test['Horizontal_Distance_To_Roadways'],
    Elev_to_Verticle_Hyd=X_test['Elevation'] - X_test['Vertical_Distance_To_Hydrology'],
    Mean_Horizontal_Dist=(
        X_test['Horizontal_Distance_To_Fire_Points']
        + X_test['Horizontal_Distance_To_Hydrology']
        + X_test['Horizontal_Distance_To_Roadways']
    ) / 3,
    Mean_Fire_Hydro=(
        X_test['Horizontal_Distance_To_Fire_Points']
        + X_test['Horizontal_Distance_To_Hydrology']
    ) / 2,
)

### Pre-Processing

In [None]:
# Target vector.
y = train['Cover_Type'].values

# Positional split of the feature matrix:
#   columns 10..53        -> X_cat, passed through unscaled
#   columns 0..9 + 54..58 -> X_num, standardised below
X_cat_columns = X_train.columns[10:54]
X_num_columns = X_train.columns[np.r_[0:10, 54:59]]
X_cat = X_train.iloc[:, 10:54].to_numpy()

# Fit the scaler on the training block and standardise it in one call.
scaler = StandardScaler()
X_num = scaler.fit_transform(X_train.iloc[:, np.r_[0:10, 54:59]].to_numpy())

# Reassemble with the scaled block first, followed by the unscaled block.
# NOTE: X_train is replaced by a frame with a different column order, so the
# positional indices above are only valid the first time this cell runs.
X_train = pd.DataFrame(
    np.concatenate((X_num, X_cat), axis=1),
    columns=[*X_num_columns, *X_cat_columns],
)
print(X_train.shape)

In [None]:
# Same positional split as for the training matrix:
#   columns 10..53        -> passed through unscaled
#   columns 0..9 + 54..58 -> standardised
X_cat_test = X_test.iloc[:, 10:54].values
X_cat_test_columns = X_test.iloc[:, 10:54].columns
X_num_test = X_test.iloc[:, np.r_[0:10, 54:59]].values
X_num_test_columns = X_test.iloc[:, np.r_[0:10, 54:59]].columns

# Apply the scaler that was fitted on the TRAINING data. Refitting it here
# would standardise the test set with its own mean/std, so a given raw value
# would map to different scaled values in train and test and the thresholds
# learned by the model would no longer line up.
X_num_test = scaler.transform(X_num_test)

# Reassemble with the scaled block first, matching the training column order.
X_test = pd.DataFrame(np.hstack((X_num_test, X_cat_test)), columns=list(X_num_test_columns)+list(X_cat_test_columns))
print(X_test.shape)

In [None]:
# Absolute pairwise Pearson correlation between all pre-processed features.
cor_matrix = X_train.corr(method='pearson').abs()
print(cor_matrix)

In [None]:
# Importance-based feature selection with a default-sized ExtraTrees model.
# The model is seeded so that the importances, and therefore the selected
# column set, are the same on every run.
simple_model = ExtraTreesClassifier(random_state=0).fit(X_train, y)
selector = SelectFromModel(simple_model, prefit=True)

# Keep the selected columns by indexing with the support mask; this preserves
# column names without a round-trip through a bare NumPy array.
# NOTE(review): X_reduced is not used by the tuning / training cells below,
# which work on the full X_train -- confirm whether that is intended.
X_reduced = X_train.loc[:, selector.get_support()].copy()

In [None]:
%%time

# optuna hyperparameter tuning
def objective(trial):
    """Optuna objective: mean cross-validated score of one ExtraTrees candidate.

    Samples ``max_depth`` (30-50) and ``min_samples_leaf`` (1-20) from
    ``trial``, then evaluates a 200-tree ExtraTreesClassifier on the
    module-level ``X_train`` / ``y`` with 10-fold stratified CV repeated
    three times. Returns the mean of the fold scores.
    """
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 30, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
    }
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    candidate = ExtraTreesClassifier(n_estimators=200, random_state=0, **sampled)
    fold_scores = sklearn.model_selection.cross_val_score(
        candidate, X_train, y, cv=splitter, n_jobs=-1
    )
    return fold_scores.mean()
    
# Run 20 trials and keep the best one. The TPE sampler (Optuna's default for
# single-objective studies) is seeded explicitly so that repeated runs propose
# the same hyperparameters and the reported best trial is reproducible.
tree_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
tree_study.optimize(objective, n_trials=20)

# `tree` is the best trial; its .params feed the final model below.
tree = tree_study.best_trial
print('Accuracy: {}'.format(tree.value))
print("Best hyperparameters: {}".format(tree.params))

In [None]:
# Final model: 200 trees with the depth / leaf-size found by the study,
# fitted on the full training matrix. fit() returns the estimator itself,
# so construction and fitting are chained.
tree_model = ExtraTreesClassifier(
    n_estimators=200,
    max_depth=tree.params['max_depth'],
    min_samples_leaf=tree.params['min_samples_leaf'],
    random_state=1,
).fit(X_train, y)
tree_model

In [None]:
# Predict a cover type for every row of the pre-processed test matrix.
y_pred = tree_model.predict(X=X_test)

In [None]:
# Use the provided sample submission as the template for the output file.
submission = pd.read_csv(
    filepath_or_buffer='../input/forest-cover-type-prediction/sampleSubmission.csv'
)

In [None]:
# Put the model's predictions into the Cover_Type column of the template.
submission = submission.assign(Cover_Type=y_pred)

In [None]:
# Write the submission file with a header row and without the index column.
submission.to_csv(path_or_buf='submission.csv', header=True, index=False)