# Forest Cover Type

## Load Data & Setup

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

import warnings
# Silence every warning category from here on to keep cell output clean.
# Note: this also hides deprecation and other library warnings.
warnings.filterwarnings('ignore')

In [None]:
# Training data; it provides the `Cover_Type` column used as the target further down.
TRAIN_CSV = '../input/forest-cover-type-prediction/train.csv'
train = pd.read_csv(TRAIN_CSV)

# Preview the first rows.
train.head()

In [None]:
# Test data; predictions for these rows are written to the submission file further down.
TEST_CSV = '../input/forest-cover-type-prediction/test.csv'
test = pd.read_csv(TEST_CSV)

# Preview the first rows.
test.head()

In [None]:
# Size check: (rows, columns) of the test frame as loaded, before the engineered columns are added.
test.shape

## Feature Engineering

In [None]:
# Engineered features for the test frame, assigned onto `test` itself.

# Euclidean norm of the horizontal and vertical distances to hydrology.
horizontal_hydro = test['Horizontal_Distance_To_Hydrology']
vertical_hydro = test['Vertical_Distance_To_Hydrology']
test['Euclidian_Distance_To_Hydrology'] = (horizontal_hydro**2 + vertical_hydro**2)**0.5

# Every remaining feature is the plain average of two existing columns.
# The dict order fixes the order in which the new columns are assigned.
pairwise_means = {
    'Mean_Elevation_Vertical_Distance_Hydrology': ('Elevation', 'Vertical_Distance_To_Hydrology'),
    'Mean_Distance_Hydrology_Firepoints': ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points'),
    'Mean_Distance_Hydrology_Roadways': ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways'),
    'Mean_Distance_Firepoints_Roadways': ('Horizontal_Distance_To_Fire_Points', 'Horizontal_Distance_To_Roadways'),
}
for new_column, (first, second) in pairwise_means.items():
    test[new_column] = (test[first] + test[second])/2

test

In [None]:
# Engineered features for the training frame, assigned onto `train` itself.

# Euclidean norm of the horizontal and vertical distances to hydrology.
horizontal_hydro = train['Horizontal_Distance_To_Hydrology']
vertical_hydro = train['Vertical_Distance_To_Hydrology']
train['Euclidian_Distance_To_Hydrology'] = (horizontal_hydro**2 + vertical_hydro**2)**0.5

# Every remaining feature is the plain average of two existing columns.
# The dict order fixes the order in which the new columns are assigned.
pairwise_means = {
    'Mean_Elevation_Vertical_Distance_Hydrology': ('Elevation', 'Vertical_Distance_To_Hydrology'),
    'Mean_Distance_Hydrology_Firepoints': ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points'),
    'Mean_Distance_Hydrology_Roadways': ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways'),
    'Mean_Distance_Firepoints_Roadways': ('Horizontal_Distance_To_Fire_Points', 'Horizontal_Distance_To_Roadways'),
}
for new_column, (first, second) in pairwise_means.items():
    train[new_column] = (train[first] + train[second])/2

train

In [None]:
# Disabled: optional cast of the engineered float64 train features to int64 (astype would truncate the fractional part).
#train['Euclidian_Distance_To_Hydrology'] = train['Euclidian_Distance_To_Hydrology'].astype(np.int64)
#train['Mean_Elevation_Vertical_Distance_Hydrology'] = train['Mean_Elevation_Vertical_Distance_Hydrology'].astype(np.int64)
#train['Mean_Distance_Hydrology_Firepoints'] = train['Mean_Distance_Hydrology_Firepoints'].astype(np.int64)
#train['Mean_Distance_Hydrology_Roadways'] = train['Mean_Distance_Hydrology_Roadways'].astype(np.int64)

In [None]:
# Disabled: optional cast of the engineered float64 test features to int64 (astype would truncate the fractional part).
#test['Euclidian_Distance_To_Hydrology'] = test['Euclidian_Distance_To_Hydrology'].astype(np.int64)
#test['Mean_Elevation_Vertical_Distance_Hydrology'] = test['Mean_Elevation_Vertical_Distance_Hydrology'].astype(np.int64)
#test['Mean_Distance_Hydrology_Firepoints'] = test['Mean_Distance_Hydrology_Firepoints'].astype(np.int64)
#test['Mean_Distance_Hydrology_Roadways'] = test['Mean_Distance_Hydrology_Roadways'].astype(np.int64)

## Preprocessing

In [None]:
# Split the test frame into a categorical block and a numerical block by column position.
# NOTE(review): the positions depend on the CSV column order. Position 0 is presumably
# `Id`, which would then be treated as a numerical feature; TODO confirm.
categorical_positions = slice(11, 55)
# Leading columns plus the last five positions (presumably the engineered columns added above).
numerical_positions = np.r_[0:11, 55:60]

# categorical features
X_cat = test.iloc[:, categorical_positions].values

# numerical features
X_num = test.iloc[:, numerical_positions].values

## Feature Scaling

In [None]:
# StandardScaler is already imported in the first cell; the import is repeated here
# so this cell can be run on its own.
from sklearn.preprocessing import StandardScaler

# Standardize the numerical block: learn the per-column statistics from X_num and
# apply them in one step. X_num was sliced from the `test` frame, not from `train`.
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

# Report the shape of each block.
print(f'Categorical Shape: {X_cat.shape}')
print(f'Numerical Shape: {X_num.shape}')
# print(f'Label Shape: {y.shape}')

In [None]:
# Join the scaled numerical block and the categorical block column-wise (numerical columns first).
# NOTE(review): `X` is reassigned in the model cell below, so this array does not
# appear to feed the final model; confirm whether that is intended.
X = np.concatenate((X_num, X_cat), axis=1)
print(X.shape)

## Model Training & Submission

In [None]:
# Columns fed to the model (per the original note: the top features found in training).
# The order is significant because it defines the column order of the model's input matrix.

# Columns used as loaded from the CSVs.
raw_features = [
    'Elevation',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Aspect',
    'Slope',
]

# Columns created in the feature-engineering cells.
engineered_features = [
    'Euclidian_Distance_To_Hydrology',
    'Mean_Elevation_Vertical_Distance_Hydrology',
    'Mean_Distance_Hydrology_Firepoints',
    'Mean_Distance_Hydrology_Roadways',
    'Mean_Distance_Firepoints_Roadways',
]

features = raw_features + engineered_features

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Target labels come from the training frame only.
y = train["Cover_Type"]

# One-hot encode any non-numeric columns among the selected features.
X = pd.get_dummies(train[features])
# Train and test are encoded independently, so a category present in only one of
# them would give the two matrices different columns. Force the test matrix onto
# the exact columns (and order) the model is fitted on; dummy columns missing from
# test are filled with 0. This is a no-op when the columns already match.
X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

# Fixed random_state so repeated runs build the same ensemble.
model = ExtraTreesClassifier(n_estimators=1000, random_state=42, max_features='log2')
model.fit(X, y)
predictions = model.predict(X_test)

# Submission file: one predicted Cover_Type per test Id, written without the index column.
output = pd.DataFrame({'Id': test.Id, 'Cover_Type': predictions})
output.to_csv('forest_cover_submission.csv', index=False)
print("Your submission was successfully saved!")