# Forest Cover Type

## Load Data & Setup

In [None]:
%matplotlib inline

import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

warnings.filterwarnings('ignore')

In [None]:
# Read the competition's training CSV and preview its first five rows.
train = pd.read_csv(
    filepath_or_buffer='../input/forest-cover-type-prediction/train.csv',
)
train.head(n=5)

In [None]:
# Read the competition's test CSV (the rows to predict) and preview its first five rows.
test = pd.read_csv(
    filepath_or_buffer='../input/forest-cover-type-prediction/test.csv',
)
test.head(n=5)

In [None]:
# (rows, columns) of the test set.
test.shape

In [None]:
# dtype of every column in the test set.
test.dtypes

## Data Exploration

In [None]:
# Split the *test* set into binary indicator columns and numerical columns.
#
# Columns are picked by name instead of by fixed offsets ([:10] / [10:-1]).
# NOTE(review): test carries an `Id` column (it is read as `test.Id` when the
# submission is built) and, being the frame we predict on, presumably has no
# trailing target column. Under that layout the old offsets would pull `Id`
# into the numerical block and drop the last feature column. Confirm against
# the `test.dtypes` output above.

# extract all binary (one-hot) features from test: Wilderness_Area* and Soil_Type*
cat_features = test.filter(regex=r'^(?:Wilderness_Area|Soil_Type)')

# extract all numerical features from test: everything that is neither the
# row identifier nor one of the binary indicators
num_features = test.drop(columns=['Id', *cat_features.columns], errors='ignore')

In [None]:
# Summary statistics for the columns selected as numerical features.
num_features.describe()

In [None]:
# Summary statistics for the columns selected as binary features.
cat_features.describe()

## Feature Engineering
### Observation Cleaning
Update 11/16/21: This section may not actually be needed for the submission model — of course, certain values will not be present when predicting the cover type.

In [None]:
# Split the binary features into wilderness-area and soil-type indicators.
# Selecting by column-name prefix keeps each group exact even if
# `cat_features` does not start with precisely four wilderness columns,
# which a positional [:4] / [4:] split silently relies on.
wild_data = cat_features.filter(like='Wilderness_Area')
soil_data = cat_features.filter(like='Soil_Type')

In [None]:
# Count observations whose Wilderness Area indicators are inconsistent:
# flagged in more than one area, or flagged in none.

# Per-row sum of the indicator columns = number of areas the row is flagged in.
wild_presence = wild_data.sum(axis=1)

# Tally over *every* row. Stopping at the first multi-presence row (an early
# `break`) would cap more_count at 1 and leave none_count covering only the
# rows seen before that point.
# count for more than 1 presence
more_count = int((wild_presence > 1).sum())
# count for no presence
none_count = int((wild_presence == 0).sum())

print(f'We have {more_count} observations that shows presence in more than 1 Wilderness Area.')
print(f'We have {none_count} observations that shows no presence in any Wilderness Area.')

In [None]:
# Count observations whose Soil Type indicators are inconsistent:
# flagged in more than one soil type, or flagged in none.

# Per-row sum of the indicator columns = number of soil types the row is flagged in.
soil_presence = soil_data.sum(axis=1)

# Tally over *every* row. Stopping at the first multi-presence row (an early
# `break`) would cap more_count at 1 and leave none_count covering only the
# rows seen before that point.
# count for more than 1 presence
more_count = int((soil_presence > 1).sum())
# count for no presence
none_count = int((soil_presence == 0).sum())

print(f'We have {more_count} observations that shows presence in more than 1 Soil Type Area.')
print(f'We have {none_count} observations that shows no presence in any Soil Type Area.')

When this notebook was last run, the checks above reported:

- 1 observation that is present in more than 1 Wilderness Area.
- 1 observation that is present in more than 1 Soil Type Area.
- There are 65 … *(TODO: this note was left unfinished — state what the 65 refers to.)*

# Model Evaluation

In [None]:
# The top 20 features found in training, one per line.
# Order is kept exactly as given: it fixes the column order of the model input.
features = [
    'Elevation',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area4',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Aspect',
    'Hillshade_3pm',
    'Hillshade_Noon',
    'Hillshade_9am',
    'Soil_Type28',
    'Soil_Type18',
    'Soil_Type19',
    'Soil_Type20',
    'Soil_Type21',
    'Soil_Type22',
    'Soil_Type10',
    'Soil_Type3',
    'Soil_Type30',
    'Soil_Type4',
]

In [None]:
# Fit an extra-trees classifier on the selected training features, predict the
# cover type for every test row, and write the submission CSV.
# ExtraTreesClassifier is imported in the setup cell with the other sklearn imports.

y = train["Cover_Type"]

# get_dummies one-hot encodes any non-numeric columns among the selected features.
X = pd.get_dummies(train[features])
X_test = pd.get_dummies(test[features])

# train and test are encoded independently, so nothing guarantees they end up
# with the same columns in the same order. Force the test matrix onto the
# training layout; an indicator column absent from test is filled with 0.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Fixed random_state keeps the fitted forest (and the submission) reproducible.
model = ExtraTreesClassifier(n_estimators=50, random_state=53)
model.fit(X, y)
predictions = model.predict(X_test)

# Submission file: one row per test Id with its predicted Cover_Type.
output = pd.DataFrame({'Id': test.Id, 'Cover_Type': predictions})
output.to_csv('forest_cover_submission.csv', index=False)
print("Your submission was successfully saved!")
