In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Dataset: covtype.csv

Source: Remote Sensing and GIS Program, Department of Forest Sciences, College of Natural Resources, Colorado State University

Description: Predicting forest cover type from cartographic variables only (no remotely sensed data). The actual forest cover type for a given observation (30 x 30 meter cell) was determined from US Forest Service (USFS) Region 2 Resource Information System (RIS) data. Independent variables were derived from data originally obtained from US Geological Survey (USGS) and USFS data. Data is in raw form (not scaled) and contains binary (0 or 1) columns of data for qualitative independent variables (wilderness areas and soil types).

This study area includes four wilderness areas located in the Roosevelt National Forest of northern Colorado. These areas represent forests with minimal human-caused disturbances, so that existing forest cover types are more a result of ecological processes rather than forest management practices.

Variables/Columns

- Elevation: Elevation in meters
- Aspect: Aspect in degrees azimuth
- Slope: Slope in degrees
- Horizontal_Distance_To_Hydrology: Horizontal distance to the nearest surface water feature
- Vertical_Distance_To_Hydrology: Vertical distance to the nearest surface water feature
- Horizontal_Distance_To_Roadways: Horizontal distance to the nearest roadway
- Hillshade_9am: Hillshade index at 9am, summer solstice
- Hillshade_Noon: Hillshade index at noon, summer solstice
- Hillshade_3pm: Hillshade index at 3pm, summer solstice
- Horizontal_Distance_To_Fire_Points: Horizontal distance to the nearest wildfire ignition point
- Wilderness_Area: 0 (absence) or 1 (presence)
- Cover_Type: (2 types) Forest Cover Type designation
    - 1: Spruce/Fir
    - 2: Lodgepole Pine

In [2]:
# Load the forest cover dataset from the course-hosted CSV
df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_2/datasets/covtype.csv')

# Separate the 'cover' label column (y) from the remaining feature columns (X)
y = df['cover']
X = df.drop(columns='cover')
# Human-readable class names, in label order (1, 2) per the dataset description
target_names = ["Spruce/Fir", "Lodgepole Pine"]

In [3]:
# Hold out a test set (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [4]:
# Import the Extremely Randomized Trees (ExtraTrees) classifier
from sklearn.ensemble import ExtraTreesClassifier

In [5]:
# Build the seeded ExtraTreesClassifier, then fit it on the scaled training data
clf = ExtraTreesClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)

# Report the model's score on the training and test splits, one per line
print(
    f'Training Score: {clf.score(X_train_scaled, y_train)}',
    f'Testing Score: {clf.score(X_test_scaled, y_test)}',
    sep='\n',
)

Training Score: 1.0
Testing Score: 0.9012036513450198


In [6]:
# Import Gradient Boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
# Build the seeded GradientBoostingClassifier, then fit it on the scaled training data
clf = GradientBoostingClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)

# Report the model's score on the training and test splits, one per line
print(
    f'Training Score: {clf.score(X_train_scaled, y_train)}',
    f'Testing Score: {clf.score(X_test_scaled, y_test)}',
    sep='\n',
)

Training Score: 0.7929771650150798
Testing Score: 0.7919056466596656


In [8]:
# Import an Adaptive Boosting classifier
from sklearn.ensemble import AdaBoostClassifier

In [9]:
# Build the seeded AdaBoostClassifier, then fit it on the scaled training data
clf = AdaBoostClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)

# Report the model's score on the training and test splits, one per line
print(
    f'Training Score: {clf.score(X_train_scaled, y_train)}',
    f'Testing Score: {clf.score(X_test_scaled, y_test)}',
    sep='\n',
)



Training Score: 0.7708423093494183
Testing Score: 0.7711446805073108


In [13]:
from xgboost import XGBClassifier

In [16]:
# Create the XGBoost classifier, seeded like the other estimators in this
# notebook (all of which use random_state=1) so the comparison is reproducible
xgb = XGBClassifier(random_state=1)

In [23]:
# XGBClassifier expects class labels starting at 0, while this dataset uses 1 and 2.
# Recompute the 0-based labels from the untouched `y` (looked up by the split's
# index) instead of shifting `y_train` in place: re-running this cell then gives
# the same result rather than subtracting 1 a second time.
y_train = y.loc[y_train.index] - 1

In [24]:
# Fit the XGBoost model on the scaled training features; y_train must already
# hold the 0-based labels produced by the preceding cell
xgb.fit(X=X_train_scaled, y=y_train)

In [26]:
# Give the test labels the same 0-based encoding used to train the XGBoost model.
# Recompute from the untouched `y` (looked up by the split's index) so that
# re-running this cell does not subtract 1 a second time.
y_test = y.loc[y_test.index] - 1

In [27]:
# Evaluate the XGBoost model: report its score on the training and test splits
print(
    f'Training Score: {xgb.score(X_train_scaled, y_train)}',
    f'Testing Score: {xgb.score(X_test_scaled, y_test)}',
    sep='\n',
)

Training Score: 0.9125646273158121
Testing Score: 0.8611357944906697
