# Import Statements

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import joblib
import optuna
import sklearn 

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Load Data

In [None]:
# Read the forest cover type training set into a DataFrame.
train = pd.read_csv(filepath_or_buffer='../input/forest-cover-type-prediction/train.csv')
# Preview the first five rows (rendered as the cell output).
train.head(n=5)

In [None]:
# Discard the leading ID column. It is selected by position (first column),
# not by name, so re-running this cell drops one more column each time.
train = train.drop(columns=train.columns[0])
train.head(n=5)

# EDA

In [None]:
# Evaluates to True when at least one cell in the frame is missing;
# the result is shown as the cell output.
train.isna().to_numpy().any()

In [None]:
# summary statistics of the columns (pandas `describe` defaults),
# rendered as the cell output
train.describe()

In [None]:
# Print the (rows, columns) tuple first, then the column index.
# The original author noted 55 columns at this point.
for overview in (train.shape, train.columns):
    print(overview)

# Preprocessing

In [None]:
# Positional split of the frame into model inputs and target:
#   columns 0-9   -> numerical features (standardized in the next cell)
#   columns 10-53 -> features treated as categorical and left unscaled
#   last column   -> class label
X_num = train.iloc[:, :10].to_numpy()
X_cat = train.iloc[:, 10:54].to_numpy()
y = train.iloc[:, -1].to_numpy()

In [None]:
# Standardize the numerical columns. The fitted scaler stays in `scaler`
# because it is written to disk at the end of the notebook.
# NOTE(review): the scaler is fit on every row before the cross-validation
# cells run, so each validation fold has already influenced the scaling.
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

# Report the shape of each piece.
print(f'Categorical Shape: {X_cat.shape}')
print(f'Numerical Shape: {X_num.shape}')
print(f'Label Shape: {y.shape}')

In [None]:
# Build the design matrix: scaled numerical columns first, then the
# categorical columns, joined side by side.
X = np.concatenate((X_num, X_cat), axis=1)
print(X.shape)

# PCA

In [None]:
# Fit PCA with all components and plot how much variance is retained as
# components are added, to help choose a component count.
pca = PCA()
pca.fit(X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
# Kept last so the title call remains the cell's output expression.
plt.title('PCA Number of Components for Cumulative Variance')

In [None]:
# PCA
# Refit PCA keeping only 10 components.
# NOTE(review): `pca` is only inspected in the next cell; the models below
# are trained on X, not on a PCA projection of it.
pca = PCA(n_components = 10)
pca.fit(X)

In [None]:
# Print the principal axes first, then the variance explained by each one.
for fitted_attribute in (pca.components_, pca.explained_variance_):
    print(fitted_attribute)

# Logistic Regression

%%time

# Optuna search over the logistic regression solver.
def objective(trial):
    """Return the mean 10-fold cross-validation score for the trial's solver.

    The trial chooses between the 'saga' and 'lbfgs' solvers; every other
    LogisticRegression setting is fixed.
    """
    # NOTE(review): penalty='none' was deprecated in scikit-learn 1.2 and
    # removed in 1.4 (penalty=None replaces it) - confirm the target version.
    chosen_solver = trial.suggest_categorical('solver', ['saga', 'lbfgs'])
    candidate = LogisticRegression(
        solver=chosen_solver,
        penalty='none',
        max_iter=500,
        random_state=1,
    )
    fold_scores = sklearn.model_selection.cross_val_score(candidate, X, y, cv=10, n_jobs=-1)
    return fold_scores.mean()


# Run the search and report the best trial.
lr_study = optuna.create_study(direction='maximize')
lr_study.optimize(objective, n_trials=3)
lr = lr_study.best_trial
print(f'Accuracy: {lr.value}')
print(f'Best hyperparameters: {lr.params}')

In [None]:
# Final logistic regression, refit on the full training data.
# The solver comes from the Optuna study rather than being hard-coded, so the
# fitted model always matches the winning trial (the other model cells in
# this notebook read their tuned values from their study in the same way).
# NOTE(review): penalty='none' was deprecated in scikit-learn 1.2 and removed
# in 1.4 (penalty=None replaces it) - confirm the target version.
lr_model = LogisticRegression(random_state = 1,
                              penalty = 'none',
                              max_iter = 500,
                              solver = lr_study.best_trial.params['solver'])
# Left as the last expression so the fitted estimator is shown as cell output.
lr_model.fit(X, y)

# Decision Tree

In [None]:
%%time

# Optuna search over decision tree depth and leaf size.
def objective(trial):
    """Return the mean 10-fold cross-validation score for one decision tree.

    The trial proposes `max_depth` (range 2-50) and `min_samples_leaf`
    (range 1-32); the random seed is fixed.
    """
    depth = trial.suggest_int('max_depth', 2, 50)
    leaf_size = trial.suggest_int('min_samples_leaf', 1, 32)
    candidate = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_leaf=leaf_size,
        random_state=1,
    )
    fold_scores = sklearn.model_selection.cross_val_score(candidate, X, y, cv=10, n_jobs=-1)
    return fold_scores.mean()


# Run the search and report the best trial.
dt_study = optuna.create_study(direction='maximize')
dt_study.optimize(objective, n_trials=100)
dt = dt_study.best_trial
print(f'Accuracy: {dt.value}')
print(f'Best hyperparameters: {dt.params}')

In [None]:
# Rebuild the decision tree with the best trial's settings and fit it on
# the full training data.
dt_params = dt_study.best_trial.params
dt_model = DecisionTreeClassifier(
    max_depth=dt_params['max_depth'],
    min_samples_leaf=dt_params['min_samples_leaf'],
    random_state=1,
)
dt_model.fit(X, y)

# Random Forest

In [None]:
%%time

# Optuna search over random forest size, depth and leaf size.
def objective(trial):
    """Return the mean 10-fold cross-validation score for one random forest.

    The trial proposes `n_estimators` (range 100-150), `max_depth`
    (range 20-50) and `min_samples_leaf` (range 1-20); the seed is fixed.
    """
    tree_count = trial.suggest_int('n_estimators', 100, 150)
    depth = trial.suggest_int('max_depth', 20, 50)
    leaf_size = trial.suggest_int('min_samples_leaf', 1, 20)
    candidate = RandomForestClassifier(
        n_estimators=tree_count,
        max_depth=depth,
        min_samples_leaf=leaf_size,
        random_state=1,
    )
    fold_scores = sklearn.model_selection.cross_val_score(candidate, X, y, cv=10, n_jobs=-1)
    return fold_scores.mean()


# Run the search and report the best trial.
rf_study = optuna.create_study(direction='maximize')
rf_study.optimize(objective, n_trials=20)
rf = rf_study.best_trial
print(f'Accuracy: {rf.value}')
print(f'Best hyperparameters: {rf.params}')

In [None]:
# Rebuild the random forest with the best trial's settings and fit it on
# the full training data.
rf_params = rf_study.best_trial.params
rf_model = RandomForestClassifier(
    n_estimators=rf_params['n_estimators'],
    max_depth=rf_params['max_depth'],
    min_samples_leaf=rf_params['min_samples_leaf'],
    random_state=1,
)

rf_model.fit(X, y)

# Extra Trees Classifier

In [None]:
%%time

# Optuna search over Extra Trees depth and leaf size (200 trees per model).
def objective(trial):
    """Return the mean 10-fold cross-validation score for one Extra Trees model.

    The trial proposes `max_depth` (range 30-50) and `min_samples_leaf`
    (range 1-20); the number of trees and the seed are fixed.
    """
    depth = trial.suggest_int('max_depth', 30, 50)
    leaf_size = trial.suggest_int('min_samples_leaf', 1, 20)
    candidate = ExtraTreesClassifier(
        n_estimators=200,
        max_depth=depth,
        min_samples_leaf=leaf_size,
        random_state=0,
    )
    fold_scores = sklearn.model_selection.cross_val_score(candidate, X, y, cv=10, n_jobs=-1)
    return fold_scores.mean()


# Run the search and report the best trial.
tree_study = optuna.create_study(direction='maximize')
tree_study.optimize(objective, n_trials=20)
tree = tree_study.best_trial
print(f'Accuracy: {tree.value}')
print(f'Best hyperparameters: {tree.params}')

In [None]:
# Final Extra Trees model, refit on the full training data with the tuned
# depth and leaf size.
# random_state is 0 so the final model uses the same seed the Optuna
# objective evaluated the hyper-parameters with. It was previously 1, which
# did not match the search; every other model in this notebook uses one seed
# for both tuning and the final fit.
tree_model = ExtraTreesClassifier(random_state = 0,
                                  n_estimators = 200,
                                  max_depth = tree_study.best_trial.params['max_depth'],
                                  min_samples_leaf = tree_study.best_trial.params['min_samples_leaf'])

# Left as the last expression so the fitted estimator is shown as cell output.
tree_model.fit(X, y)

# Gradient Boosting Classifier

In [None]:
%%time

# Optuna search over gradient boosting depth and leaf size.
def objective(trial):
    """Return the mean 10-fold cross-validation score for one boosted model.

    The trial proposes `max_depth` (range 10-20) and `min_samples_leaf`
    (range 15-20); the seed is fixed.
    """
    depth = trial.suggest_int('max_depth', 10, 20)
    leaf_size = trial.suggest_int('min_samples_leaf', 15, 20)
    candidate = GradientBoostingClassifier(
        max_depth=depth,
        min_samples_leaf=leaf_size,
        random_state=0,
    )
    fold_scores = sklearn.model_selection.cross_val_score(candidate, X, y, cv=10, n_jobs=-1)
    return fold_scores.mean()


# Run the search and report the best trial.
gradb_study = optuna.create_study(direction='maximize')
gradb_study.optimize(objective, n_trials=5)
gradb = gradb_study.best_trial
print(f'Accuracy: {gradb.value}')
print(f'Best hyperparameters: {gradb.params}')

In [None]:
%%time
# Rebuild the gradient boosting model with the best trial's settings and
# fit it on the full training data.
gradb_params = gradb_study.best_trial.params
gradb_model = GradientBoostingClassifier(
    max_depth=gradb_params['max_depth'],
    min_samples_leaf=gradb_params['min_samples_leaf'],
    random_state=0,
)

gradb_model.fit(X, y)

# Extreme Gradient Boosting

In [None]:
%%time
# XGBoost classifier with a fixed depth; no hyper-parameter search is run.
# NOTE(review): recent xgboost releases appear to require class labels that
# start at 0, while y is the raw last column of the frame - confirm the
# label range against the installed xgboost version.
xgb_clf = XGBClassifier(max_depth=10, random_state=0)
xgb_clf.fit(X, y)
# fit() hands back the estimator itself, so both names refer to one model.
xgb_model = xgb_clf
# Accuracy on the training data, shown as the cell output.
xgb_model.score(X, y)

# AdaBoost 

In [None]:
%%time

# Optuna search over the number of AdaBoost estimators.
def objective(trial):
    """Return the mean 10-fold cross-validation score for one AdaBoost model.

    The trial proposes `n_estimators` (range 1-15); the seed is fixed.
    """
    estimator_count = trial.suggest_int('n_estimators', 1, 15)
    candidate = AdaBoostClassifier(n_estimators=estimator_count, random_state=0)
    fold_scores = sklearn.model_selection.cross_val_score(candidate, X, y, cv=10, n_jobs=-1)
    return fold_scores.mean()


# Run the search and report the best trial.
adab_study = optuna.create_study(direction='maximize')
adab_study.optimize(objective, n_trials=10)
adab = adab_study.best_trial
print(f'Accuracy: {adab.value}')
print(f'Best hyperparameters: {adab.params}')

In [None]:
# Rebuild AdaBoost with the best trial's estimator count and fit it on the
# full training data.
adab_params = adab_study.best_trial.params
adab_model = AdaBoostClassifier(
    n_estimators=adab_params['n_estimators'],
    random_state=0,
)

adab_model.fit(X, y)

# Model Selection

In [None]:
%%time 
# Hard-voting ensemble over the Extra Trees, random forest, gradient
# boosting and XGBoost models; logistic regression, the single decision
# tree and AdaBoost are not part of the vote.
base_estimators = [
    ('tree', tree_model),
    ('rf', rf_model),
    ('gradb', gradb_model),
    ('xgb', xgb_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='hard')

# Fit the ensemble on the full training data.
ensemble_model.fit(X, y)

# Report each model's accuracy on the data it was trained on. These are
# training scores, not held-out estimates.
training_report = (
    ('Logistic Regression Accuracy', lr_model),
    ('Decision Tree Accuracy', dt_model),
    ('Random Forest Accuracy', rf_model),
    ('Extra Trees Accuracy', tree_model),
    ('Gradient Boosting Accuracy', gradb_model),
    ('Extra Gradient Boosting Accuracy', xgb_model),
    ('AdaBoost Accuracy', adab_model),
    ('Ensemble Accuracy:', ensemble_model),
)
for label, fitted_model in training_report:
    print(label, fitted_model.score(X, y))

# Save Preprocessor and Models

In [None]:
# Write the fitted StandardScaler to disk, presumably so the same scaling
# can be reapplied to new data later.
joblib.dump(value=scaler, filename='forest_cover_scaler.joblib')

In [None]:
# Write the fitted models to disk, one file per model. The logistic
# regression and single decision tree models are not saved.
model_files = {
    'rf_model_2.joblib': rf_model,
    'tree_model_2.joblib': tree_model,
    'gradb_model_2.joblib': gradb_model,
    'xgb_model_2.joblib': xgb_model,
    'adab_model_2.joblib': adab_model,
    'ensemble_model_2.joblib': ensemble_model,
}
for file_name, fitted_model in model_files.items():
    joblib.dump(fitted_model, file_name)
print('Model written to file.')