In [None]:
import numpy as np
import pandas as pd 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import train_test_split

import os
# Collect the full path of every file under the Kaggle input directory,
# then print them one per line (same order os.walk yields them).
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# Read the competition's training and test tables into DataFrames.
train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv')
test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv')

In [None]:
# Distinct target labels present in the training table, in order of first appearance.
pd.unique(train['Cover_Type'])

In [None]:
# Optional exploration, currently disabled: per-class median of every column.
# train.groupby(by='Cover_Type').median()

In [None]:
# Target column, and the feature matrix with the target and the 'Id' column removed
# ('Id' is used later only as the index of the submission file).
Y = train['Cover_Type']
X = train.drop(columns=['Id', 'Cover_Type'])

# Hold out a validation split; no test_size is given, so scikit-learn's default
# proportion applies, and the fixed seed keeps the split reproducible.
(x_train, x_validate,
 y_train, y_validate) = train_test_split(X, Y, random_state=42)

In [None]:
# KNN approach: 11 neighbours, n_jobs=-1 to use every available core.
KNN = KNeighborsClassifier(n_neighbors=11, n_jobs=-1)
KNN.fit(x_train, y_train)
print(KNN.score(x_validate, y_validate))  # mean accuracy on the validation split

# Refit on the complete training table before predicting the competition test set.
KNN.fit(X, Y)
test_features = test.drop(columns=['Id'])
predictions = KNN.predict(test_features)

# One 'Cover_Type' column indexed by the test 'Id'.
# NOTE: the random-forest cell further down writes to the same 'submission.csv' path.
submission = pd.DataFrame({'Id': test['Id'], 'Cover_Type': predictions}).set_index('Id')
submission.to_csv('submission.csv')

In [None]:
def _forest_validation_accuracy(**forest_params):
    """Fit a random forest on the training split and score it on the validation split.

    Shared implementation for the single-hyperparameter probes below, which
    previously each repeated the same fit / predict / score sequence.

    Parameters
    ----------
    **forest_params
        Keyword arguments forwarded to ``RandomForestClassifier`` in addition
        to the fixed ``random_state=42``.

    Returns
    -------
    The value of ``accuracy_score(y_validate, predictions)``.

    Notes
    -----
    Reads the notebook-level ``x_train``, ``y_train``, ``x_validate`` and
    ``y_validate`` produced by the train/validation split cell.
    """
    model = RandomForestClassifier(random_state=42, **forest_params)
    model.fit(x_train, y_train)
    preds = model.predict(x_validate)
    return accuracy_score(y_validate, preds)


def get_maxdepth(maxdepth):
    """Return validation accuracy of a forest built with ``max_depth=maxdepth``.

    All other hyperparameters keep their library defaults.
    """
    return _forest_validation_accuracy(max_depth=maxdepth)


def get_maxleafnodes(maxleafnodes):
    """Return validation accuracy of a forest built with ``max_leaf_nodes=maxleafnodes``.

    All other hyperparameters keep their library defaults.
    """
    return _forest_validation_accuracy(max_leaf_nodes=maxleafnodes)


def get_nestimator(nestimator):
    """Return validation accuracy of a forest built with ``n_estimators=nestimator``.

    All other hyperparameters keep their library defaults.
    """
    return _forest_validation_accuracy(n_estimators=nestimator)

In [None]:
def _pick_best(label, candidates, score_fn):
    """Score every candidate, print the results and return the winner.

    Replaces three copies of the same loop / argmax / print sequence.

    Parameters
    ----------
    label : str
        Text printed in front of the winning candidate.
    candidates : list
        Hyperparameter values to try, in the order they should be evaluated.
    score_fn : callable
        Called with one candidate; returns its validation accuracy.

    Returns
    -------
    (best, scores)
        ``best`` is the candidate with the highest score (the earliest one on
        ties); ``scores`` lists the score of every candidate in input order.
    """
    scores = []
    for candidate in candidates:
        acc = score_fn(candidate)
        print(candidate, ':', acc)
        scores.append(acc)
    # list.index returns the first match, so ties resolve to the earliest candidate.
    best = candidates[scores.index(max(scores))]
    print(label, best)
    print("-" * 50)
    return best, scores


# Each sweep varies one hyperparameter while the others keep their defaults.
maxdepth_candidates = [38, 39, 40, 41, 42]
maxdepth, maxdepth_res = _pick_best('max depth:', maxdepth_candidates, get_maxdepth)

maxleaf_candidates = [2020, 2030, 2040, 2050]
maxleaf, maxleaf_res = _pick_best('max leaf:', maxleaf_candidates, get_maxleafnodes)

nestimator_candidates = [172, 173, 174, 175, 176, 177, 178]
nestimator, nestimator_res = _pick_best('n estimator:', nestimator_candidates, get_nestimator)

In [None]:
def get_maxleafnodes_nestimator(nestimator, maxleafnodes):
    """Validation accuracy of a forest with the given tree count and leaf cap.

    Builds ``RandomForestClassifier(n_estimators=nestimator,
    max_leaf_nodes=maxleafnodes, random_state=42)``, fits it on the
    notebook-level ``x_train`` / ``y_train`` and returns
    ``accuracy_score`` of its predictions for ``x_validate`` against
    ``y_validate``.
    """
    forest = RandomForestClassifier(
        n_estimators=nestimator,
        max_leaf_nodes=maxleafnodes,
        random_state=42,
    )
    forest.fit(x_train, y_train)
    return accuracy_score(y_validate, forest.predict(x_validate))

# Coarse sweep of max_leaf_nodes at the n_estimators chosen earlier.
# Author's note: a run of this list picked 2060; the commented-out list below
# is the finer sweep around that value.
maxleaf_candidates = [2000, 2010, 2030, 2040, 2050, 2060, 2070]
# maxleaf_candidates = [2052, 2054, 2056, 2058, 2060, 2062, 2064, 2066, 2068]
maxleaf_res = []
for n_leaves in maxleaf_candidates:
    score = get_maxleafnodes_nestimator(nestimator, n_leaves)
    print(n_leaves, ':', score)
    maxleaf_res.append(score)

# max() with a key returns the first maximal position, so ties go to the earliest candidate.
best_pos = max(range(len(maxleaf_res)), key=maxleaf_res.__getitem__)
maxleaf = maxleaf_candidates[best_pos]
print('max leaf:', maxleaf)
print("-" * 50)

In [None]:
def get_maxdepth_nestimator_maxleaf(nestimator, maxleafnodes, maxdepth):
    """Validation accuracy of a forest with tree count, leaf cap and depth cap all set.

    The three arguments are passed to ``RandomForestClassifier`` as
    ``n_estimators``, ``max_leaf_nodes`` and ``max_depth`` (with
    ``random_state=42``). The model is fitted on the notebook-level
    ``x_train`` / ``y_train`` and scored with ``accuracy_score`` on
    ``x_validate`` / ``y_validate``.
    """
    settings = {
        'n_estimators': nestimator,
        'max_leaf_nodes': maxleafnodes,
        'max_depth': maxdepth,
        'random_state': 42,
    }
    fitted = RandomForestClassifier(**settings).fit(x_train, y_train)
    predicted = fitted.predict(x_validate)
    return accuracy_score(y_validate, predicted)

# Earlier coarse sweep, kept for reference (the trailing note suggests it selected 30):
# maxdepth_candidates = [30, 35, 40, 45, 50] #30
# Finer sweep: every depth from 25 through 34, at the chosen n_estimators and max_leaf_nodes.
maxdepth_candidates = list(range(25, 35))
maxdepth_res = []
for depth in maxdepth_candidates:
    maxdepth_res.append(get_maxdepth_nestimator_maxleaf(nestimator, maxleaf, depth))
    print(depth, ':', maxdepth_res[-1])

# Keep the earliest depth that reaches the top score.
top_score = max(maxdepth_res)
maxdepth = maxdepth_candidates[maxdepth_res.index(top_score)]
print('max depth:', maxdepth)
print('-' * 50)

In [None]:
# Compare a default forest with the tuned one on the same validation split.
# The tuned configuration runs last, so RFC and preds end up holding the tuned model's state.
for label, params in (
    ('without param tuning:', {}),
    ('with param tuning:', {'n_estimators': nestimator,
                            'max_leaf_nodes': maxleaf,
                            'max_depth': maxdepth}),
):
    RFC = RandomForestClassifier(random_state=42, **params)
    RFC.fit(x_train, y_train)
    preds = RFC.predict(x_validate)
    print(label, accuracy_score(y_validate, preds))

In [None]:
# Final model: the tuned forest refitted on the complete training table.
RFC = RandomForestClassifier(
    n_estimators=nestimator,
    max_leaf_nodes=maxleaf,
    max_depth=maxdepth,
    random_state=42,
)
RFC.fit(X, Y)

# Predict on every test column except 'Id', then write one 'Cover_Type' column indexed by 'Id'.
predictions = RFC.predict(test.drop(columns=['Id']))
submission = test[['Id']].assign(Cover_Type=predictions).set_index('Id')
submission.to_csv('submission.csv')