# Import dataset from Kaggle

In [1]:
! pip install -q kaggle

In [2]:
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions list

Saving kaggle.json to kaggle.json
ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
contradictory-my-dear-watson                   2030-07-01 23:59:00  Getting Started     Prizes         88           False  
gan-getting-started                            2030-07-01 23:59:00  Getting Started     Prizes        102           False  
store-sales-time-series-forecasting            2030-06-30 23:59:00  Getting Started  Knowledge        912           False  
tpu-getting-started                            2030-06-03 23:59:00  Getting Started  Knowledge        168           False  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       1870           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge    

In [3]:
! kaggle competitions download -c 'tabular-playground-series-dec-2021'

Downloading sample_submission.csv.zip to /content
  0% 0.00/2.11M [00:00<?, ?B/s]
100% 2.11M/2.11M [00:00<00:00, 51.7MB/s]
Downloading test.csv.zip to /content
 97% 24.0M/24.7M [00:00<00:00, 95.8MB/s]
100% 24.7M/24.7M [00:00<00:00, 120MB/s] 
Downloading train.csv.zip to /content
 90% 89.0M/98.9M [00:00<00:00, 104MB/s] 
100% 98.9M/98.9M [00:00<00:00, 156MB/s]


In [4]:
! unzip train.csv.zip
! unzip test.csv.zip
! unzip sample_submission.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   


#Very simple EDA

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
train.isna().sum()

Id                                    0
Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0


In [4]:
train[train.duplicated(keep=False)]

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type


In [5]:
train["Aspect"][train["Aspect"] < 0] += 360
train["Aspect"][train["Aspect"] > 359] -= 360

test["Aspect"][test["Aspect"] < 0] += 360
test["Aspect"][test["Aspect"] > 359] -= 360

train.loc[train["Hillshade_9am"] < 0, "Hillshade_9am"] = 0
test.loc[test["Hillshade_9am"] < 0, "Hillshade_9am"] = 0

train.loc[train["Hillshade_Noon"] < 0, "Hillshade_Noon"] = 0
test.loc[test["Hillshade_Noon"] < 0, "Hillshade_Noon"] = 0

train.loc[train["Hillshade_3pm"] < 0, "Hillshade_3pm"] = 0
test.loc[test["Hillshade_3pm"] < 0, "Hillshade_3pm"] = 0

train.loc[train["Hillshade_9am"] > 255, "Hillshade_9am"] = 255
test.loc[test["Hillshade_9am"] > 255, "Hillshade_9am"] = 255

train.loc[train["Hillshade_Noon"] > 255, "Hillshade_Noon"] = 255
test.loc[test["Hillshade_Noon"] > 255, "Hillshade_Noon"] = 255

train.loc[train["Hillshade_3pm"] > 255, "Hillshade_3pm"] = 255
test.loc[test["Hillshade_3pm"] > 255, "Hillshade_3pm"] = 255

In [6]:
train['Cover_Type'].value_counts()

2    2262087
1    1468136
3     195712
7      62261
6      11426
4        377
5          1
Name: Cover_Type, dtype: int64

In [7]:
train.drop(train[train['Cover_Type'] == 5].index, inplace = True)

In [8]:
train.drop(train[train['Cover_Type'] == 4].index, inplace = True)

In [9]:
train['Cover_Type'].value_counts()

2    2262087
1    1468136
3     195712
7      62261
6      11426
Name: Cover_Type, dtype: int64

#Model (XGBoost)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [11]:
# actual train/val dataset
x = train.iloc[:, :-1]
y = train["Cover_Type"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

y_train_xgb = y_train-1
y_test_xgb = y_test - 1
y_train_xgb.value_counts()

1    1809807
0    1174210
2     156847
6      49688
5       9145
Name: Cover_Type, dtype: int64

In [12]:
# a small dataset for hyperparameter search
train_s = train.groupby("Cover_Type").apply(lambda x: x.sample(min(len(x), 10000)))

x = train_s.iloc[:, :-1]
y = train_s["Cover_Type"]
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(x, y, test_size=0.2, random_state=42)

y_train_s_xgb = y_train_s - 1
y_test_s_xgb = y_test_s - 1
y_train_s_xgb.value_counts()

6    8050
1    8007
5    8002
2    7984
0    7957
Name: Cover_Type, dtype: int64

In [12]:
# Gridsearch for n_estimator
tree_method = "gpu_hist"
model = XGBClassifier(
      objective='multi:softmax',
      n_estimators=100,
      seed=42,
      tree_method = tree_method,
      use_label_encoder=False,
    )
parameter = {'n_estimators': [10,100,200,300,500]}
search = GridSearchCV(model, parameter, n_jobs=-1, return_train_score = True)
search.fit(X_train_s, y_train_s_xgb)
search.score(X_test_s, y_test_s_xgb)

print(search.best_score_)
print(search.best_estimator_)
results = pd.DataFrame(search.cv_results_)
print(results.T)

0.9158086402594607
XGBClassifier(n_estimators=500, objective='multi:softprob', seed=42,
              tree_method='gpu_hist', use_label_encoder=False)
                                       0  ...                      4
mean_fit_time                    1.60445  ...                53.7362
std_fit_time                     0.25929  ...                7.48856
mean_score_time                0.0273662  ...              0.0404598
std_score_time                 0.0168054  ...             0.00370388
param_n_estimators                    10  ...                    500
params              {'n_estimators': 10}  ...  {'n_estimators': 500}
split0_test_score               0.810693  ...               0.911798
split1_test_score               0.804963  ...               0.915633
split2_test_score               0.808189  ...               0.917866
split3_test_score               0.812903  ...               0.914392
split4_test_score                0.81005  ...               0.919355
mean_test_score      

In [13]:
model = XGBClassifier(
      objective='multi:softmax',
      n_estimators=500,
      seed=42,
      tree_method = tree_method,
      use_label_encoder=False,
    )

parameter = {
    'max_depth': [4,6,8,10],
    'subsample':[0.5, 0.7, 0.9],
    'colsample_bytree': [0.5, 0.7, 0.9],
    }
search = RandomizedSearchCV(model, parameter, refit = "AUC", cv = 5, n_jobs=-1)
search.fit(X_train_s, y_train_s_xgb)
search.score(X_test_s, y_test_s_xgb)

print(search.best_score_)
print(search.best_estimator_)
results = pd.DataFrame(search.cv_results_)
print(results.T)

0.931416152827248
XGBClassifier(colsample_bytree=0.7, max_depth=10, n_estimators=500,
              objective='multi:softprob', seed=42, subsample=0.7,
              tree_method='gpu_hist', use_label_encoder=False)
                                                                        0  ...                                                  9
mean_fit_time                                                       63.35  ...                                            141.523
std_fit_time                                                     0.407769  ...                                            33.0218
mean_score_time                                                 0.0478336  ...                                          0.0782008
std_score_time                                                 0.00257636  ...                                         0.00289013
param_subsample                                                       0.5  ...                                                0.5
param

In [13]:
# train on actual dataset
model = XGBClassifier(
      objective='multi:softmax',
      n_estimators = 500,
      seed = 42,
      tree_method = "gpu_hist",
      use_label_encoder=False,
      subsample = 0.7,
      max_depth = 10,
      colsample_bytree = 0.7,
    )
model.fit(X_train,y_train_xgb)
model.save_model('model.json')

print(model.score(X_train,y_train_xgb))
print(model.score(X_test, y_test_xgb))

0.9816188845381297
0.9613438759883739


In [14]:
predictions = model.predict(test) + 1
submission = pd.DataFrame(test["Id"])
submission["Cover_Type"] = predictions
submission.head()

Unnamed: 0,Id,Cover_Type
0,4000000,2
1,4000001,2
2,4000002,2
3,4000003,2
4,4000004,2


In [15]:
submission.to_csv('submission.csv', index=False)
submission["Cover_Type"].value_counts()

2    513748
1    388561
3     79556
7     13475
6      4660
Name: Cover_Type, dtype: int64

In [19]:
import json
config = model.get_xgb_params()
with open("config.json", "w") as outfile:
    json.dump(config, outfile)
print(config)

{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 1, 'missing': None, 'n_estimators': 500, 'nthread': 1, 'objective': 'multi:softprob', 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': 42, 'subsample': 0.7, 'verbosity': 1, 'tree_method': 'gpu_hist', 'use_label_encoder': False}
