In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np              # linear algebra
import pandas as pd             # data processing, CSV file I/O (e.g. pd.read_csv)
                                
import matplotlib.pyplot as plt # data visualization
import seaborn as sns           # data visualization

# Data Preparation

---

## Data Extraction

In [None]:
# All three competition files live in the same input folder.
INPUT_DIR = "/kaggle/input/tabular-playground-series-dec-2021"

train = pd.read_csv(f"{INPUT_DIR}/train.csv")
test = pd.read_csv(f"{INPUT_DIR}/test.csv")
sample_submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")

In [None]:
# (rows, columns) of the submission template.
sample_submission.shape

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Print pandas' concise summary of the training frame (column dtypes, memory usage).
train.info()

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Print pandas' concise summary of the test frame (column dtypes, memory usage).
test.info()

## Data Concatenation

In [None]:
# Stack the training rows on top of the test rows so both can be inspected together.
# Columns present in only one frame are kept (filled with NaN) and column order is
# left unsorted. The original row indices are kept as-is (no re-indexing).
frames = [train, test]
data = pd.concat(frames, sort=False)
data.shape

In [None]:
# Print pandas' concise summary of the combined train + test frame.
data.info()

In [None]:
# Preview the first rows of the combined frame.
data.head()

## Null Check

In [None]:
# Count missing values per column in a single pass. The last column is left out of
# the check (presumably the target, which the test rows cannot have -- TODO confirm).
missing_per_column = data.iloc[:, :-1].isnull().sum()
null_cols = missing_per_column[missing_per_column != 0].index.tolist()
null_cols

In [None]:
TARGET = 'Cover_Type'

# The row identifier is spelled 'Id' in this dataset (the preprocessing step drops it
# under that exact name). The previous filter only excluded 'id', so the identifier
# was counted and plotted as a continuous feature. Both spellings are excluded here.
NON_FEATURE_COLUMNS = {'Id', 'id', TARGET}
FEATURES = [col for col in train.columns if col not in NON_FEATURE_COLUMNS]

# Count distinct values once per column (looked up column by column to avoid copying
# the whole frame) instead of recomputing it for each of the two lists below.
n_unique = {col: data[col].nunique() for col in FEATURES}

# Heuristic split: fewer than 25 distinct values -> categorical, otherwise continuous.
cat_features = [col for col in FEATURES if n_unique[col] < 25]
cont_features = [col for col in FEATURES if n_unique[col] >= 25]

# The combined frame is only needed for the cardinality counts; free its memory.
del data
print(f'Total number of features: {len(FEATURES)}')
print(f'Number of categorical features: {len(cat_features)}')
print(f'Number of continuos features: {len(cont_features)}')

# Share of categorical vs. continuous features.
plt.pie([len(cat_features), len(cont_features)],
        labels=['Categorical', 'Continuos'],
        colors=['#76D7C4', '#F5B7B1'],
        textprops={'fontsize': 13},
        autopct='%1.1f%%')
plt.show()

In [None]:
# Summary statistics for every column except the last one, one row per column,
# ordered from the largest to the smallest standard deviation.
summary_stats = (
    train.iloc[:, :-1]
    .describe()
    .T
    .sort_values(by='std', ascending=False)
)

# Colour gradient over the whole table, plus in-cell bars for "max" and "mean".
(
    summary_stats.style
    .background_gradient(cmap='GnBu')
    .bar(subset=['max'], color='#BB0000')
    .bar(subset=['mean'], color='green')
)

In [None]:
# Class balance of the target: one bar per Cover_Type value in the training data.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=train, x='Cover_Type', palette='icefire', ax=ax);

# Preprocessing

In [None]:
# Columns removed from both train and test. The two soil-type indicators are
# presumably uninformative (e.g. constant) -- TODO confirm against the EDA.
unused_columns = ['Id', 'Soil_Type7', 'Soil_Type15']

# Target vector and predictor matrix (the target column is removed from the predictors).
y = train['Cover_Type']
X = train.drop(columns=[unused_columns[0], 'Cover_Type', *unused_columns[1:]])

# Release the raw training frame before building the test matrix to keep peak memory low.
del train

# Test predictors: same column removals as for the training predictors.
test_df = test.drop(columns=unused_columns)
del test

In [None]:
# Hyper-parameter search with Optuna (the train/validation split is done inside `objective`)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

import optuna

def objective(trial):
    """Optuna objective: fit one XGBoost candidate and return its validation accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handle used to sample the hyper-parameters listed in ``param``.

    Returns
    -------
    float
        Accuracy of the fitted model on the 15% hold-out split.

    Notes
    -----
    Reads the module-level predictors ``X`` and target ``y``.
    """
    # Fixed random_state: every trial is trained and scored on the same split.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.15, random_state=123, shuffle=True
    )

    param = {
        'tree_method': 'gpu_hist',  # train on the GPU to speed up the search
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same distribution.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02, 0.03, 0.05, 0.1, 0.2, 0.3]),
        'n_estimators': 1000,  # upper bound; early stopping below may end training sooner
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [0, 24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    model = XGBClassifier(**param)

    # NOTE(review): the same hold-out set drives early stopping and produces the score
    # returned below, so the reported accuracy is likely optimistic.
    # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
    # estimator constructor rather than on fit() -- confirm against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=20, verbose=False)

    y_pred = model.predict(X_val)

    return accuracy_score(y_val, y_pred)


# Seeded random search so the sequence of sampled configurations is repeatable.
random_sampler = optuna.samplers.RandomSampler(seed=0)
study = optuna.create_study(sampler=random_sampler, direction='maximize')

# Evaluate 10 candidate configurations; `objective` returns the accuracy to maximise.
study.optimize(objective, n_trials=10)

In [None]:
# Hyper-parameters taken from the best Optuna trial.
tuned_keys = [
    'lambda',
    'alpha',
    'colsample_bytree',
    'subsample',
    'learning_rate',
    'max_depth',
    'random_state',
    'min_child_weight',
]
best = study.best_params

# Fixed settings first, then the tuned values on top.
# NOTE(review): the search ran with n_estimators=1000 while the final model is capped
# at 500 -- confirm this difference is intentional.
param = {
    'tree_method': 'gpu_hist',  # train on the GPU to speed up fitting
    'n_estimators': 500,
}
param.update({key: best[key] for key in tuned_keys})

model = XGBClassifier(**param)

# Same split settings as in the tuning objective (15% hold-out, random_state=123).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, shuffle=True, random_state=123, test_size=0.15
)
# The full matrices are no longer needed once the split exists.
del X, y

model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=20, verbose=False)

In [None]:
# validation prediction
# Predict labels for the hold-out rows; the next cell scores them against y_val.
y_pred = model.predict(X_val)

In [None]:
# validation accuracy
from sklearn.metrics import accuracy_score

# Fraction of hold-out rows whose predicted label equals the true label.
val_accuracy = accuracy_score(y_val, y_pred)
print('Accuracy Score : ', val_accuracy)

In [None]:
# test prediction
# NOTE: this rebinds `y_pred`, which previously held the validation predictions;
# from here on it holds the predictions for the test rows.
y_pred = model.predict(test_df)

In [None]:
# submission
# Load the competition's submission template from the same absolute input location
# used when the data was first read, so this cell does not depend on the current
# working directory.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv')

# Overwrite the placeholder labels with the model's test predictions.
# Assumes the template rows are in the same order as the test rows -- TODO confirm.
submission['Cover_Type'] = y_pred

# Written to the working directory, without the index column.
submission.to_csv("submission.csv", index=False)
submission.head()