<div style="background-color:skyblue;">
    <h1><center>Tabular Playground Series - DEC 2021</center></h1>
</div>

![](https://storage.googleapis.com/kaggle-competitions/kaggle/26480/logos/header.png?t=2021-04-09-00-57-05)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [None]:
# Load the competition's training CSV and preview its first five rows.
train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
train.head(n=5)

In [None]:
# Load the competition's test CSV and preview its first five rows.
test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
test.head(n=5)

In [None]:
# Load the submission template; it is filled with predictions at the end of the notebook.
sample_submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
sample_submission.head(n=5)

* Reducing the memory footprint of the train and test frames by downcasting their numeric columns:

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns are cast to the narrowest of int8 / int16 / int32 / int64
    whose range contains the column's min and max (limits inclusive). Float
    columns are cast to float32 when their min and max lie inside the float32
    range, otherwise to float64. Columns whose dtype is not one of the listed
    numeric dtypes are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        Print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in (not a copy).
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate integer widths, narrowest first.
    int_types = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                for int_type in int_types:
                    limits = np.iinfo(int_type)
                    # Inclusive bounds: a column spanning exactly [-128, 127]
                    # fits in int8 and must not be pushed to int16.
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = df[col].astype(int_type)
                        break
            else:
                # Only the value range is checked here; float32 keeps fewer
                # significant digits than float64.
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# reduce_mem_usage reassigns columns on the frame it receives and returns that
# same frame, so train_new / test_new are the same objects as train / test.
train_new = reduce_mem_usage(train, verbose=True)
test_new = reduce_mem_usage(test, verbose=True)

In [None]:
# Report the row/column counts of each loaded frame.
for label, frame in (
    ('train', train_new),
    ('test', test_new),
    ('sample_submission', sample_submission),
):
    print(f'{label} set have {frame.shape[0]} rows and {frame.shape[1]} columns.')

In [None]:
# Count missing entries per column of the training frame.
train_new.isna().sum()

In [None]:
# Number of distinct (non-missing) values in each column; a count of 1 marks a constant column.
train_new.nunique(axis=0, dropna=True)

In [None]:
# Soil_Type7 and Soil_Type15 contain only one value; they are removed together
# with Id from both frames. The drop is done in place on the existing objects.
for frame in (train_new, test_new):
    frame.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'], inplace=True)

In [None]:
# Target distribution: rows per Cover_Type, printed and drawn as a bar chart.
target_values = train_new['Cover_Type'].value_counts()

print("Cover_Type unique values:")
print(target_values)

plt.figure(figsize=(14, 5))
sns.barplot(
    x=target_values.index,
    y=target_values.values,
    linewidth=1.5,
    errcolor=".2",
    edgecolor=".2",
)
plt.title("Cover_Type unique values", fontdict={'fontsize': 20})
plt.show()

It looks like target class `5` has only a single sample.

In [None]:
# Remove (in place) every training row whose target is Cover_Type 5.
class5_rows = train_new.loc[train_new['Cover_Type'] == 5].index
train_new.drop(index=class5_rows, inplace=True)

In [None]:
# Split off the target: pop() returns the Cover_Type column and removes it
# from train_new in place, leaving only feature columns behind.
y = train_new.pop('Cover_Type')

In [None]:
def fit_xgb(trial, x_train, y_train, x_test, y_test):
    """Fit one XGBoost classifier with hyper-parameters sampled from ``trial``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float`` (presumably an
        ``optuna.trial.Trial``; the study set-up is not shown in this notebook).
    x_train, y_train
        Features and labels the model is fitted on.
    x_test, y_test
        Hold-out features and labels, used both as the early-stopping
        evaluation set and for the reported validation accuracy.

    Returns
    -------
    model : XGBClassifier
        The fitted classifier.
    log : dict
        ``{"train accuracy": float, "valid accuracy": float}``.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 10), # Extremely prone to overfitting!
        'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400), # Extremely prone to overfitting!
        'eta': trial.suggest_float('eta', 0.007, 0.013), # Most important parameter.
        # suggest_float(step=...) / suggest_float(log=True) replace the
        # deprecated suggest_discrete_uniform / suggest_loguniform helpers.
        'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True), # I've had trouble with LB score until tuning this.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True), # L2 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True), # L1 regularization
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True)
    }

    model = XGBClassifier(**params, tree_method='gpu_hist', random_state=2021)
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=150, verbose=False)

    # predict() returns class labels, so they are scored as-is. (Clipping them
    # to a 0.1 floor, as a regression metric might need, would turn the labels
    # into floats and corrupt any 0 label.)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    log = {
        "train accuracy": accuracy_score(y_train, y_train_pred),
        "valid accuracy": accuracy_score(y_test, y_test_pred)
    }

    return model, log

In [None]:
from sklearn.model_selection import train_test_split
def objective(trial):
    """Score one hyper-parameter trial by hold-out accuracy.

    Splits ``train_new`` / ``y`` 70/30, trains a model through ``fit_xgb`` and
    returns the accuracy on the 30% hold-out part.

    Parameters
    ----------
    trial
        Passed straight through to ``fit_xgb``.

    Returns
    -------
    float
        The ``'valid accuracy'`` entry of the log returned by ``fit_xgb``.
    """
    # A fixed seed gives every trial the same split, so score differences come
    # from the sampled hyper-parameters rather than from split noise.
    # stratify=y keeps the class proportions equal in both parts.
    x_train, x_test, y_train, y_test = train_test_split(
        train_new, y, test_size=0.30, random_state=2021, stratify=y
    )
    _, log = fit_xgb(trial, x_train, y_train, x_test, y_test)

    return log['valid accuracy']

In [None]:
# Fixed hyper-parameters for the final cross-validated model. The keys mirror
# the search space of fit_xgb; presumably these are the best values found by
# the tuning run (that run is not shown in this notebook).
xgb_params = dict(
    max_depth=6,
    n_estimators=2000,
    eta=0.0116303297452901,
    subsample=0.30000000000000004,
    colsample_bytree=0.6000000000000001,
    colsample_bylevel=0.5,
    min_child_weight=0.0007982395954513741,
    reg_lambda=0.015853494080830713,
    reg_alpha=0.014553261848573449,
    gamma=5.036747241342804,
)

In [None]:
# 5-fold cross-validation. Each fold's model scores its own validation part and
# contributes class probabilities for the test set; the final test label is the
# class with the highest summed probability (soft voting). Averaging the
# predicted *labels* themselves would be wrong: folds voting 1,1,1,1,7 would
# average to 2.2, i.e. a class that no fold predicted.
#
# NOTE(review): y is passed to XGBoost as-is (labels are not re-encoded to
# 0..n-1); newer xgboost releases reject non-contiguous labels -- confirm
# against the installed version.
folds = KFold(n_splits = 5, random_state = 2021,shuffle=True)

proba_sum = None   # (n_test_rows, n_classes) running sum, allocated on the first fold
classes = None     # label belonging to each probability column

# train_new / test_new are the cleaned feature frames (target and dropped
# columns removed); they are named explicitly instead of relying on
# train / test happening to be the same objects.
for fold, (trn_idx, val_idx) in enumerate(folds.split(train_new)):
    print(f"Fold: {fold}")
    # X_test / y_test are this fold's validation part, not the competition test set.
    X_train, X_test = train_new.iloc[trn_idx], train_new.iloc[val_idx]
    y_train, y_test = y.iloc[trn_idx], y.iloc[val_idx]

    model = XGBClassifier(tree_method='gpu_hist', **xgb_params)

    model.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              early_stopping_rounds=400,
              verbose=False)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print(f" accuracy_score: {acc}")
    print("-"*50)

    fold_proba = model.predict_proba(test_new)
    if proba_sum is None:
        classes = np.asarray(model.classes_)
        proba_sum = np.zeros_like(fold_proba, dtype=np.float64)
    elif not np.array_equal(classes, model.classes_):
        # Probability columns can only be summed when every fold saw the same classes.
        raise ValueError(
            f"Fold {fold} was trained on classes {list(model.classes_)}, "
            f"expected {list(classes)}"
        )
    proba_sum += fold_proba

# Map the winning probability column back to its class label.
predictions = classes[proba_sum.argmax(axis=1)]

In [None]:
# Feature-importance chart. `model` is whatever the cross-validation loop
# assigned last, i.e. the classifier trained on the final fold only.
from xgboost import plot_importance
fig, ax = plt.subplots(figsize=(20, 12))
plot_importance(model, ax=ax, xlabel=None)
plt.show()

In [None]:
# Store the predictions as integers in the submission template and write it
# to cat.csv without the index column.
sample_submission['Cover_Type'] = predictions.astype(int)
sample_submission.to_csv('cat.csv', index=False)

In [None]:
# Show the filled-in submission frame as the cell output for a final visual check.
sample_submission