### TPS Dec 2021 - Baseline Model

- For modeling, I am using the 5-fold [data](https://www.kaggle.com/nitishraj/tps-dec21-5-folds) created by the [Tps-dec-2021-5-folds](https://www.kaggle.com/nitishraj/tps-dec-2021-5-folds) notebook.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from scipy.stats import mode

In [None]:
# Load the fold-annotated training set, the competition test set and the
# sample submission, in that order.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tps-dec21-5-folds/train_folds.csv",
        "../input/tabular-playground-series-dec-2021/test.csv",
        "../input/tabular-playground-series-dec-2021/sample_submission.csv",
    )
)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of *df* to the narrowest dtype that fits.

    Every column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is inspected; its min and max decide the target dtype. Integer
    columns are tried against int8, int16, int32, int64 (in that order);
    all other listed columns are tried against float16, then float32, and
    fall back to float64. Columns of any other dtype are left untouched.

    NOTE: casting to float16 keeps the value *range* but reduces precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved. When false, print nothing.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Comparisons against NaN are False, so an all-NaN (or empty)
            # column ends up as float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Avoid a 0/0 when the frame reported no memory to begin with.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Downcast numeric columns of both frames (train first, then test).
df_train, df_test = [reduce_mem_usage(frame) for frame in (df_train, df_test)]

In [None]:
# Feature columns: everything except the row id, the target and the fold label.
useful_features = [c for c in df_train.columns if c not in ("Id", "Cover_Type", "kfold")]
#cont_cols = [col for col in useful_features if 'Soil_Type' not in col]

# Rows with Cover_Type == 5 are excluded from training and validation.
# NOTE(review): presumably because that class is too rare to appear in every
# fold - confirm against the fold-creation notebook.
df_train = df_train[df_train.Cover_Type!=5]

df_test = df_test[useful_features]

final_test_predictions = []   # one array of test-set class predictions per fold
final_valid_predictions = {}  # Id -> out-of-fold predicted Cover_Type

scores = []                   # validation accuracy per fold

# Hyper-parameters are identical for every fold, so build the dict once.
params = {'learning_rate': 0.03811822061503613,
          'reg_lambda': 17.136779266696237,
          'reg_alpha': 1.196532346754796e-05,
          'subsample': 0.16103284130404089,
          'colsample_bytree': 0.9165052246716364,
          'max_depth': 10,
          'grow_policy': 'depthwise'}

for fold in range(5):
    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    # Store IDs of validation Dataset
    valid_ids = xvalid.Id.values.tolist()

    # Label-encode the target using only the classes seen in this fold's
    # training split; predictions are mapped back with inverse_transform.
    le = preprocessing.LabelEncoder().fit(xtrain.Cover_Type)

    ytrain = le.transform(xtrain.Cover_Type)
    yvalid = le.transform(xvalid.Cover_Type)

    # Keep the un-encoded validation labels for scoring
    true_valid = xvalid.Cover_Type

    n_class = len(xtrain.Cover_Type.unique())

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBClassifier(
        random_state=42,
        tree_method='gpu_hist',
        objective='multi:softmax',
        sampling_method='gradient_based',
        n_estimators=10000,
        n_jobs=-1,
        num_class=n_class,
        use_label_encoder=False,
        eval_metric='mlogloss',
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = le.inverse_transform(model.predict(xvalid))

    # Predict on df_test directly; the original per-fold copy was unnecessary.
    test_preds = le.inverse_transform(model.predict(df_test))

    final_test_predictions.append(test_preds)

    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    acc_scr = accuracy_score(true_valid, preds_valid)

    print(fold, acc_scr)

    scores.append(acc_scr)

# Summarise cross-validation performance across the folds.
print("mean accuracy", np.mean(scores), "std", np.std(scores))

#final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
#final_valid_predictions.columns = ["Id", "Cover_Type"]

# Majority vote across folds: one column per fold, mode taken along each row.
# scipy.stats.mode returns an (n, 1) or (n,) array depending on the SciPy
# version / keepdims default, so flatten before assigning to the column.
stacked_test_predictions = np.column_stack(final_test_predictions)
df_submission["Cover_Type"] = np.ravel(mode(stacked_test_predictions, axis=1)[0])
df_submission.columns = ["Id", "Cover_Type"]
df_submission.to_csv("submission.csv", index=False)