In [None]:
import numpy as np
import pandas as pd 
#import datatable as dt

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are converted to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds inclusive).
    * Float columns are converted to ``float16`` or ``float32`` when the
      column's min and max fit. Only the value *range* is checked, so
      precision can be lost by the narrower float type.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched, as are numeric columns that are empty or all-NaN.

    A column is never converted to a type that is as wide as or wider than
    its current one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first, keyed by numpy dtype kind.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Skip anything that is not a plain numpy int/uint/float column.
        # This keeps bool columns from being turned into floats and makes a
        # second call on an already-optimised frame (category columns) safe.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN column: there is no range to test against.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        limits = np.finfo if col_type.kind == 'f' else np.iinfo
        for candidate in candidates_by_kind[col_type.kind]:
            # Candidates are ordered by width; stop once they no longer shrink.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            info = limits(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-byte frame does not produce a NaN percentage.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and reduce its memory usage.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has been applied to it.
    """
    # `keep_date_col` was dropped: it only matters when several columns are
    # combined into one date column, which is not requested here, and the
    # argument is deprecated in recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [None]:
# Load the train and test CSVs through import_data (read + memory reduction).
df_train = import_data('../input/tabular-playground-series-dec-2021/train.csv')
df_test = import_data('../input/tabular-playground-series-dec-2021/test.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Total number of missing values across all columns of the training frame.
df_train.isnull().sum().sum()

In [None]:
# Move 'Id' into the index so it is no longer a regular column.
# Guarded so that re-running this cell does not raise KeyError once 'Id'
# has already been moved into the index.
if 'Id' in df_train.columns:
    df_train = df_train.set_index('Id')
df_train.head()

In [None]:
# Re-check the total number of missing values after re-indexing by 'Id'.
df_train.isnull().sum().sum()

In [None]:
# Distinct target labels present in the training data.
df_train['Cover_Type'].unique()

In [None]:
# Number of training rows per target label.
df_train['Cover_Type'].value_counts()

## Dropping label 5

In [None]:
# Inspect the rows labelled 5 before they are removed in the next cell.
df_train[df_train['Cover_Type'] == 5]

In [None]:
# Keep only the rows whose target is not 5, then display the result.
df_train = df_train.loc[df_train['Cover_Type'].ne(5)]
df_train

In [None]:
# Feature matrix: every column of the training frame except the target.
X = df_train.drop(columns=['Cover_Type'])
X.head()

In [None]:
# Positional indices of every column in X (0 .. number of columns - 1).
cat_features = list(range(len(X.columns)))
print(cat_features)

In [None]:
# Column dtypes of the feature matrix (after the memory reduction at load time).
X.dtypes

In [None]:
# Target vector: the 'Cover_Type' column of the filtered training frame.
y = df_train['Cover_Type']
y

## base model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
from catboost import CatBoostClassifier
from catboost import CatBoostClassifier as lgb

In [None]:
# Hold out 20% of the rows for validation, stratified on the target.
# random_state is fixed so the split, and every score derived from it,
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=43
)

In [None]:
# Distinct labels that ended up in the training part of the split.
y_train.unique()

In [None]:
# Baseline: a small (50-iteration) multi-class CatBoost model with a fixed seed.
model = CatBoostClassifier(
    iterations=50,
    random_seed=43,
    loss_function='MultiClass'#, 
    #timeout=600
)

# Fit on the training split and evaluate on the hold-out split;
# per-iteration logging is off and CatBoost's training plot is requested.
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)

In [None]:
# Baseline predictions on the hold-out split.
y_pred = model.predict(X_test)
y_pred

In [None]:
# Hold-out accuracy of the baseline model.
accuracy_score(y_test, y_pred)

## Optuna

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: train CatBoost with sampled hyper-parameters and
    return its accuracy on a held-out 20% split of the notebook-level X / y.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Accuracy on the validation split (the study maximises this value).
    """
    # Fixed random_state: every trial is scored on the same hold-out split,
    # so score differences come from the hyper-parameters rather than from
    # a different random split per trial.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=43
    )

    # suggest_float replaces the deprecated suggest_uniform (same bounds) and
    # matches the calls already used for the bootstrap-specific parameters.
    params = {
        'iterations': trial.suggest_int("iterations", 4000, 25000),
        # NOTE(review): early_stopping_rounds is also passed to fit() below;
        # confirm whether CatBoost lets it take precedence over od_wait.
        'od_wait': trial.suggest_int('od_wait', 500, 2300),
        'loss_function': 'MultiClass',
        'task_type': "GPU",
        'eval_metric': 'MultiClass',
        'leaf_estimation_method': 'Newton',
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli"]),
        'learning_rate': trial.suggest_float('learning_rate', 0.02, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
    }

    # Each bootstrap type has its own extra parameter; sample only the one
    # that belongs to the type chosen for this trial.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = CatBoostClassifier(**params)

    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    # Predictions are scored directly, as in the baseline cell; no rounding
    # step is applied to them.
    preds = gbm.predict(valid_x)
    return accuracy_score(valid_y, preds)

In [None]:
# Maximise the validation accuracy returned by objective().
# The sampler is seeded so the sequence of suggested hyper-parameters can be
# reproduced; the search stops after 50 trials or 600 seconds, whichever
# comes first.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=43),
)

study.optimize(objective, n_trials=50, timeout=600)

In [None]:
# Summarise the search: how many trials the study holds and the best score.
print("Number of completed trials: {}".format(len(study.trials)))
print("Best trial:")

# Bind the best trial; its parameters are printed in the next cell.
trial = study.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")

In [None]:
# Print each hyper-parameter of the best trial on its own line.
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Same information via the study's convenience attributes.
print(study.best_params)
print(study.best_value)
print(study.best_trial)

In [None]:
# Best hyper-parameters as a dict; these are unpacked into the final model below.
study.best_params

In [None]:
# Final model: the best hyper-parameters from the study plus the same fixed
# settings that objective() used (loss, eval metric, leaf estimation, GPU).
# Note this rebinds `model`, which previously held the baseline classifier.
model = CatBoostClassifier(**study.best_params,loss_function ='MultiClass',
              eval_metric = 'MultiClass',
              leaf_estimation_method = 'Newton', task_type="GPU")


# Train on the training split with the hold-out split as evaluation set.
# NOTE(review): best_params contains a tuned 'od_wait' while
# early_stopping_rounds=100 is also passed here; confirm which one CatBoost
# honours.
model.fit(X_train, y_train,
    eval_set=(X_test, y_test),
   verbose=0, early_stopping_rounds=100)

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Move 'Id' into the index, mirroring what was done to the training frame,
# so the test columns line up with the features the model was trained on.
# Guarded so that re-running this cell does not raise KeyError once 'Id'
# has already been moved into the index.
if 'Id' in df_test.columns:
    df_test = df_test.set_index('Id')
df_test.head()

In [None]:
# Predict labels for the test set with the tuned model.
# Note this rebinds y_pred, which previously held the validation predictions.
y_pred = model.predict(df_test)
y_pred

In [None]:
# Load the sample submission; it is used below as the template for the output CSV.
df_sapsub = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
df_sapsub

In [None]:
# Fill the template's target column with the test predictions and write the
# submission file without the DataFrame index.
# NOTE(review): predictions are assigned positionally; this assumes the
# sample submission rows are in the same order as test.csv (confirm).
df_sapsub['Cover_Type'] = y_pred
df_sapsub.to_csv('final.csv', index = False)