In [None]:
import numpy as np
import pandas as pd
import gc

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to shrink its memory use.

    Integer columns (int16/int32/int64) are cast to the narrowest signed
    integer type whose range contains the column's min and max, bounds
    included. Float columns (float16/float32/float64) are cast to float32 when
    their min and max lie strictly inside the float32 range and to float64
    otherwise; as a consequence a float16 column is widened to float32.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage (in Mb) and the percentage
        saved relative to the starting usage.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate integer types, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the type's min/max
                # is still representable, so it must not force a wider type.
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f32 = np.finfo(np.float32)
            if c_min > f32.min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN or infinite, because the
                # comparisons above are then false.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero starting size cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read the three competition CSVs in order: the training table, the test
# table, and the sample submission that is later filled in and written out.
train, test, submission = map(pd.read_csv, (
    "../input/tabular-playground-series-dec-2021/train.csv",
    "../input/tabular-playground-series-dec-2021/test.csv",
    "../input/tabular-playground-series-dec-2021/sample_submission.csv",
))

In [None]:
# External forest-cover table, reshaped to look like `train`.
covtype = pd.read_csv("../input/forest-cover-type-dataset/covtype.csv")
# Number the external rows so their Ids continue right after the training rows.
first_new_id = len(train)
covtype['Id'] = range(first_new_id, first_new_id + len(covtype))
# Keep exactly train's columns (in train's order) and index the result by Id.
covtype = covtype.loc[:, train.columns].set_index("Id")

In [None]:
# Target vector: the Cover_Type column of the training table as an array.
y = train["Cover_Type"].values
# Feature matrices indexed by Id and downcast to save memory; the training
# one drops the target column first.
X = reduce_mem_usage(train.drop(columns="Cover_Type").set_index("Id"))
Xt = reduce_mem_usage(test.set_index("Id"))

In [None]:
# Switch for where the extra training rows come from.
use_covtype = False

if not use_covtype:
    # Reuse the training rows whose label is 5.
    label5_mask = (y == 5)
    aug_X = X[label5_mask]
    aug_y = y[label5_mask]
else:
    # Take the rows labelled 4 or 5 from the external table, restricted to
    # the same feature columns as X.
    rare_rows = covtype.Cover_Type.isin([4, 5])
    aug_X = reduce_mem_usage(covtype.loc[rare_rows, X.columns])
    aug_y = covtype.loc[rare_rows, 'Cover_Type']

In [None]:
# Drop the raw tables that are no longer needed and ask the collector to
# reclaim their memory. After this cell, earlier cells that read `train`,
# `test` or `covtype` cannot be re-run without reloading the data.
del train, test, covtype
gc.collect()

In [None]:
# Stratified K-fold training of a LightGBM multiclass model.
# Produces two arrays of class probabilities:
#   preds - test-set probabilities averaged over the folds
#   oof   - out-of-fold probabilities for every row of X
FOLDS = 5

cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

# One probability column per distinct label in y.
n_classes = len(np.unique(y))
preds = np.zeros((len(Xt), n_classes))
oof = np.zeros((len(X), n_classes))

for idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Append the augmentation rows to every training fold.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    # NOTE(review): when the augmentation rows are taken from X itself, rows
    # that belong to this fold's validation split can also end up in its
    # training set - confirm this is intended.
    X_train = pd.concat([X_train, aug_X])
    y_train = np.concatenate([y_train, aug_y])

    model = lgbm.LGBMClassifier(objective="multiclass")
    # Early stopping is passed as a callback: the `early_stopping_rounds`
    # keyword of fit() is no longer accepted by LightGBM 4.x.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
              eval_metric="multi_error",
              callbacks=[lgbm.early_stopping(stopping_rounds=10)])

    # Each fold contributes an equal share to the averaged test prediction.
    preds += model.predict_proba(Xt) / FOLDS
    oof[val_idx] = model.predict_proba(X_val)

In [None]:
# Turn the out-of-fold probabilities into hard labels: the most probable
# column index, shifted by one, is taken as the predicted label.
oof_preds = oof.argmax(axis=1) + 1
# Share of rows whose predicted label equals the true label.
accuracy = (oof_preds == y).mean()
print(f"OOF accuracy: {accuracy:0.3f}")

In [None]:
# Save the out-of-fold probabilities as oof.csv with one prob_<k> column per
# class. The column count is derived from the labels, the same way the oof
# array was sized, instead of being hard-coded to 7.
n_prob_cols = len(np.unique(y))
oof = pd.DataFrame(oof, columns=[f"prob_{i}" for i in range(n_prob_cols)])
# NOTE(review): Id is written as 0..len(X)-1 rather than X's own index -
# confirm the two agree.
oof.insert(loc=0, column='Id', value=range(len(X)))
oof.to_csv("oof.csv", index=False)

In [None]:
# Fill the submission template with the most probable class per test row
# (column index shifted by one) and write it out without the index.
predicted_labels = preds.argmax(axis=1) + 1
submission.Cover_Type = predicted_labels
submission.to_csv("submission.csv", index=False)