### Scoreboard:

- **LR**: 1.10161 (366)
- **RF**: 1.12760
- **RF**(optimized): 1.10230
- **LR**(optimized): 1.10184
- **LightGBM**(optimized): 1.08949

In [None]:
## Installs
!pip install optuna -q

In [None]:
## Imports
import joblib
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from skopt import gp_minimize
from skopt import space
from skopt.plots import plot_convergence
import xgboost as xgb
import optuna
from lightgbm import LGBMClassifier

In [None]:
## Constants
# Folder holding the competition CSVs. File names are appended with plain
# string concatenation, so the trailing slash is required.
# NOTE(review): `dir` shadows the Python builtin of the same name.
dir = '../input/tabular-playground-series-may-2021/'

In [None]:
## Read data
# train/test use the 'id' column as their index; the sample submission keeps
# 'id' as a regular column.
train = pd.read_csv(dir + 'train.csv', index_col='id')
test = pd.read_csv(dir + 'test.csv', index_col='id')
sample_submission = pd.read_csv(dir + 'sample_submission.csv')

# Separate the label column from the feature columns.
y_train = train['target']
X_train = train.drop(columns='target')

print(f"Train {train.shape}", f"Test {test.shape}", sep="\n")

In [None]:
# Preview the first rows of the training frame as loaded.
train.head()

In [None]:
## Remove duplicate rows in training data
# A row counts as a duplicate when all of its feature values repeat an earlier
# row; the target is ignored for the comparison.
repeated_features = train.drop(columns='target').duplicated()
duplicated_rows = train.loc[repeated_features]

# Drop the same rows from the labels and the features so both stay aligned.
y_train = y_train.drop(index=duplicated_rows.index)
X_train = X_train.drop_duplicates()
X_train.shape, y_train.shape

In [None]:
## Label encode target column
# `le` is kept at module level so the integer codes can be mapped back to the
# original class labels later.
le = preprocessing.LabelEncoder()
encoded_target = le.fit_transform(y_train)

# Wrap the encoded array in a single-column frame with a fresh default index.
y_train = pd.DataFrame({'target': encoded_target})
y_train.head()

In [None]:
## Scale the features
# The scaler is fitted on the training features only and then applied to both
# the training and the test features.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X_train)
transformed_X_train = min_max_scaler.transform(X_train)
transformed_test = min_max_scaler.transform(test)

# transform() returns plain arrays; rebuild the frames with the original
# column names (this also gives both frames a fresh default index).
X_train = pd.DataFrame(transformed_X_train, columns=X_train.columns)
test = pd.DataFrame(transformed_test, columns=test.columns)

In [None]:
# Preview the scaled training features.
X_train.head()

In [None]:
# Preview the scaled test features.
test.head()

In [None]:
# Re-attach the encoded target to the scaled features. The concat is
# column-wise, so rows are matched on the index of the two frames.
train = pd.concat([X_train, y_train], axis=1)
print(train.shape)
train.head()

In [None]:
# Shape and first rows of the test frame.
print(test.shape)
test.head()

In [None]:
## Create folds
# Sentinel fold id; every row is overwritten by the loop below.
train['kfold'] = -1

# Randomize the rows. The seed is fixed so that the fold assignment (and with
# it the per-fold scores and the saved models) is reproducible across runs.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# fetch targets
y = train.target.values

# initiate kfold
kf = model_selection.StratifiedKFold(n_splits=5)

# Tag each row with the fold in which it is held out for validation. After
# reset_index the positional indices returned by split() are also the index
# labels, so .loc addresses the intended rows.
for f, (t_, v_) in enumerate(kf.split(X=train, y=y)):
    train.loc[v_, 'kfold'] = f

## save to csv
train.to_csv('train_folds.csv', index=False)

In [None]:
# Shape and first rows of the training frame after the fold column was added.
print(train.shape)
train.head()

In [None]:
# Persist the test frame to disk without writing its index.
test.to_csv('test_processed.csv', index=False)

In [None]:
# Registry of candidate estimators, keyed by the short name that the rest of
# the notebook passes around as `model`. Each entry is a single estimator
# object, so every consumer of the registry works on the same instance.
# NOTE(review): the LR / RF / lgbm arguments look like values carried over
# from earlier tuning runs — confirm before changing them.
models = {
    'LR': LogisticRegression(max_iter=200, penalty='l2', C=10),
    'RF': RandomForestClassifier(n_jobs=-1, max_depth=12,n_estimators=118,criterion='gini',max_features=0.7859074745773753),
    'decision_tree_gini': tree.DecisionTreeClassifier(criterion='gini'),
    'decision_tree_entropy': tree.DecisionTreeClassifier(criterion='entropy'),
    'xgb': xgb.XGBClassifier(),
    'lgbm': LGBMClassifier(num_leaves=109,learning_rate=0.2,max_depth=3,min_child_samples=89)
}

# Grid-search spaces, keyed by the same short names as the estimator registry.
# Only the names listed here have a grid to search.
param_grid_models = {
    'RF': {
        'n_estimators': [100, 200, 300, 400],
        "max_depth": [1, 3, 5, 7],
        # Must be valid split-criterion names: a misspelled value makes every
        # grid candidate that uses it fail to fit.
        "criterion": ['gini', 'entropy']
    },
    'LR': {
        'max_iter': [100, 200, 300, 500, 800, 1000],
        'C': [0.001, 0.01, 0.1, 1, 10, 100]
    }
}

In [None]:
def run(fold, model):
    """Fit one registered estimator with a single fold held out.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.
        model: Key into the module-level ``models`` registry.

    Prints the accuracy and log loss on the held-out fold and dumps the
    fitted estimator to ``{model}_{fold}.bin`` with joblib. Returns nothing.
    """
    held_out = train.kfold == fold
    fit_frame = train[~held_out].reset_index(drop=True)
    valid_frame = train[held_out].reset_index(drop=True)

    # Everything except the label and the fold id is a feature.
    non_feature_cols = ['target', 'kfold']
    xtrain = fit_frame.drop(columns=non_feature_cols).values
    ytrain = fit_frame['target'].values
    xvalid = valid_frame.drop(columns=non_feature_cols).values
    yvalid = valid_frame['target'].values

    # The registry stores one estimator object per name; it is refitted here.
    clf = models[model]
    clf.fit(xtrain, ytrain)

    loss = metrics.log_loss(yvalid, clf.predict_proba(xvalid))
    score = clf.score(xvalid, yvalid)
    print(f"model {model}: Fold {fold} accuracy {score} loss {loss}")

    joblib.dump(clf, f"{model}_{fold}.bin")

In [None]:
# Registry key of the estimator that the following cells train and load.
model = 'lgbm'

In [None]:
# Train and evaluate the selected estimator once per fold id (0-4).
for i in range(5):
    run(i, model)

In [None]:
# for i in range(5):
#     run(i, 'decision_tree_gini')

In [None]:
# for i in range(5):
#     run(i, 'xgb')

In [None]:
# for i in range(5):
#     run(i, 'RF')

In [None]:
# for i in range(5):
#     run(i, 'lgbm')

In [None]:
fold = 4
model = joblib.load(f"{model}_{fold}.bin")

In [None]:
preds = model.predict_proba(test)

In [None]:
# Name the probability columns, then put the submission ids in front of them.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
class_probabilities = pd.DataFrame(preds, columns=class_columns)
predictions = pd.concat([sample_submission['id'], class_probabilities], axis=1)
predictions.head()

In [None]:
# Write the submission file without the frame index.
predictions.to_csv('submission.csv', index=False)

In [None]:
## Hyperparameter optimization (ref: https://www.youtube.com/watch?v=5nYqK-HaoKY)

## Grid Search
def run_search(model, scoring="accuracy", cv=5):
    """Grid-search the registered parameter grid of one estimator.

    Args:
        model: Key present in both ``models`` and ``param_grid_models``.
        scoring: Scorer name handed to ``GridSearchCV``. Defaults to
            ``"accuracy"``; pass e.g. ``"neg_log_loss"`` to select on the
            log loss that ``run()`` reports.
        cv: Cross-validation setting handed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object. The best score and the parameters
        of the best estimator are also printed.

    Raises:
        KeyError: If ``model`` is missing from either registry.
    """
    xtrain = train.drop(['target', 'kfold'], axis=1).values
    ytrain = train.target.values

    # Use a dedicated name for the search object instead of rebinding the
    # `model` argument, which is the registry key.
    search = model_selection.GridSearchCV(
        estimator=models[model],
        param_grid=param_grid_models[model],
        scoring=scoring,
        verbose=42,
        n_jobs=-1,
        cv=cv,
    )

    search.fit(xtrain, ytrain)
    print(search.best_score_)
    print(search.best_estimator_.get_params())
    return search

In [None]:
# Grid-search the 'LR' entry of the parameter-grid registry.
model = 'LR'
run_search(model)

In [None]:
# Grid-search the 'RF' entry of the parameter-grid registry.
model = 'RF'
run_search(model)

In [None]:
## skopt
from functools import partial

def optimize(params, param_names, model, X, y):
    """Score one parameter set with 5-fold stratified cross-validation.

    Args:
        params: Parameter values for this evaluation, ordered like
            ``param_names``.
        param_names: Estimator parameter names, matched to ``params`` by
            position.
        model: Key into the module-level ``models`` registry.
        X: Feature array; rows are selected with the fold index arrays.
        y: Target array aligned with ``X``.

    Returns:
        The mean fold accuracy multiplied by -1, so that a minimizer ends up
        maximizing accuracy.
    """
    # Local import: only this function needs it.
    from sklearn.base import clone

    # Work on an unfitted copy so the trial parameters never leak into the
    # shared estimator stored in `models`, which other cells reuse.
    estimator = clone(models[model])
    estimator.set_params(**dict(zip(param_names, params)))

    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X, y):
        estimator.fit(X[train_idx], y[train_idx])
        accuracies.append(estimator.score(X[test_idx], y[test_idx]))
    return -1 * np.mean(accuracies)

model = 'RF'
param_space = [
    space.Integer(3, 15, name="max_depth"),
    space.Integer(100, 600, name="n_estimators"),
    space.Categorical(["gini", "entropy"], name='criterion'),
    space.Real(0.01, 1, name="max_features", prior="uniform")
]
# Take the names from the space itself so the two cannot drift apart.
param_names = [dimension.name for dimension in param_space]

# The search runs on the raw training CSV. It is loaded under its own name so
# the processed `train` / `test` frames (with the `kfold` column that run()
# and run_search() rely on) are not overwritten.
raw_train = pd.read_csv(dir + 'train.csv', index_col='id')
X = raw_train.drop('target', axis=1).values
y = raw_train.target.values

# Freeze everything except the parameter vector, which gp_minimize supplies.
optimize_func = partial(
    optimize,
    param_names=param_names,
    model=model,
    X=X,
    y=y
)
result = gp_minimize(optimize_func, dimensions=param_space, n_calls=15,verbose=42)
print(dict(zip(param_names, result.x)))

In [None]:
## Plot the convergence
# `result` is the object returned by the gp_minimize call.
plot_convergence(result)

In [None]:
## Optuna
# The study runs on the raw training CSV. It is loaded under its own name so
# the processed `train` / `test` frames (with the `kfold` column that run()
# and run_search() rely on) are not overwritten.
raw_train = pd.read_csv(dir + 'train.csv', index_col='id')
X = raw_train.drop('target', axis=1).values
y = raw_train.target.values
model='lgbm'

def objective(trial, data=X, target=y, model=model):
    """Optuna objective: hold-out log loss of an LGBMClassifier.

    Args:
        trial: Optuna trial used to draw the hyperparameters.
        data: Feature array; defaults to the module-level ``X`` as it was
            when this function was defined.
        target: Label array; defaults to the module-level ``y`` as it was
            when this function was defined.
        model: Not used by the body; kept so the signature stays unchanged.

    Returns:
        Log loss on the 20% hold-out split.
    """
    # Fixed seed, so every trial is scored on the same split.
    x_fit, x_holdout, y_fit, y_holdout = model_selection.train_test_split(
        data, target, test_size=0.2, random_state=42, shuffle=True
    )

    # The first three ranges have equal bounds, so only min_child_samples
    # actually varies between trials.
    params = dict(
        num_leaves=trial.suggest_int('num_leaves', 109, 109),
        learning_rate=trial.suggest_float('learning_rate', 0.2, 0.2),
        max_depth=trial.suggest_int('max_depth', 3, 3),
        min_child_samples=trial.suggest_int('min_child_samples', 1, 100),
    )

    classifier = LGBMClassifier(**params)
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
    # depend on the installed LightGBM version — confirm they are accepted.
    classifier.fit(
        x_fit,
        y_fit,
        eval_set=[(x_holdout, y_holdout)],
        eval_metric='multi_logloss',
        early_stopping_rounds=100,
        verbose=42,
    )
    return metrics.log_loss(y_holdout, classifier.predict_proba(x_holdout))

In [None]:
# Minimize the hold-out log loss returned by `objective` over 35 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=35)
print(f"trials finished {len(study.trials)}")
print(f"Best trial: {study.best_trial.params}")