In [None]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
import optuna
import plotly.express as px

from sklearn.metrics import log_loss
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline
# Apply seaborn's default theme (with color-code remapping enabled) and make a
# 10-step "viridis" palette the default color cycle for later plots.
sns.set(color_codes=True)
pal = sns.color_palette("viridis", 10)
sns.set_palette(pal)

In [None]:
# Load the competition's train and test CSV files into pandas DataFrames.
# The paths are relative to the notebook's working directory.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')

In [None]:
# Per-feature summary statistics (one row per column, `id` excluded), styled so
# that the mean is drawn as an in-cell bar and the std / median columns are
# shaded with color gradients.
(
    train.drop(['id'], axis=1)
    .describe()
    .T
    .style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[1])
    .background_gradient(subset=['std'], cmap='Blues')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly: `DataFrame.corr()` raises on
# them in pandas >= 2.0 (where `numeric_only` defaults to False), and older
# versions silently dropped them anyway, so the plotted matrix is unchanged
# there. NOTE(review): `target` appears to hold class labels (Class_1..Class_4)
# and is therefore not part of this matrix — confirm.
plt.figure(figsize=(10,8))
sns.heatmap(
    train.select_dtypes(include='number').corr(),
    cmap='coolwarm',
    vmin=-1,
)

## Insights
1. There is no correlation among the features, nor between the features and the target variable.
2. Most of the features are heavily skewed toward 0 (in some, more than 90% of the values are 0), which means feature selection will be necessary.
3. Most features are left skewed 
4. Outlier detection and removal may also help to improve the score.
5. No correlation means that there are some unnecessary features.
6. We may also gain some information through feature engineering, e.g. by trying feature interactions or ratios to increase correlation.

# H2O AutoML

In [None]:
import h2o
from h2o.automl import H2OAutoML
# Connect to an H2O cluster (h2o.init starts a local one if none is reachable).
h2o.init()

In [None]:
%%time
# Convert the pandas train/test DataFrames into H2OFrames so they can be used
# by H2O; the cell is timed because the conversion may be slow.
h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

In [None]:
# Convert the response column to a factor (categorical) so that H2O treats the
# problem as classification.
h2o_train['target'] = h2o_train['target'].asfactor()

In [None]:
# AutoML configuration for the baseline run: fixed seed, leaderboard ranked by
# logloss, DeepLearning models excluded, class balancing and target-encoding
# preprocessing enabled, and an exploitation ratio of 0.1.
atml = H2OAutoML(
    seed=13,
    sort_metric='logloss',
    exclude_algos=['DeepLearning'],
    balance_classes=True,
    preprocessing=["target_encoding"],
    exploitation_ratio=0.1,
)

In [None]:
# Name of the response column, and the predictor columns: every column of the
# training frame except the row identifier and the response itself.
target = 'target'
features = h2o_train.drop(['id', 'target']).columns

In [None]:
%%time

atml.train(x = features, y= target, training_frame= h2o_train)

In [None]:
# Fetch the AutoML leaderboard and display it as the cell output.
board = atml.leaderboard
board

In [None]:
%%time
preds = atml.leader.predict(h2o_test.drop(['id'],axis=1))

In [None]:
# Bring the H2O prediction frame back into pandas.
leader_output = h2o.as_list(preds)

# Keep the predicted class label on `test`; it serves as the pseudo-label when
# test rows are later appended to the training data.
test['target'] = leader_output['predict']

# Keep only the per-class probability columns and clip them into [0.05, 0.95].
class_probs = np.clip(leader_output.drop(['predict'], axis=1), 0.05, 0.95)

# Attach the row ids and put the columns into submission order.
class_probs['id'] = test['id']
preds = class_probs[['id','Class_1','Class_2','Class_3','Class_4']]

In [None]:
# Show the assembled submission frame as the cell output.
preds

In [None]:
# Write the baseline submission file (without the pandas index column).
preds.to_csv('submit.csv',index=False)

# Pseudo Labeling

In [None]:
# Collect the index labels of test rows that the first model is NOT confident
# about, i.e. rows where every class probability is below the threshold. These
# rows are excluded from pseudo-labelling.
# Vectorised replacement for a row-by-row Python loop; it yields the same list
# of row labels, in ascending row order.
CONFIDENCE_THRESHOLD = 0.8
class_cols = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
low_confidence = (preds[class_cols] < CONFIDENCE_THRESHOLD).all(axis=1)
l = preds.index[low_confidence].tolist()

In [None]:
# Drop the low-confidence rows (index labels collected in `l`), leaving only
# the test rows whose predicted label will be used as a pseudo-label.
test_conc = test.drop(l)

In [None]:
# Build the pseudo-labelled training frame: the original training rows plus the
# confidently predicted test rows. `ignore_index=True` gives the combined pandas
# frame a fresh index instead of repeated row labels.
h2o_train_new = h2o.H2OFrame(pd.concat([train, test_conc], ignore_index=True))
# Explicitly mark the response as a factor, exactly as is done for the first
# training frame, so the second AutoML run is a classification task regardless
# of how the column type was inferred during the upload.
h2o_train_new['target'] = h2o_train_new['target'].asfactor()

In [None]:
# AutoML configuration for the pseudo-labelled run: fixed seed, leaderboard
# ranked by logloss, DeepLearning models excluded, class balancing and
# target-encoding preprocessing enabled, and an exploitation ratio of 0.1.
atml_2 = H2OAutoML(
    seed=13,
    sort_metric='logloss',
    exclude_algos=['DeepLearning'],
    balance_classes=True,
    preprocessing=["target_encoding"],
    exploitation_ratio=0.1,
)

In [None]:
%%time
atml_2.train(x=features, y = target, training_frame = h2o_train_new)

In [None]:
# Fetch the leaderboard of the pseudo-labelled run and display it.
board_new = atml_2.leaderboard
board_new

In [None]:
%%time
preds_new = atml_2.leader.predict(h2o_test.drop(['id'],axis=1))

In [None]:
# Bring the H2O prediction frame back into pandas.
leader_output_new = h2o.as_list(preds_new)

# Discard the predicted label, keeping only the per-class probability columns,
# and clip them into [0.05, 0.95].
class_probs_new = np.clip(leader_output_new.drop(['predict'], axis=1), 0.05, 0.95)

# Attach the row ids and put the columns into submission order.
class_probs_new['id'] = test['id']
preds_new = class_probs_new[['id','Class_1','Class_2','Class_3','Class_4']]

In [None]:
# Show the second submission frame as the cell output.
preds_new

In [None]:
# Write the pseudo-labelled model's submission file (without the pandas index column).
preds_new.to_csv('submit_2.csv',index=False)