# Importing Packages

In [None]:
import pandas as pd
import numpy as np
import missingno
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from xgboost import XGBClassifier,plot_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split,KFold, GroupKFold, StratifiedKFold
import warnings
from sklearn.metrics import log_loss
import plotly.express as px
from lightgbm import LGBMClassifier

warnings.filterwarnings("ignore")

# Importing Data

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")

In [None]:
train.head()

In [None]:
# Remove the `id` column so it is not treated as a feature further down.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because `id` is already gone.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

# **EDA**

In [None]:
train.info()

1. The column names don't carry much meaning: every column is named `feature_<integer>`, so nothing about the domain can be inferred from the names alone.
1. No missing values in the dataset
1. All the feature columns are of type integer (the `target` column holds the class labels as strings)

# Missing Values

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

# Target Distribution

In [None]:
# Histogram of the number of training rows per target class, one colour per class.
fig = px.histogram(train, x='target', color='target')
fig.update_layout(
    title_text='Target distribution',  # title of plot
    xaxis_title_text='Value',          # xaxis label
    yaxis_title_text='Count',          # yaxis label
    bargap=0.2,                        # gap between bars of adjacent location coordinates
)
fig.show()

### In the target variable, `Class_2` has more data points than any of the other labels.

# Correlation Matrix

In [None]:
# Replace each class label by its position in the sorted list of distinct labels.
class_names = sorted(train['target'].unique())
rename_labels = dict(zip(class_names, range(len(class_names))))
train['target'] = train['target'].map(rename_labels)

In [None]:
# Heat map of the pairwise correlations between all columns of `train`.
fig, ax = plt.subplots(figsize=(15 , 12))
corr = train.corr()
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr,square=True, center=0, 
            linewidth=0.2, cmap='coolwarm',
           mask=mask, ax=ax) 

ax.set_title('Feature Correlation Matrix ', loc='left')
plt.show()

In [None]:
train.describe()

1. The means of all the features are close to zero.
1. There is low variance across all the features.
1. The median is 0 for all but two columns.

# Number of Unique Values per Feature

In [None]:
# Bar chart of the distinct-value counts of feature_0 .. feature_49 in `train`.
# The counts are sorted ascending, so a bar's position is its rank, not the
# feature's index.
fig, ax = plt.subplots(figsize=(12, 8))

unique_counts = [train[f'feature_{i}'].nunique(dropna=False) for i in range(50)]
y = sorted(unique_counts)

bar_positions = range(50)
ax.bar(bar_positions, y, zorder=10)
ax.set_xticks(bar_positions)
ax.set_yticks(range(0, 80, 5))
ax.margins(0.02)

ax.grid(axis='y', linestyle='--', zorder=5)
ax.set_title('TRAIN : # of Features Unique Values', loc='left', fontweight='bold')
plt.show()

In [None]:
# Bar chart of the distinct-value counts of feature_0 .. feature_49 in `test`.
# The counts are sorted ascending, so a bar's position is its rank, not the
# feature's index.
fig, ax = plt.subplots(figsize=(12, 8))

unique_counts = [test[f'feature_{i}'].nunique(dropna=False) for i in range(50)]
y = sorted(unique_counts)

bar_positions = range(50)
ax.bar(bar_positions, y, zorder=10)
ax.set_xticks(bar_positions)
ax.set_yticks(range(0, 80, 5))
ax.margins(0.02)

ax.grid(axis='y', linestyle='--', zorder=5)
ax.set_title('TEST : # of Features Unique Values', loc='left', fontweight='bold')
plt.show()

In [None]:
train_p = train
# Every column except the label. The earlier `columns[1:-1]` slice skipped the
# first feature: `id` has already been dropped from `train`, so position 0 is
# a real feature column rather than the identifier.
lic = [col for col in train_p.columns if col != 'target']


def plot(col):
    """Show a count plot of column `col` of `train_p`, with bars split by `target`.

    Parameters
    ----------
    col : str
        Name of the column of `train_p` to plot on the x axis.
    """
    plt.figure(figsize = (8,5))
    sns.countplot(x = col, hue = 'target', data = train_p)
    plt.legend(loc='upper right')
    plt.title("Distribution of "+ col,fontsize=15)
    plt.show()


for col in lic:
    plot(col)

# Outliers

In [None]:
# One horizontal box per column of `train`, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(18, 25))
sns.boxplot(data=train, orient="h", ax=ax);

In [None]:
# One horizontal box per column of `test`.
# `id` has already been dropped from `test`, so the former `.iloc[:, 1:]`
# slice was removing the first feature instead of the identifier.
plt.figure(figsize=(18,25))
sns.boxplot(data=test, orient="h");

In [None]:
# Integer-encode the target with scikit-learn's LabelEncoder; the fitted
# encoder stays available as `le`.
le = LabelEncoder()
le.fit(train['target'])
train['target'] = le.transform(train['target'])

In [None]:
train.columns

In [None]:
# Model input columns: every column of `train` except the label.
cols = train.columns.tolist()
cols.remove("target")

# Feature Importance using XGBoost default Parameters

In [None]:
# Fit an XGBoost classifier on all training rows; only the tree method and the
# label-encoder flag are set explicitly.
model = XGBClassifier(use_label_encoder=False, tree_method = 'gpu_hist')
model.fit(X=train.drop(columns='target'), y=train['target'])

In [None]:
# Plot the importance scores of the model fitted above, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 10))
plot_importance(
    model,
    ax=ax,
    height=0.5,
    max_num_features=None,  # None: do not cap the number of features shown
    title='Feature importance',
    xlabel='F score',
    ylabel='Features',
)

# XGBoost

# Hyper-parameters passed to every XGBClassifier in the cross-validation loop.
xgb_params = dict(
    seed=42,
    n_estimators=10000,
    verbosity=1,
    eval_metric="mlogloss",
    alpha=7.105038963844129,
    colsample_bytree=0.25505629740052566,
    gamma=0.4999381950212869,
    reg_lambda=1.7256912198205319,
    learning_rate=0.011823142071967673,
    max_bin=338,
    max_depth=8,
    min_child_weight=2.286836198630466,
    subsample=0.618417952155855,
    tree_method='gpu_hist',
    gpu_id=0,
    # Groups of feature indices, given as a string.
    interaction_constraints='[[38, 14], [34, 14, 31], [15, 19]]',
)

# 10-fold cross-validation of XGBoost: one model is trained per fold and its
# class probabilities for the test set are averaged over the folds.
test_preds = None
train_rmse = 0
val_rmse = 0
val_logloss = 0
n_splits = 10

# Convert to NumPy once; the original re-sliced the DataFrames in every fold.
X_all = train[cols].values
y_all = train['target'].values
X_test = test[cols].values

kf = KFold(n_splits = n_splits , shuffle = True , random_state = 0)
for fold, (tr_index , val_index) in enumerate(kf.split(X_all , y_all)):
    
    print("-" * 50)
    print(f"Fold {fold + 1}")
    
    x_train,x_val = X_all[tr_index] , X_all[val_index]
    y_train,y_val = y_all[tr_index] , y_all[val_index]
        
    eval_set = [(x_val, y_val)]
    
    model =XGBClassifier(**xgb_params)
    model.fit(x_train, y_train, eval_set = eval_set, verbose = 500)
    
    # Each metric is computed once and reused for both the sum and the printout.
    train_preds = model.predict(x_train)
    fold_train_rmse = mean_squared_error(y_train ,train_preds , squared = False)
    train_rmse += fold_train_rmse
    print("Training RMSE : " , fold_train_rmse)
    
    val_preds = model.predict(x_val)
    fold_val_rmse = mean_squared_error(y_val , val_preds , squared = False)
    val_rmse += fold_val_rmse
    print("Validation RMSE : " , fold_val_rmse)
    
    # RMSE over integer class codes treats the classes as ordered numbers, so
    # also report the multiclass log loss of the predicted probabilities
    # (the same metric this notebook uses for its OOF score later on).
    fold_val_logloss = log_loss(y_val , model.predict_proba(x_val) , labels = model.classes_)
    val_logloss += fold_val_logloss
    print("Validation LogLoss : " , fold_val_logloss)
    
    fold_test_proba = model.predict_proba(X_test)
    if test_preds is None:
        test_preds = fold_test_proba
    else:
        test_preds += fold_test_proba

print("-" * 50)
print("Average Training RMSE : " , train_rmse / n_splits)
print("Average Validation RMSE : " , val_rmse / n_splits)
print("Average Validation LogLoss : " , val_logloss / n_splits)

test_preds /= n_splits

# Copy the four probability columns of `test_preds` into the sample-submission
# template and write it out.
submission = pd.read_csv("../input/tabular-playground-series-may-2021/sample_submission.csv")
for class_idx in range(4):
    submission[f'Class_{class_idx + 1}'] = test_preds[:, class_idx]
submission.head()

submission.to_csv("XGB.csv",index=False)

# CatBoost

In [None]:
# 10-fold cross-validation of CatBoost: one model is trained per fold and its
# class probabilities for the test set are averaged over the folds.
test_preds = None
train_rmse = 0
val_rmse = 0
val_logloss = 0
n_splits = 10

# Convert to NumPy once; the original re-sliced the DataFrames in every fold.
X_all = train[cols].values
y_all = train['target'].values
X_test = test[cols].values

kf = KFold(n_splits = n_splits , shuffle = True , random_state = 0)
for fold, (tr_index , val_index) in enumerate(kf.split(X_all , y_all)):
    
    print("-" * 50)
    print(f"Fold {fold + 1}")
    
    x_train,x_val = X_all[tr_index] , X_all[val_index]
    y_train,y_val = y_all[tr_index] , y_all[val_index]
        
    eval_set = [(x_val, y_val)]
    
    model = CatBoostClassifier(depth=4,
                               task_type="GPU",
            max_ctr_complexity=15,
            iterations=17000,
            od_wait=1000, od_type='Iter',
            learning_rate=0.01,
            min_data_in_leaf=1,
            use_best_model=True,
            loss_function='MultiClass')
    model.fit(x_train, y_train, eval_set = eval_set, verbose = 500)
    
    # Each metric is computed once and reused for both the sum and the printout.
    train_preds = model.predict(x_train)
    fold_train_rmse = mean_squared_error(y_train ,train_preds , squared = False)
    train_rmse += fold_train_rmse
    print("Training RMSE : " , fold_train_rmse)
    
    val_preds = model.predict(x_val)
    fold_val_rmse = mean_squared_error(y_val , val_preds , squared = False)
    val_rmse += fold_val_rmse
    print("Validation RMSE : " , fold_val_rmse)
    
    # RMSE over integer class codes treats the classes as ordered numbers, so
    # also report the multiclass log loss of the predicted probabilities
    # (the same metric this notebook uses for its OOF score later on).
    fold_val_logloss = log_loss(y_val , model.predict_proba(x_val) , labels = model.classes_)
    val_logloss += fold_val_logloss
    print("Validation LogLoss : " , fold_val_logloss)
    
    fold_test_proba = model.predict_proba(X_test)
    if test_preds is None:
        test_preds = fold_test_proba
    else:
        test_preds += fold_test_proba

print("-" * 50)
print("Average Training RMSE : " , train_rmse / n_splits)
print("Average Validation RMSE : " , val_rmse / n_splits)
print("Average Validation LogLoss : " , val_logloss / n_splits)

test_preds /= n_splits

In [None]:
# Clamp every probability into [0.08, 0.95], then copy the four columns into
# the sample-submission template.
test_preds = np.clip(test_preds, 0.08, 0.95)
submission1 = pd.read_csv("../input/tabular-playground-series-may-2021/sample_submission.csv")
for class_idx in range(4):
    submission1[f'Class_{class_idx + 1}'] = test_preds[:, class_idx]
submission1.head()

In [None]:
submission1.to_csv("CB.csv",index=False)

# LightAutoML

In [None]:
pip install -U lightautoml

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [None]:
# Settings for the LightAutoML run.
N_THREADS = 4            # threads cnt for lgbm and linear models
N_FOLDS = 5              # folds cnt for AutoML
RANDOM_STATE = 42        # fixed random state for various reasons
TEST_SIZE = 0.2          # Test size for metric check
TIMEOUT = 3 * 60 * 60    # Time in seconds for automl run (3 hours)
TARGET_NAME = 'target'   # name of the label column

In [None]:
# Re-read the raw CSVs so the AutoML section starts from unmodified data.
train_data = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
test_data = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
# Drop the first six characters of each label, parse the rest as an integer
# and shift it down by one.
class_number = train_data[TARGET_NAME].str[6:].astype(int)
train_data[TARGET_NAME] = class_number - 1

In [None]:
def create_gr_feats(data):
    """Placeholder feature-engineering hook: does nothing and returns None.

    NOTE(review): the name suggests "group features" were planned here; no
    such logic exists yet, so `data` is left untouched.
    """


# Stack train and test, run the (currently no-op) hook on the combined frame,
# then split it back at the original train length.
n_train_rows = len(train_data)
all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
create_gr_feats(all_df)
train_data = all_df[:n_train_rows]
test_data = all_df[n_train_rows:]
print(train_data.shape, test_data.shape)

In [None]:
%%time

# LightAutoML task created with the name 'multiclass'.
task = Task('multiclass')

In [None]:
%%time

# Column roles passed to AutoML: which column is the label and which to drop.
roles = dict(
    target=TARGET_NAME,
    drop=['id'],
)

In [None]:
%%time 

# Build the AutoML preset from the two YAML configs and fit it on the training
# data; `fit_predict` returns the out-of-fold predictions.
config_paths = [
    '../input/lightautoml-configs/conf_0_sel_type_0.yml',
    '../input/lightautoml-configs/conf_1_sel_type_1.yml',
]
automl = TabularUtilizedAutoML(
    task = task,
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = dict(n_jobs = N_THREADS),
    verbose = 0,
    configs_list = config_paths,
)
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:5], oof_pred.shape))

In [None]:
%%time

# Fast feature importances calculation
# Fast feature importances, drawn as one bar per feature.
fast_fi = automl.get_feature_scores('fast', silent = False)
importance_by_feature = fast_fi.set_index('Feature')['Importance']
importance_by_feature.plot.bar(figsize = (20, 10), grid = True)

## Predict for test data and check OOF score

In [None]:
%%time

# Predict the test set, then score the out-of-fold predictions with log loss.
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
oof_score = log_loss(train_data[TARGET_NAME].values, oof_pred.data)
print('OOF score: {}'.format(oof_score))

In [None]:
# Load the sample-submission template, overwrite every column after the first
# with the AutoML test predictions, and save the result.
# NOTE(review): assumes the columns of `test_pred.data` are in the same order
# as the template's class columns — confirm against LightAutoML's class mapping.
submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')
submission.iloc[:, 1:] = test_pred.data
submission.to_csv('lightautoml.csv', index = False)

In [None]:
def generate(main, support, coeff):
    """Blend two prediction tables column by column.

    Every column of `main` except the first is replaced by
    ``main[col] * coeff + support[col] * (1 - coeff)``. The first column is
    copied through unchanged. Rows are paired by position, not by index label,
    and rows of `support` beyond ``len(main)`` are ignored.

    Parameters
    ----------
    main : pandas.DataFrame
        Table whose shape, index and first column define the result.
    support : pandas.DataFrame
        Table blended into `main`. It must contain every blended column of
        `main` and have at least ``len(main)`` rows.
    coeff : float
        Weight given to `main`; `support` gets ``1 - coeff``.

    Returns
    -------
    pandas.DataFrame
        A copy of `main` with the blended columns. Neither input is modified.

    Raises
    ------
    KeyError
        If `support` lacks one of the blended columns.
    IndexError
        If `support` has fewer rows than `main`.
    """
    n_rows = len(main)
    g = main.copy()
    for col in main.columns[1:]:
        # Work on raw float arrays so rows are matched positionally, exactly
        # like the element-by-element list loop this replaces.
        main_vals = main[col].to_numpy(dtype=float)
        support_vals = support[col].to_numpy(dtype=float)
        if len(support_vals) < n_rows:
            raise IndexError(
                f"support has {len(support_vals)} rows but main has {n_rows}"
            )
        g[col] = (main_vals * coeff) + (support_vals[:n_rows] * (1. - coeff))

    return g

# Weighted blend: 0.60 on `submission` and the remaining 0.40 on `submission1`.
sub = generate(main=submission, support=submission1, coeff=0.60)
display(sub)

In [None]:
sub.to_csv("Hybrid.csv",index=False)