In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgb
import shap
from sklearn.model_selection import train_test_split

# Helper functions

In [None]:
def create_missing_table(input_dataframe: pd.DataFrame):
    """Summarise zero and missing values for every column of a DataFrame.

    Parameters
    ----------
    input_dataframe : pd.DataFrame
        Frame to profile.

    Returns
    -------
    pd.DataFrame
        One row per column of ``input_dataframe`` with the columns
        ``count_total`` (number of rows in the frame), ``count_unique``
        (``nunique`` of the column), ``count_zero`` / ``percentile_zero``
        (non-null values that are falsy, e.g. 0), ``count_missing`` /
        ``percentile_missing`` (null values) and ``hit_rate`` (share of
        non-null values). The three share columns are strings formatted
        like ``'12.3 %'``.
    """
    total = len(input_dataframe)
    na_count = input_dataframe.isnull().sum()
    # Nulls are filled with a truthy value first so that only real zeros
    # (falsy values) are counted as zero, not missing entries.
    zero_count = total - input_dataframe.fillna(1).astype(bool).sum()
    uniq_count = input_dataframe.nunique()

    def _as_percent(counts):
        # Round to one decimal so the value matches the '{:.1f}' format;
        # rounding to a whole number first would always print 'X.0 %'.
        return (counts / total * 100).round(1).map(lambda n: '{0:.1f} %'.format(n))

    return pd.DataFrame({
        'count_total': total,
        'count_unique': uniq_count,
        'count_zero': zero_count,
        'percentile_zero': _as_percent(zero_count),
        'count_missing': na_count,
        'percentile_missing': _as_percent(na_count),
        'hit_rate': _as_percent(input_dataframe.notnull().sum()),
    })

In [None]:
def describe_category(dataframe, column_name, ignore_zero=False, figsize=(11,7)):
    """
    Display the value counts of one column and plot them as an annotated bar chart.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame holding the column to describe.
    column_name : str
        Column whose distinct values are counted.
    ignore_zero : bool, default False
        If True, rows where the column equals 0 are dropped before counting.
    figsize : tuple, default (11, 7)
        Figure size passed to ``plt.subplots``.

    The count/percentage table is shown with ``display``; each bar of the
    plot is labelled with the percentage of (remaining) rows it represents.
    Nothing is returned.
    """
    if ignore_zero:
        dataframe = dataframe[dataframe[column_name] != 0]
    value_count = dataframe[column_name].value_counts().sort_index()
    df_value_count = pd.DataFrame({column_name: value_count.index, "count": value_count.values})
    sum_class = df_value_count["count"].sum()
    df_value_count["percentage"] = df_value_count["count"]/sum_class*100
    display(df_value_count)

    if df_value_count.empty:
        # With no rows left (e.g. an all-zero column and ignore_zero=True) the
        # maximum count is NaN, which cannot be used as an axis limit.
        print("No values to plot for " + str(column_name))
        return

    fig, ax = plt.subplots(figsize = figsize)
    # Draw on the axes created above rather than relying on the implicit current axes.
    sns.barplot(data=df_value_count, x=column_name, y="count", ax=ax)
    # Leave 20 % headroom above the tallest bar for the percentage labels.
    ax.set_ylim(0, df_value_count["count"].max()*1.2)
    for p, percentage in zip(ax.patches, list(df_value_count["percentage"])):
        ax.annotate("%.2f" % percentage +" %", (p.get_x() + p.get_width() / 2., p.get_height()),
             ha='center', va='center', rotation=0, xytext=(0, 20), textcoords='offset points')  #vertical bars
    plt.show()

# Load data

In [None]:
# Locations of the competition's train/test CSV files under the Kaggle input directory.
train_path = "/kaggle/input/tabular-playground-series-may-2021/train.csv"
test_path = "/kaggle/input/tabular-playground-series-may-2021/test.csv"

In [None]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
train_df.head()

In [None]:
test_df.head()

# Training data

## 1D EDA

In [None]:
train_df["target"].value_counts()

This is a multi-class classification problem.<br>
Each data point has exactly one class.<br>
The classes are not balanced.<br>

## Check zeros and missing

In [None]:
create_missing_table(train_df)

There are a lot of zeros: most features are more than 80% zero.


Let's look at the values of a few features.

In [None]:
train_df["feature_1"].value_counts()

In [None]:
train_df["feature_10"].value_counts()

They have a very small number of unique values, so they all look like categorical features.

In [None]:
# describe_category(train_df, "feature_10", ignore_zero=True, figsize=(20,7))

## Draw distribution of feature

In [None]:
# Every column of the training frame except the row identifier and the label
# is treated as a model feature.
feature_list = train_df.columns.tolist()
for non_feature in ("id", "target"):
    feature_list.remove(non_feature)
feature_list

In [None]:
# Show the non-zero value distribution of each feature, framed by banner lines
# so the sections are easy to tell apart in the output.
for feature_name in feature_list:
    print(f"=============  {feature_name}  ===================")
    describe_category(
        train_df,
        feature_name,
        ignore_zero=True,
        figsize=(20, 7),
    )
    print("=========================================================")

### Negative value

Features 42, 39, 38, 35, 31, 30 and 19 contain negative values.

# 2D 

## Correlation

In [None]:
# Correlate numeric columns only: the "target" column holds "Class_N" strings,
# which cannot be correlated and make DataFrame.corr() raise on recent pandas.
train_corr = train_df.select_dtypes(include="number").corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(train_corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(25, 20))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio,
# explicitly on the axes created above.
sns.heatmap(train_corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.show()

We do not see any strong correlation between features.

# Baseline Model

Let's use LightGBM because it trains fast and gives decent performance.

In [None]:
# Parameters shared by the cross-validation run and the hold-out fit below.
# num_class matches the four target labels (Class_1 .. Class_4).
lgbm_params = dict(
    boosting='gbdt',
    learning_rate=0.01,
    num_leaves=300,
    objective='multiclass',
    num_class=4,
    metric='multi_logloss',
)

## Preprocess data

In [None]:
def convert_text_to_class(str_class):
    """Translate a target label into its zero-based integer class id.

    "Class_1" maps to 0, "Class_2" to 1, "Class_3" to 2 and "Class_4" to 3.
    Any other value yields None.
    """
    known_labels = ("Class_1", "Class_2", "Class_3", "Class_4")
    for class_id, label in enumerate(known_labels):
        if str_class == label:
            return class_id
    return None

In [None]:
# Feature matrix: the columns listed in feature_list.
X = train_df[feature_list]
# Labels: the "Class_N" strings converted to integer ids by convert_text_to_class.
y = train_df["target"].apply(convert_text_to_class)

In [None]:
y.value_counts()

In [None]:
# for feature_name in feature_list:
#     X[feature_name] = X[feature_name].astype(np.float32)

In [None]:
# Wrap the full training set as a LightGBM Dataset; it is the input of the cross-validation run.
data = lgb.Dataset(X, label=y, free_raw_data=False)

## 5 Fold cross validation 

In [None]:
boost_round = 200
# Early stopping and periodic logging are configured through callbacks; the
# `early_stopping_rounds` / `verbose_eval` keywords are no longer accepted by
# current LightGBM releases.
cv_result = lgb.cv(
    lgbm_params,
    data,
    num_boost_round=boost_round,
    nfold=5,
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=100),
    ],
)

In [None]:
print("CV 5 Fold result")
# Look the entries up by suffix: depending on the LightGBM version the result
# keys are either "multi_logloss-mean" or carry a dataset prefix in front.
mean_key = next(key for key in cv_result if key.endswith("multi_logloss-mean"))
stdv_key = next(key for key in cv_result if key.endswith("multi_logloss-stdv"))
# The last entry of each list belongs to the final boosting round that was kept.
print("multi_logloss-mean :" ,cv_result[mean_key][-1])
print("multi_logloss-stdv :" ,cv_result[stdv_key][-1])
print(cv_result.keys())

## SHAP values

In [None]:
# Hold out 20 % of the rows for validation. The classes are imbalanced, so the
# split is stratified on y to keep the class proportions equal in both parts;
# the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=3041975, stratify=y)

In [None]:
# LightGBM Datasets for the hold-out fit; the validation set is created with
# reference=train_data so that it is tied to the training Dataset.
train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data ,free_raw_data=False)

In [None]:
%%time
boost_round = 500
# Train on the training split and monitor the validation split. Early stopping
# and periodic logging are passed as callbacks; the `early_stopping_rounds` /
# `verbose_eval` keywords are no longer accepted by current LightGBM releases.
model = lgb.train(
    lgbm_params,
    train_data,
    num_boost_round=boost_round,
    valid_sets=[val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

In [None]:
%%time
explainer = shap.TreeExplainer(model)

In [None]:
# Explain only a small random subset of rows; the sample is seeded so that the
# SHAP plot is the same on every run.
X_very_small = X.sample(500, random_state=3041975)

In [None]:
%%time
shap_values = explainer.shap_values(X_very_small)

In [None]:
# Plot the SHAP values of class index 1, i.e. the label that
# convert_text_to_class assigns to "Class_2".
class_index = 1
# NOTE(review): depending on the SHAP version, multi-class output is either a
# list with one (n_samples, n_features) array per class or a single
# (n_samples, n_features, n_classes) array -- confirm against the installed
# version. Indexing the 3-D array with [1] would pick a sample, not a class.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    class_shap_values = shap_values[:, :, class_index]
else:
    class_shap_values = shap_values[class_index]
shap.summary_plot(class_shap_values, X_very_small, plot_type='dot', max_display=50)

# Make submission

In [None]:
X_test = test_df[feature_list]

In [None]:
pred = model.predict(X_test)

In [None]:
y_test = pd.DataFrame(pred)

In [None]:
submission = y_test.copy()

In [None]:
# Name the prediction columns after the class labels.
# NOTE(review): assumes the prediction columns are in class-id order 0..3, the
# ids convert_text_to_class gives to Class_1..Class_4 -- confirm.
submission.columns = ["Class_1", "Class_2", "Class_3", "Class_4"]

In [None]:
# Attach the test ids. The assignment aligns on the row index, so it relies on
# submission and test_df sharing the same index.
submission["id"] = test_df["id"]

In [None]:
# submission.columns = ["id", "Class_1", "Class_2", "Class_3", "Class_4"]

In [None]:
submission.head()

In [None]:
submission = submission[["id", "Class_1", "Class_2", "Class_3", "Class_4"]]

In [None]:
submission.head()

In [None]:
submission.to_csv("submission.csv", index=False)