In [None]:
import pandas as pd
import numpy as np
import time as time
import lightgbm
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from tqdm.notebook import tqdm
import catboost
import xgboost

In [None]:
directory = '/kaggle/input/tabular-playground-series-jun-2021/'

# Load each competition file with its `id` column promoted to the index
# directly at read time.
train = pd.read_csv(directory + 'train.csv', index_col='id')
test = pd.read_csv(directory + 'test.csv', index_col='id')
submission = pd.read_csv(directory + 'sample_submission.csv', index_col='id')

# Peek at three random training rows.
train.sample(3)

The following plot shows that classes 6 and 8 are by far the most frequent in the training set, while classes 1, 4, and 5 are very infrequent.

In [None]:
# Tabulate how often each class occurs, then draw the same distribution.
target_column = train['target']
print(target_column.value_counts())
sns.histplot(target_column)

Converting the string labels to integers and preparing the data for machine learning:

In [None]:
# Turn each string label into the integer it ends with. The whole trailing run
# of digits is parsed, so a class id of 10 or more is read correctly instead of
# being truncated to its last character.
labels = (
    train['target']
    .str.extract(r'(\d+)$', expand=False)
    .astype(int)
    .tolist()
)

# 70/30 shuffled hold-out split with a fixed seed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop('target', axis=1).values,
    labels,
    shuffle=True,
    test_size=0.3,
    random_state=2021,
)

Because there are so many features in the data, time consumption could be a factor in parameter optimization and cross-validation. The following cell measures how long a base LightGBM model takes to train and score on all features.

In [None]:
# Baseline: fit a default LightGBM classifier on every feature and time the
# whole fit + predict + score cycle.
# perf_counter() is a monotonic clock intended for measuring intervals, so the
# result cannot be distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
lgb = lightgbm.LGBMClassifier()
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict_proba(X_valid)
lgb_base_score = log_loss(y_valid, lgb_pred)
end = time.perf_counter()
print(f"lightgbm scored {lgb_base_score} in {end-start} seconds")

With the trained LightGBM model, we can find out which features are the most important and which can be dropped. LightGBM also provides a built-in helper, `plot_importance`, that plots the feature importances with matplotlib:

In [None]:
# Draw LightGBM's built-in feature-importance chart for the fitted baseline
# model on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
lightgbm.plot_importance(lgb, ax=ax)

In [None]:
# Column names of every predictor (everything except the target), in the same
# order as the columns the baseline model was trained on.
features = [column for column in train.columns if column != 'target']

# Importance score of each predictor as reported by the fitted baseline model.
important_features_lgb = [score for score in lgb.feature_importances_]

The following code records the score and time consumption of training a LightGBM model on the top x most important features. The minimum number of features is 29 and the maximum is 74, in increments of 5 per iteration.

In [None]:
# Sweep over the number of top-ranked features and record, for each count, the
# validation log loss and the elapsed fit + predict + score time of a default
# LightGBM classifier.

# Rank the features once, most important first. The stable sort keeps the
# original column order among ties, which matches repeatedly picking the first
# maximum.
importance_order = np.argsort(-np.asarray(important_features_lgb), kind='stable')
ranked_features = [features[idx] for idx in importance_order]

times = []
scores = []
num_features_counts = []
for num_features in tqdm(range(29, 75, 5)):
    num_features_counts.append(num_features)
    important_features = ranked_features[:num_features]
    condensed_train = train[important_features]

    # Split features and labels together so rows and targets stay aligned
    # explicitly, rather than relying on a matching random_state in another
    # cell. Dedicated names keep the baseline split and the baseline `lgb`
    # model from being overwritten.
    X_sub_train, X_sub_valid, y_sub_train, y_sub_valid = train_test_split(
        condensed_train.values,
        labels,
        shuffle=True,
        test_size=0.3,
        random_state=2021,
    )

    # perf_counter() is monotonic and meant for measuring intervals.
    start = time.perf_counter()
    lgb_subset = lightgbm.LGBMClassifier()
    lgb_subset.fit(X_sub_train, y_sub_train)
    subset_pred = lgb_subset.predict_proba(X_sub_valid)
    subset_score = log_loss(y_sub_valid, subset_pred)
    end = time.perf_counter()

    times.append(end - start)
    scores.append(subset_score)

The following three plots show the times, scores, and number of features against one another.

In [None]:
# Score against elapsed time for the LightGBM sweep. `times` and `scores` are
# the lists filled by the preceding loop cell; the title identifies the model,
# since the same figure is drawn for other models further down.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=times, y=scores, ax=ax)
ax.set(xlabel='times', ylabel='scores', title='LightGBM: scores vs. times')
plt.show()

In [None]:
# Score against feature count for the LightGBM sweep. The title identifies the
# model, since the same figure is drawn for other models further down.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=num_features_counts, y=scores, ax=ax)
ax.set(xlabel='number of features', ylabel='scores',
       title='LightGBM: scores vs. number of features')
plt.show()

In [None]:
# Elapsed time against feature count for the LightGBM sweep. The title
# identifies the model, since the same figure is drawn for other models
# further down.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=num_features_counts, y=times, ax=ax)
ax.set(xlabel='number of features', ylabel='times',
       title='LightGBM: times vs. number of features')
plt.show()

For LightGBM, a very quick model, using all the features is probably fine because training finishes quite quickly. The following code repeats the same experiment for CatBoost.

In [None]:
# Sweep over the number of top-ranked features (ranked by the LightGBM
# importances) and record, for each count, the validation log loss and the
# elapsed fit + predict + score time of a default CatBoost classifier.

# Rank the features once, most important first. The stable sort keeps the
# original column order among ties, which matches repeatedly picking the first
# maximum.
importance_order = np.argsort(-np.asarray(important_features_lgb), kind='stable')
ranked_features = [features[idx] for idx in importance_order]

times = []
scores = []
num_features_counts = []
for num_features in tqdm(range(29, 75, 5)):
    num_features_counts.append(num_features)
    important_features = ranked_features[:num_features]
    condensed_train = train[important_features]

    # Split features and labels together so rows and targets stay aligned
    # explicitly, rather than relying on a matching random_state in another
    # cell. Dedicated names keep the baseline split from being overwritten.
    X_sub_train, X_sub_valid, y_sub_train, y_sub_valid = train_test_split(
        condensed_train.values,
        labels,
        shuffle=True,
        test_size=0.3,
        random_state=2021,
    )

    # perf_counter() is monotonic and meant for measuring intervals.
    start = time.perf_counter()
    ctb = catboost.CatBoostClassifier()
    ctb.fit(X_sub_train, y_sub_train, verbose=False)
    ctb_pred = ctb.predict_proba(X_sub_valid)
    ctb_base_score = log_loss(y_sub_valid, ctb_pred)
    end = time.perf_counter()

    times.append(end - start)
    scores.append(ctb_base_score)

In [None]:
# Score against elapsed time for the CatBoost sweep. `times` and `scores` are
# the lists filled by the preceding loop cell; the title identifies the model,
# since the same figure is drawn for other models in this notebook.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=times, y=scores, ax=ax)
ax.set(xlabel='times', ylabel='scores', title='CatBoost: scores vs. times')
plt.show()

In [None]:
# Score against feature count for the CatBoost sweep. The title identifies the
# model, since the same figure is drawn for other models in this notebook.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=num_features_counts, y=scores, ax=ax)
ax.set(xlabel='number of features', ylabel='scores',
       title='CatBoost: scores vs. number of features')
plt.show()

In [None]:
# Elapsed time against feature count for the CatBoost sweep. The title
# identifies the model, since the same figure is drawn for other models in
# this notebook.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=num_features_counts, y=times, ax=ax)
ax.set(xlabel='number of features', ylabel='times',
       title='CatBoost: times vs. number of features')
plt.show()

CatBoost is a significantly slower model, and running cross-validation or Optuna optimization on it with all the features could take an incredibly large amount of time. The following cell repeats the same experiment for XGBoost.

In [None]:
# Sweep over the number of top-ranked features (ranked by the LightGBM
# importances) and record, for each count, the validation log loss and the
# elapsed fit + predict + score time of a default XGBoost classifier.

# Rank the features once, most important first. The stable sort keeps the
# original column order among ties, which matches repeatedly picking the first
# maximum.
importance_order = np.argsort(-np.asarray(important_features_lgb), kind='stable')
ranked_features = [features[idx] for idx in importance_order]

# Recent XGBoost releases require class labels to be exactly 0..n_classes-1
# and raise on anything else, so map the sorted distinct labels onto that
# range. The class order is preserved, so the log loss is unaffected.
_, xgb_labels = np.unique(labels, return_inverse=True)

times = []
scores = []
num_features_counts = []
for num_features in tqdm(range(29, 75, 5)):
    num_features_counts.append(num_features)
    important_features = ranked_features[:num_features]
    condensed_train = train[important_features]

    # Split features and labels together so rows and targets stay aligned
    # explicitly, rather than relying on a matching random_state in another
    # cell. Dedicated names keep the baseline split from being overwritten.
    X_sub_train, X_sub_valid, y_sub_train, y_sub_valid = train_test_split(
        condensed_train.values,
        xgb_labels,
        shuffle=True,
        test_size=0.3,
        random_state=2021,
    )

    # perf_counter() is monotonic and meant for measuring intervals.
    start = time.perf_counter()
    xgb = xgboost.XGBClassifier()
    xgb.fit(X_sub_train, y_sub_train, verbose=False)
    xgb_pred = xgb.predict_proba(X_sub_valid)
    xgb_base_score = log_loss(y_sub_valid, xgb_pred)
    end = time.perf_counter()

    times.append(end - start)
    scores.append(xgb_base_score)

In [None]:
# Score against elapsed time for the XGBoost sweep. `times` and `scores` are
# the lists filled by the preceding loop cell; the title identifies the model,
# since the same figure is drawn for other models in this notebook.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=times, y=scores, ax=ax)
ax.set(xlabel='times', ylabel='scores', title='XGBoost: scores vs. times')
plt.show()

In [None]:
# Score against feature count for the XGBoost sweep. The title identifies the
# model, since the same figure is drawn for other models in this notebook.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=num_features_counts, y=scores, ax=ax)
ax.set(xlabel='number of features', ylabel='scores',
       title='XGBoost: scores vs. number of features')
plt.show()

In [None]:
# Elapsed time against feature count for the XGBoost sweep. The title
# identifies the model, since the same figure is drawn for other models in
# this notebook.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=num_features_counts, y=times, ax=ax)
ax.set(xlabel='number of features', ylabel='times',
       title='XGBoost: times vs. number of features')
plt.show()

The main conclusion I can draw from this is that, for models like XGBoost and CatBoost, the number of features used heavily increases the time needed to train the model. Perhaps when optimizing the parameters of XGBoost or CatBoost, you can use fewer features and, once you have good parameters, train on all the features. I apologize for not including any inference; I plan to add an inference section later.