In [1]:
import os
# Kaggle boilerplate: print the full path of every file found under the
# input directory (prints nothing if the directory does not exist).
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import pandas as pd
import numpy as np
import time
import timeit
import collections
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Preprocessing + Feature Selection
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Model Building
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Hyperparameter tuning using Bayesian optimization
from skopt import BayesSearchCV

# Metrics
from sklearn.metrics import accuracy_score
import shap

In [2]:
# Read the two CSV files from the current working directory, in this order.
final_df, p_final_df = map(pd.read_csv, ("final_df.csv", "p_final_df.csv"))

In [3]:
# Feature matrix: everything except the target and the three *_id columns.
X = final_df.drop(columns=['virality', 'tweet_user_id', 'tweet_id', 'user_id'])
# Target vector.
y = final_df['virality']

# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=314,
)
print('Training set shape ', X_train.shape)
print('Test set shape ', X_test.shape)

Training set shape  (23700, 2944)
Test set shape  (5925, 2944)


In [4]:
# Base multiclass LightGBM classifier; the hyper-parameter search overrides
# the settings listed in `search_space`.
clf = lgb.LGBMClassifier(objective='multiclass', metric='multi_logloss', num_class=5,)

# Search space in skopt's shorthand notation:
#   * 2-tuple of ints   -> integer range (inclusive bounds)
#   * 2-tuple of floats -> real-valued range
#   * list of strings   -> categorical choices
# NOTE(review): the 3-float tuple for 'feature_fraction' is presumably read by
# skopt as three discrete choices rather than a range -- confirm against the
# installed scikit-optimize version.
search_space = {
    'num_leaves': (13, 31),
    'boost': ['gbdt', 'goss'],
    'learning_rate': (0.008, 0.01),
    'max_depth': (8, 16),
    'feature_fraction': (0.05, 0.10, 0.25),
}

# Early stopping must not be driven by the test set: the same rows are later
# used to report the "test score", so monitoring them during training leaks
# test information into model selection. Carve a validation split out of the
# training data instead and leave X_test / y_test untouched until the end.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=314,
    stratify=y_train,
)
# NOTE(review): X_val is a subset of X_train. To keep these rows completely
# unseen by the boosted trees, run the search on (X_fit, y_fit).

# Extra keyword arguments forwarded to the estimator's fit() on every
# candidate evaluated by the search.
fit_params = {
    'eval_set': [(X_val, y_val)],   # list of (X, y) pairs, as LightGBM documents
    'eval_metric': 'multi_logloss',
    'early_stopping_rounds': 10,
}

In [5]:
# Bayesian hyper-parameter search: `clf` is tuned over `search_space` with
# 2-fold cross-validation; `fit_params` is handed to the search object as-is.
opt = BayesSearchCV(
    clf,
    search_space,
    fit_params=fit_params,
    cv=2,
    n_jobs=16,
    verbose=-1,
    random_state=0,
)

In [None]:
# Run the hyper-parameter search on the training split.
opt.fit(X_train, y_train)

# Best score found by the search (per scikit-learn's search API this is the
# mean cross-validated score of the best candidate).
print("val. score: %s" % (opt.best_score_,))
# Score of the fitted search object on the X_test / y_test split.
print("test score: %s" % (opt.score(X_test, y_test),))
# Hyper-parameter values of the best candidate.
print("Best parameters: ", opt.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3

In [None]:
# Hand-written LightGBM parameter set. Keys are LightGBM parameter names or
# aliases; key order is kept as originally written.
opt_params = dict(
    # Task definition
    boosting_type='gbdt',
    objective='multiclass',
    metric='multi_logloss',
    num_class=5,
    # Tree shape
    max_depth=7,
    num_leaves=50,
    # Boosting schedule
    learning_rate=0.05,
    n_estimators=500,
    # Regularisation / sampling
    min_data_in_leaf=200,
    colsample_bytree=0.5,
    subsample=0.5,
    subsample_freq=1,
    reg_alpha=5,
    # Histogram binning
    max_bin=10,
)