In [10]:
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

import pandas as pd
import numpy as np
import time
import timeit
import collections
import warnings
# Ignore every FutureWarning raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook and switch to the 'ggplot' style sheet.
%matplotlib inline
plt.style.use('ggplot')

# Preprocessing + Feature Selection
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Model Building
from sklearn.model_selection import train_test_split, StratifiedKFold
import lightgbm as lgb

# Hyperparameter tuning using Bayesian optimization
from skopt import BayesSearchCV
# Sampling distributions used by the randomized-search grid (param_test)
from scipy.stats import randint as sp_randint, uniform as sp_uniform

# Metrics
from sklearn.metrics import accuracy_score
import shap

In [2]:
# Load the two prepared tables from CSV files in the current working directory.
final_df = pd.read_csv(filepath_or_buffer="final_df.csv")
p_final_df = pd.read_csv(filepath_or_buffer="p_final_df.csv")

In [3]:
# Target is the 'virality' column; predictors are every remaining column
# except the three *_id columns.
y = final_df['virality']
X = final_df.drop(columns=['virality', 'tweet_user_id', 'tweet_id', 'user_id'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=314,
)
print('Training set shape ', X_train.shape)
print('Test set shape ', X_test.shape)

Training set shape  (23700, 2944)
Test set shape  (5925, 2944)


In [11]:
# Base LightGBM classifier handed to the hyperparameter search.
clf = lgb.LGBMClassifier(boosting_type='gbdt', n_jobs=-1, verbose=2)

# Search space for BayesSearchCV, in scikit-optimize tuple syntax:
# (low, high) bounds, optionally followed by the name of the prior.
# NOTE(review): LightGBM documents scale_pos_weight as applying to the binary
# and multiclassova objectives only; with the multi_logloss metric used below
# it may be a wasted search dimension -- confirm.
search_spaces = {
    'learning_rate': (0.01, 1.0, 'log-uniform'),
    'num_leaves': (2, 500),
    'max_depth': (0, 500),
    'min_child_samples': (0, 200),
    'max_bin': (100, 100000),
    'subsample': (0.01, 1.0, 'uniform'),
    'subsample_freq': (0, 10),
    'colsample_bytree': (0.01, 1.0, 'uniform'),
    'min_child_weight': (0, 10),
    'subsample_for_bin': (100000, 500000),
    'reg_lambda': (1e-9, 1000, 'log-uniform'),
    'reg_alpha': (1e-9, 1.0, 'log-uniform'),
    'scale_pos_weight': (1e-6, 500, 'log-uniform'),
    'n_estimators': (10, 10000),
}

# Extra keyword arguments for LGBMClassifier.fit.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` fit keyword was removed in LightGBM 4.0, while the
# callback is accepted by older releases as well.
# NOTE(review): the eval set is the held-out test split, so early stopping is
# driven by the same rows that later produce the reported test score --
# consider a separate validation split.
fit_params = {
    'eval_set': (X_test, y_test),
    'eval_metric': 'multi_logloss',
    'callbacks': [lgb.early_stopping(stopping_rounds=10)],
}

# Distributions/grids for a randomized search. Not referenced by any other
# visible cell; sp_randint / sp_uniform are the scipy.stats distributions
# imported in the first cell.
param_test = {
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
}

# Search object fitted by the next cell, which previously relied on an `opt`
# that no visible cell creates. The 2-fold CV and n_jobs=1 mirror the log saved
# under that cell ("Fitting 2 folds for each of 1 candidates",
# "Parallel(n_jobs=1)"). n_iter is left at the library default -- TODO confirm
# the intended search budget.
# fit_params goes through the constructor because the fit cell calls
# opt.fit(X_train, y_train) without extra keyword arguments.
# NOTE(review): newer scikit-optimize releases document passing these via
# opt.fit(..., **fit_params); verify the installed version honours the
# constructor argument.
opt = BayesSearchCV(
    estimator=clf,
    search_spaces=search_spaces,
    fit_params=fit_params,
    cv=StratifiedKFold(n_splits=2, shuffle=True, random_state=314),
    n_jobs=1,
    verbose=1,
    refit=True,
    random_state=314,
)



In [None]:
# Run the hyperparameter search on the training split, then report the best
# cross-validation score, the score on the held-out test split and the winning
# parameter set. Requires the search object `opt` to exist in the kernel.
opt.fit(X=X_train, y=y_train)
print("val. score: %s" % (opt.best_score_,))
print("test score: %s" % (opt.score(X=X_test, y=y_test),))
print("Best parameters: ", opt.best_params_)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




In [None]:
# Fixed LightGBM parameter set for a 5-class 'multiclass' objective scored
# with multi_logloss. Key order matches the original literal.
opt_params = dict(
    # Task definition
    boosting_type='gbdt',
    objective='multiclass',
    metric='multi_logloss',
    num_class=5,
    # Tree size and boosting schedule
    max_depth=7,
    num_leaves=50,
    learning_rate=0.05,
    n_estimators=500,
    min_data_in_leaf=200,
    # Column / row subsampling
    colsample_bytree=0.5,
    subsample=0.5,
    subsample_freq=1,
    # Regularisation and histogram resolution
    reg_alpha=5,
    max_bin=10,
)