# Viral Tweets Prediction Challenge
Develop a machine learning model to predict the virality level of each tweet based on attributes such as tweet content, media attached to the tweet, and date/time published.

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import math 
import collections 
import time
import timeit
from datetime import datetime 
import warnings
import sys
import random

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

#building models
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import shap
warnings.simplefilter(action='ignore', category=FutureWarning)

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

In [2]:
# Function takes the minimum and the maximum of each column and changes the data type to what is optimal for the column.
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that spans their values.

    Only columns whose dtype is one of int8/16/32/64 or float16/32/64 are
    touched; every other column (objects, booleans, unsigned ints, ...) is left
    unchanged. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the reduction.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their range fits.
        float16 keeps only about three significant decimal digits, so pass
        ``False`` to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Anything that fits no narrower type (including all-NaN or
            # infinite ranges, where the comparisons are false) ends up float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Load Data

In [1]:
%%time
#final_df = reduce_mem_usage(pd.read_csv("s3://daanmatchdatafiles/bitgrit/final_df.csv"))
#p_final_df = reduce_mem_usage(pd.read_csv("s3://daanmatchdatafiles/bitgrit/p_final_df.csv"))
final_df = reduce_mem_usage(pd.read_csv("final_df.csv"))
p_final_df = reduce_mem_usage(pd.read_csv("p_final_df.csv"))
print("Shape of train set: ", final_df.shape)
print("Shape of test set: ", p_final_df.shape)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


# Build Model

In [5]:
# Target is the 'virality' column; predictors are everything except the
# target and the three identifier columns.
y = final_df['virality']
X = final_df.drop(columns=['virality', 'tweet_user_id', 'tweet_id', 'user_id'])

# Train-Test Split: hold out 30% of the rows, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
print('Training set shape ', X_train.shape)
print('Test set shape ', X_test.shape)

Training set shape  (20737, 2944)
Test set shape  (8888, 2944)


# Tune parameters

Prepare a Bayesian optimization objective function to find optimal parameters for the LightGBM model.

In [1]:
def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Objective for the Bayesian optimiser: best mean cross-validated AUC.

    The optimiser proposes continuous values, so integer-valued settings are
    rounded and bounded settings are clipped before being handed to LightGBM.

    Reads three module-level names that must be defined before the optimiser
    runs: ``train_data`` (the dataset passed to ``lgb.cv``), ``n_folds`` and
    ``random_seed``.

    NOTE(review): the objective is fixed to 'binary'; confirm that the target
    really is a two-class label.

    Parameters
    ----------
    num_leaves, max_depth : float
        Rounded to the nearest integer.
    feature_fraction, bagging_fraction : float
        Clipped to the interval [0, 1].
    lambda_l1, lambda_l2 : float
        Clipped below at 0.
    min_split_gain, min_child_weight : float
        Passed through unchanged.

    Returns
    -------
    float
        The maximum of the per-iteration mean AUC reported by ``lgb.cv``.
    """
    params = {
        'application': 'binary',
        'num_iterations': 4000,
        'learning_rate': 0.05,
        'early_stopping_round': 100,
        'metric': 'auc',
        'num_leaves': round(num_leaves),
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'bagging_fraction': max(min(bagging_fraction, 1), 0),
        'max_depth': round(max_depth),
        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
    }

    # Log progress every 200 iterations through a callback instead of the
    # `verbose_eval` keyword, which newer LightGBM releases no longer accept.
    # Older releases expose the same callback factory as `print_evaluation`.
    make_logger = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')
    cv_result = lgb.cv(
        params,
        train_data,
        nfold=n_folds,
        seed=random_seed,
        stratified=True,
        metrics=['auc'],
        callbacks=[make_logger(200)],
    )

    # The result key is 'auc-mean' in older releases and carries a dataset
    # prefix (e.g. 'valid auc-mean') in newer ones; accept either spelling.
    auc_key = next(key for key in cv_result if key.endswith('auc-mean'))
    return max(cv_result[auc_key])

In [None]:
# Cross-validation inputs read by lgb_eval as module-level names. They were
# not defined anywhere in the notebook, so the optimiser failed with a
# NameError on a fresh kernel.
# NOTE(review): n_folds and random_seed are placeholder choices; adjust as needed.
n_folds = 5
random_seed = 0
# Built from the training split only, so the held-out split stays unseen
# during tuning. The raw data is kept because the dataset is reused across
# many lgb.cv calls.
train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)

# Set the range for each parameter
lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                        'feature_fraction': (0.1, 0.9),
                                        'bagging_fraction': (0.8, 1),
                                        'max_depth': (5, 8.99),
                                        'lambda_l1': (0, 5),
                                        'lambda_l2': (0, 3),
                                        'min_split_gain': (0.001, 0.1),
                                        'min_child_weight': (5, 50)}, random_state=0)

In [None]:
# Search budget: random exploration points, then guided iterations.
# These names were not defined anywhere in the notebook.
# NOTE(review): the values are placeholders; each evaluation runs a full
# cross-validation, so raise them only if the runtime is acceptable.
init_round = 5
opt_round = 10
lgbBO.maximize(init_points=init_round, n_iter=opt_round)

In [None]:
# Best parameter set found so far. Current bayes_opt releases expose it as
# the `max` property ({'target': ..., 'params': ...}); the older
# `res['max']['max_params']` layout no longer exists there.
lgbBO.max['params']

In [22]:
# `opt_params` was not defined anywhere in the notebook. These values are
# taken from the estimator repr recorded in this cell's saved output.
opt_params = {
    'learning_rate': 0.003,
    'max_bin': 200,
    'max_depth': 10,
    'num_leaves': 150,
}
clf = lgb.LGBMClassifier()
clf.set_params(**opt_params)
clf

LGBMClassifier(learning_rate=0.003, max_bin=200, max_depth=10, num_leaves=150)

In [None]:
# Train the classifier on the training split only.
clf.fit(X=X_train, y=y_train)

In [None]:
# Prediction on the test dataset
y_pred = clf.predict(X_test)

# ROC AUC is a ranking metric, so it needs predicted probabilities rather
# than hard class labels. Use the positive-class column for a two-class
# target and one-vs-rest averaging when there are more classes.
y_proba = clf.predict_proba(X_test)
if y_proba.shape[1] == 2:
    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
else:
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=clf.classes_)

# Base accuracy 66.45%
print('Accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
print('roc_auc score: {0:0.4f}'.format(roc_auc))

In [None]:
# Identify feature importance
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importances_,X.columns)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False)[:10], color='blue')
plt.show()

# Predict on Test data

In [None]:
# Drop the identifier columns from the competition test frame.
# Note that this rebinds `X`, which previously held the training features.
X = p_final_df.drop(columns=['tweet_user_id', 'tweet_id', 'user_id'])

# Predict a label for every test row and place it next to its tweet_id.
solution = clf.predict(X)
solution_df = pd.concat(
    [p_final_df[['tweet_id']], pd.DataFrame({'virality': solution})],
    axis=1,
)
solution_df.head()

In [None]:
#solution_df.to_csv('solution.csv', index=False)