# Viral Tweets Prediction Challenge
Develop a machine learning model to predict the virality level of each tweet based on attributes such as tweet content, media attached to the tweet, and date/time published.

## Import libraries

In [2]:
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import random
import timeit
import math 
import collections 
from datetime import datetime 

## Load Data

In [3]:
# Read the two prepared feature tables (paths are relative to the working directory).
# final_df   -> labelled rows; its `virality` column is used as the target below.
# p_final_df -> rows that receive predictions in the last cell of the notebook.
final_df, p_final_df = (
    pd.read_csv(csv_name) for csv_name in ("final_df.csv", "p_final_df.csv")
)

# Build Model

In [4]:
# Columns excluded from the feature matrix: the label itself plus the ID columns.
non_feature_cols = ['virality', 'tweet_user_id', 'tweet_id', 'user_id']

y = final_df['virality']
X = final_df.drop(columns=non_feature_cols)

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
print(f'Training set shape  {X_train.shape}')
print(f'Test set shape  {X_test.shape}')

Training set shape  (20737, 2944)
Test set shape  (8888, 2944)


# Light GBM model

In [21]:
# LightGBM hyper-parameters, grouped by the trade-off each one controls.
params = {
    # Left at the library defaults for now:
    # 'boosting_type': 'gbdt',
    # 'n_estimators': 100,

    # Model efficiency
    'num_leaves': 150,  # kept below 2^max_depth (2^10 = 1024)
    'max_depth': 10,

    # Faster speed
    'max_bin': 200,  # smaller for faster speed, larger for better accuracy
    'learning_rate': 0.003,  # small
}

# Tuning notes
#   Accuracy:     large max_bin, large num_leaves
#   Over-fitting: small max_bin, small num_leaves

In [22]:
# Start from a default LightGBM classifier and apply the hyper-parameters chosen above.
# set_params returns the estimator itself, so the call can be chained.
clf = lgb.LGBMClassifier().set_params(**params)
clf  # display the configured estimator

LGBMClassifier(learning_rate=0.003, max_bin=200, max_depth=10, num_leaves=150)

In [None]:
# Train the classifier on the training split only; X_test / y_test stay untouched for evaluation.
clf.fit(X=X_train, y=y_train)

In [4]:
#train_data = lgb.Dataset(X_train,label=y_train)

In [11]:
# Disabled experiment: train through the native `lgb.train` API (instead of the
# scikit-learn wrapper used above) and time the run with `datetime.now()`.
# NOTE(review): it reads `train_data`, which is only created in the commented-out
# cell above, so that line must be re-enabled first.
#training our model using light gbm
#num_round = 50
#start = datetime.now()
#clf = lgb.train(params, train_data, num_round)

# NOTE(review): the next three lines look like leftover keyword arguments for an
# `lgb.train(...)` call rather than standalone statements; `lgb_train` and
# `lgb_eval` are not defined in any visible cell — confirm before re-enabling.
#num_boost_round=20,
#valid_sets=[lgb_train, lgb_eval]
#early_stopping_rounds=5
#stop = datetime.now()

In [12]:
#Execution time of the model
#execution_time_lgbm = stop - start
#print("Execution time of LGBM model:", execution_time_lgbm)

In [None]:
# Evaluate the fitted classifier on the held-out split.
# Accuracy is computed from the predicted labels. ROC AUC is a ranking metric, so it
# is computed from the predicted class probabilities rather than from hard labels
# (hard labels collapse the ROC curve to one threshold, and are rejected outright by
# roc_auc_score when the target has more than two classes).
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)  # shape: (n_samples, n_classes)

print('Accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

if y_proba.shape[1] == 2:
    # Binary target: score with the probability of the second (positive) class.
    test_auc = roc_auc_score(y_test, y_proba[:, 1])
else:
    # Multi-class target: one-vs-rest AUC, macro-averaged over classes (the default).
    # `labels` ties each probability column to the class it belongs to.
    test_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=clf.classes_)
print('roc_auc score: {0:0.4f}'.format(test_auc))

In [None]:
# Identify feature importance
# Feature names are taken from X_train (the frame the model was fitted on) rather than
# the global `X`, which the prediction cell at the end of the notebook rebinds.
# Building the frame from a dict raises if the number of importances and the number
# of names differ, instead of silently truncating as zip() would.
feature_imp = (
    pd.DataFrame({'Value': clf.feature_importances_, 'Feature': X_train.columns})
    .sort_values(by=['Value', 'Feature'])  # same ascending order as sorting (value, name) pairs
    .reset_index(drop=True)
)
top_features = feature_imp.sort_values(by='Value', ascending=False).head(10)

plt.figure(figsize=(20, 10))
sns.barplot(x='Value', y='Feature', data=top_features, color='blue')
plt.title('Top 10 LightGBM feature importances')
plt.show()

# Predict on Test data

In [None]:
# Build the feature matrix for the rows in p_final_df.
# A separate name is used so the training feature frame `X` is not overwritten.
X_submission = p_final_df.drop(columns=['tweet_user_id', 'tweet_id', 'user_id'])

# Fail loudly if a feature the model was trained on is absent, then select the
# training columns explicitly, in training order, so the predictions do not depend on
# the two CSV files happening to share a column layout. Columns that were not used in
# training are dropped by this selection.
missing_cols = X_train.columns.difference(X_submission.columns)
if len(missing_cols) > 0:
    raise ValueError(
        'p_final_df is missing {} feature column(s) used in training, e.g. {}'.format(
            len(missing_cols), list(missing_cols[:5])
        )
    )
X_submission = X_submission[X_train.columns]

solution = clf.predict(X_submission)

# assign() attaches the predictions positionally, keeping each prediction next to its
# tweet_id regardless of what index p_final_df carries.
solution_df = p_final_df[['tweet_id']].assign(virality=solution)
solution_df.head()