In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn.
import warnings
warnings.filterwarnings('ignore')

# Data visualization: seaborn theme first, then the matplotlib style sheet on top of it
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style='darkgrid', color_codes=True)
plt.style.use('fivethirtyeight')
%matplotlib inline


In [None]:
# misc libraries
import random
import timeit
import math 
import collections 


# model building
import lightgbm as lgb
import sklearn.metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score,f1_score, confusion_matrix
from sklearn import preprocessing

In [None]:
# Dataset root folders (Kaggle layout); adjust both when the data lives elsewhere.
tweets_path = '/kaggle/input/viral-tweets/Dataset/Tweets/'
users_path = '/kaggle/input/viral-tweets/Dataset/Users/'

# Training tweets: tabular data plus the media and text feature vectors
train_tweets = pd.read_csv(f'{tweets_path}train_tweets.csv')
train_tweets_vectorized_media = pd.read_csv(f'{tweets_path}train_tweets_vectorized_media.csv')
train_tweets_vectorized_text = pd.read_csv(f'{tweets_path}train_tweets_vectorized_text.csv')

# Test tweets: same three tables
test_tweets = pd.read_csv(f'{tweets_path}test_tweets.csv')
test_tweets_vectorized_media = pd.read_csv(f'{tweets_path}test_tweets_vectorized_media.csv')
test_tweets_vectorized_text = pd.read_csv(f'{tweets_path}test_tweets_vectorized_text.csv')

# Users: tabular data plus description and profile-image feature vectors
users = pd.read_csv(f'{users_path}users.csv')
user_vectorized_descriptions = pd.read_csv(f'{users_path}user_vectorized_descriptions.csv')
user_vectorized_profile_images = pd.read_csv(f'{users_path}user_vectorized_profile_images.csv')

In [None]:
# Print the (rows, columns) shape of every loaded table, grouped by source.
shape_report = [
    ("--------Train Tweets----------------", [
        ("Dimension of Train Tweets : ", train_tweets),
        ("Dimension of Train Tweets Vectorized media : ", train_tweets_vectorized_media),
        ("Dimension of Train Tweets Vectorized text : ", train_tweets_vectorized_text),
    ]),
    ("--------Test Tweets----------------", [
        ("Dimension of Test Tweets : ", test_tweets),
        ("Dimension of Test Tweets Vectorized media : ", test_tweets_vectorized_media),
        ("Dimension of Test Tweets Vectorized text : ", test_tweets_vectorized_text),
    ]),
    ("--------User----------------", [
        ("Dimension of Users : ", users),
        ("Dimension of User vectorized descriptions : ", user_vectorized_descriptions),
        ("Dimension of User vectorized profile images : ", user_vectorized_profile_images),
    ]),
]

for position, (banner, tables) in enumerate(shape_report):
    if position:
        print("\n")  # blank separator between groups
    print(banner)
    for label, frame in tables:
        print(label, frame.shape)


**Preprocessing Train Tweets Data**

In [None]:
train_tweets.head()

In [None]:
train_tweets.info()

In [None]:
# Convert the float-typed count columns to integers.
# astype() casts whole columns at once; the element-wise applymap() it replaces
# is deprecated in recent pandas releases.
cols = ['tweet_hashtag_count', 'tweet_url_count', 'tweet_mention_count']
train_tweets[cols] = train_tweets[cols].astype(np.int64)
train_tweets[cols].head()

In [None]:
sns.countplot(x = 'virality', data = train_tweets, palette="Set1");

In [None]:
# One discrete histogram per count column, stacked vertically.
count_columns = ['tweet_hashtag_count', 'tweet_url_count', 'tweet_mention_count']
fig, axs = plt.subplots(3, 1, figsize=(12, 10))

for axis, column in zip(axs, count_columns):
    sns.histplot(x=column, data=train_tweets, discrete=True, ax=axis)

In [None]:
np.sort(train_tweets.tweet_attachment_class.unique())

In [None]:
train_tweets.isnull().sum()

In [None]:
train_tweets.fillna({'tweet_topic_ids':"['0']"}, inplace=True)

In [None]:
# Correlation heatmap of the numeric / boolean tweet columns.
# train_tweets still holds string columns here (e.g. tweet_topic_ids), and
# DataFrame.corr() in pandas >= 2.0 raises on them instead of silently
# dropping them, so the numeric subset is selected explicitly.
fig, ax = plt.subplots(figsize=(20, 20))
numeric_tweets = train_tweets.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_tweets.corr(), annot=True, ax=ax);

In [None]:
train_tweets.head()

In [None]:
'''temp = train_tweets.groupby(['tweet_user_id']).agg({'tweet_id':['count'],
                                      'tweet_hashtag_count' : ['mean'],
                                      'tweet_url_count': ['mean'],
                                      'tweet_mention_count': ['mean']            
                                      })
temp.columns = ['_'.join(x) for x in temp.columns]
train_tweets = pd.merge(train_tweets,temp,on=['tweet_user_id'],how='left')'''

In [None]:
train_tweets.head()

In [None]:
# Multi-hot encode the topic-id lists stored as strings such as "['12', '34']":
# strip the brackets, split on commas, explode to one id per row, one-hot encode,
# then add the rows back together per original tweet (index level 0).
# groupby(level=0).sum() replaces Series/DataFrame.sum(level=0), which was removed
# in pandas 2.0; the raw string avoids an invalid "\s" escape sequence.
topic_ids = (
    train_tweets['tweet_topic_ids']
    .str.strip('[]')
    .str.split(r'\s*,\s*')
    .explode()
    .str.get_dummies()
    .groupby(level=0).sum()
    .add_prefix('topic_id_')
    .rename(columns=lambda name: name.replace("'", ""))  # drop the quotes left around each id
)

In [None]:
topic_ids.head(2)

In [None]:
# One-hot encode the columns treated as categorical.
# Year, month, day, mention count and language id are deliberately left un-encoded.
hashtag = pd.get_dummies(train_tweets['tweet_hashtag_count'], prefix='hashtag')
url = pd.get_dummies(train_tweets['tweet_url_count'], prefix='url')
attachment = pd.get_dummies(train_tweets['tweet_attachment_class'], prefix='attatchment')

In [None]:
## Cyclical encoding: map the hour of day onto the unit circle so 23h and 0h are neighbours
hour_angle = 2*np.pi*train_tweets['tweet_created_at_hour']/24.0
sin_hour = np.sin(hour_angle).rename('sin_hour')
cos_hour = np.cos(hour_angle).rename('cos_hour')

In [None]:
# Raw columns that are replaced by the engineered features joined below.
# (Year, month, day, mention count and language id are kept in raw form.)
columns_drop = [
    "tweet_topic_ids",
    "tweet_hashtag_count",
    "tweet_url_count",
    "tweet_attachment_class",
    "tweet_created_at_hour",
]

# Engineered feature blocks, joined on train_tweets' row index.
dfs = [
    topic_ids,
    hashtag,
    url,
    attachment,
    sin_hour,
    cos_hour,
]

# `columns=` keyword: the positional axis argument of drop() was removed in pandas 2.0.
train_tweets_final = train_tweets.drop(columns=columns_drop).join(dfs)

train_tweets_final.head()

In [None]:
import seaborn as sns

# Correlation heatmap of the fully encoded training table.
fig, ax = plt.subplots(figsize=(20, 20))
final_corr = train_tweets_final.corr()
sns.heatmap(final_corr, annot=True, ax=ax)

In [None]:
# Correlation of the encoded columns with the target, sorted from most positive to most negative.
# NOTE(review): the positional slice [2:-1] skips the first two entries and the last one —
# presumably the two id columns and one trailing column; confirm it still matches the column order.
df_corr = train_tweets_final.corr()['virality'][2:-1]
top_features = df_corr.sort_values(ascending=False)
top_features

**Preprocessing User Tweets Data**

In [None]:
users.head()

In [None]:
users.info()

In [None]:
users.isnull().sum()

In [None]:
# 2x2 grid of histograms for the user count columns (filled row by row).
user_count_columns = [
    'user_like_count',
    'user_followers_count',
    'user_following_count',
    'user_listed_on_count',
]
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

for axis, column in zip(axs.flat, user_count_columns):
    sns.histplot(users, x=column, ax=axis)

In [None]:
import seaborn as sns

# Correlation heatmap of the user table.
fig, ax = plt.subplots(figsize=(10, 10))
users_corr = users.corr()
sns.heatmap(users_corr, annot=True, ax=ax)

In [None]:
users.user_verified.unique()

In [None]:
# One-hot encoding of the verified flag (computed, but currently not joined below).
user_verified = pd.get_dummies(users.user_verified, prefix='verified')

# Nothing is dropped or joined at the moment: both lists are intentionally empty.
# Candidates are user_created_at_year / user_created_at_month / user_verified,
# with their encoded blocks (year, month, user_verified) added to `dfs`.
columns_drop = []
dfs = []

# `columns=` keyword: the positional axis argument of drop() was removed in pandas 2.0.
users_final = users.drop(columns=columns_drop).join(dfs)

users_final.head()

In [None]:
'''cols = ['user_like_count', 'user_followers_count', 'user_following_count','user_listed_on_count','user_tweet_count']

from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(standardize=False)

for col in cols:
    users_final[col] = pt.fit_transform(users_final[col].values.reshape(-1,1))'''

In [None]:
# Same 2x2 histogram grid as before, drawn from users_final (filled row by row).
user_count_columns = [
    'user_like_count',
    'user_followers_count',
    'user_following_count',
    'user_listed_on_count',
]
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

for axis, column in zip(axs.flat, user_count_columns):
    sns.histplot(users_final, x=column, ax=axis)

In [None]:
#users_final['popularity'] = np.where((users_final['user_followers_count']-users_final['user_following_count'])>0 , 1, 0)

In [None]:
users_final.head()

In [None]:
print("Train Tweets :", train_tweets_final.shape)
print("Users :", users_final.shape)

**Train Tweet Vectorized Media**

In [None]:
# Attach each media vector's tweet-level target: the right join keeps every row of
# the media table, then all tweet columns except `virality` are removed again.
tweet_only_columns = train_tweets.columns.difference(['virality'])
vectorized_media_df = (
    pd.merge(train_tweets, train_tweets_vectorized_media, on='tweet_id', how='right')
    .drop(columns=tweet_only_columns)
)
vectorized_media_df.head()

In [None]:
# Target and candidate predictors: the media-vector columns (names containing "img_").
y = vectorized_media_df['virality']
x = vectorized_media_df.loc[:, vectorized_media_df.columns.str.contains("img_")]

# Feature selector: L1-penalised logistic regression wrapped in SelectFromModel.
# The same selector object is re-fitted by the later feature-selection cells.
sel_model = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))

# Fit the selector and report the wall-clock time it took.
start = timeit.default_timer()
sel_model.fit(x, y)
stop = timeit.default_timer()
print('Time: ', stop - start)

# Boolean mask of the retained columns, and a True/False tally of that mask.
sel_index = sel_model.get_support()
counter = collections.Counter(sel_index)
counter

In [None]:
# Keep only the selected media columns and put the identifiers back in front.
media_ind_df = x.loc[:, sel_index]
media_ids = train_tweets_vectorized_media[['media_id', 'tweet_id']]
train_tweets_media_final = pd.concat([media_ids, media_ind_df], axis=1)
train_tweets_media_final.head()

**Train Tweet Vectorized Text**

In [None]:
train_tweets.head(2)

In [None]:
train_tweets_vectorized_text.head()

In [None]:
# Attach each text vector's tweet-level target: the right join keeps every row of
# the text table, then all tweet columns except `virality` are removed again.
tweet_only_columns = train_tweets.columns.difference(['virality'])
vectorized_text_df = (
    pd.merge(train_tweets, train_tweets_vectorized_text, on='tweet_id', how='right')
    .drop(columns=tweet_only_columns)
)
vectorized_text_df.head()

In [None]:
# Target and candidate predictors: the text-vector columns (names containing "feature_").
y = vectorized_text_df['virality']
x = vectorized_text_df.loc[:, vectorized_text_df.columns.str.contains("feature_")]

# Re-fit the existing selector on the text vectors and report the wall-clock time.
start = timeit.default_timer()
sel_model.fit(x, y)
stop = timeit.default_timer()
print('Time: ', stop - start)

# Boolean mask of the retained columns, and a True/False tally of that mask.
sel_index = sel_model.get_support()
counter = collections.Counter(sel_index)
counter

In [None]:
# Keep only the selected text columns and put the tweet id back in front.
text_ind_df = x.loc[:, sel_index]
text_ids = train_tweets_vectorized_text[['tweet_id']]
train_tweets_text_final = pd.concat([text_ids, text_ind_df], axis=1)
train_tweets_text_final.head()

**User Vectorized Descriptions**

In [None]:
train_tweets.head(2)

In [None]:
user_vectorized_descriptions.head(2)

In [None]:
# Median virality per user, used as the user-level target for feature selection.
# Only the target column is aggregated: taking the median of every column fails
# on the string columns of train_tweets in pandas >= 2.0 (and is wasted work anyway).
average_virality_df = train_tweets.groupby('tweet_user_id')['virality'].median()

# Attach the user-level target to both user vector tables.
# NOTE(review): the right joins keep users that have no training tweets; their
# `virality` is NaN — confirm this cannot happen before fitting on these frames.
descriptions_df = pd.merge(average_virality_df, user_vectorized_descriptions, left_on ='tweet_user_id', right_on = 'user_id', how = 'right')
profile_images_df = pd.merge(average_virality_df, user_vectorized_profile_images, left_on ='tweet_user_id', right_on = 'user_id', how = 'right')
descriptions_df.head()

In [None]:
# Target and candidate predictors: the description-vector columns (names containing "feature_").
y = descriptions_df['virality']
x = descriptions_df.loc[:, descriptions_df.columns.str.contains("feature_")]

# Re-fit the existing selector on the description vectors and report the wall-clock time.
start = timeit.default_timer()
sel_model.fit(x, y)
stop = timeit.default_timer()
print('Time: ', stop - start)

# Boolean mask of the retained columns, and a True/False tally of that mask.
sel_index = sel_model.get_support()
counter = collections.Counter(sel_index)
counter

In [None]:
# Keep only the selected description columns and put the user id back in front.
desc_ind_df = x.loc[:, sel_index]
description_ids = user_vectorized_descriptions[['user_id']]
user_descriptions_final = pd.concat([description_ids, desc_ind_df], axis=1)
user_descriptions_final.head()

**User Vectorized Profile Images**

In [None]:
# Target and candidate predictors: the profile-image columns (names containing "feature_").
y = profile_images_df['virality']
x = profile_images_df.loc[:, profile_images_df.columns.str.contains("feature_")]

# Re-fit the existing selector on the profile-image vectors and report the wall-clock time.
start = timeit.default_timer()
sel_model.fit(x, y)
stop = timeit.default_timer()
print('Time: ', stop - start)

# Boolean mask of the retained columns, and a True/False tally of that mask.
sel_index = sel_model.get_support()
counter = collections.Counter(sel_index)
counter

In [None]:
# Keep only the selected profile-image columns and put the user id back in front.
user_prof_ind_df = x.loc[:, sel_index]
profile_image_ids = user_vectorized_profile_images[['user_id']]
user_profile_images_final = pd.concat([profile_image_ids, user_prof_ind_df], axis=1)
user_profile_images_final.head()

In [None]:
print(train_tweets_final.shape)
print(train_tweets_media_final.shape) # join on tweet id
print(train_tweets_text_final.shape) # join on tweet id
print(users_final.shape) # join on user_id
print(user_profile_images_final.shape) # join on user_id

In [None]:
# media_df = train_tweets_media_final.groupby('tweet_id').mean()

In [None]:
# Prefix the selected text-vector columns with 'text_' so they cannot clash with
# the 'feature_*' columns of the other vector tables.
# startswith() (not contains()) keeps this cell idempotent: an already renamed
# 'text_feature_*' column still *contains* 'feature_' and would be prefixed a
# second time if the cell were re-run.
cols = train_tweets_text_final.columns[train_tweets_text_final.columns.str.startswith('feature_')]
train_tweets_text_final = train_tweets_text_final.rename(columns=dict(zip(cols, 'text_' + cols)))
train_tweets_text_final.head()

In [None]:
# Merge all tables based on the column 'user_id' for user data, and tweet_id
# for tweet data

# join tweets data
#tweet_df = pd.merge(media_df, train_tweets_text_final, on = 'tweet_id', how = 'right')
#tweet_df.fillna(0, inplace=True)

# join users data
#user_df = pd.merge(users_final, user_profile_images_final, on='user_id')

# join tweets data on train_tweets
#tweet_df_final = pd.merge(train_tweets_final, tweet_df, on = 'tweet_id')

# join that with the users data
#final_df = pd.merge(tweet_df_final, user_df, left_on = 'tweet_user_id', right_on='user_id')

#final_df.shape

**Preprocessing Test Data**

In [None]:
test_tweets.head()

In [None]:
test_tweets.isnull().sum()

In [None]:
test_tweets.fillna({'tweet_topic_ids':"['0']"}, inplace=True)

In [None]:
# Convert the float-typed count columns to integers.
# astype() casts whole columns at once; the element-wise applymap() it replaces
# is deprecated in recent pandas releases.
cols = ['tweet_hashtag_count', 'tweet_url_count', 'tweet_mention_count']
test_tweets[cols] = test_tweets[cols].astype(np.int64)
test_tweets[cols].head()

In [None]:
'''temp = test_tweets.groupby(['tweet_user_id']).agg({'tweet_id':['count'],
                                      'tweet_hashtag_count' : ['mean'],
                                      'tweet_url_count': ['mean'],
                                      'tweet_mention_count': ['mean']            
                                      })
temp.columns = ['_'.join(x) for x in temp.columns]
test_tweets = pd.merge(test_tweets,temp,on=['tweet_user_id'],how='left')'''

In [None]:
# Multi-hot encode the topic-id lists stored as strings such as "['12', '34']".
# groupby(level=0).sum() replaces Series/DataFrame.sum(level=0), which was removed
# in pandas 2.0; the raw string avoids an invalid "\s" escape sequence.
topic_ids = (
    test_tweets['tweet_topic_ids']
    .str.strip('[]')
    .str.split(r'\s*,\s*')
    .explode()
    .str.get_dummies()
    .groupby(level=0).sum()
    .add_prefix('topic_id_')
    .rename(columns=lambda name: name.replace("'", ""))  # drop the quotes left around each id
)

# One-hot encode the columns treated as categorical.
# Year, month, day, mention count and language id are deliberately left un-encoded.
hashtag = pd.get_dummies(test_tweets.tweet_hashtag_count, prefix='hashtag')
url = pd.get_dummies(test_tweets.tweet_url_count, prefix='url')
attachment = pd.get_dummies(test_tweets.tweet_attachment_class, prefix='attatchment')

## Cyclical encoding of the hour of day
sin_hour = np.sin(2*np.pi*test_tweets['tweet_created_at_hour']/24.0)
sin_hour.name = 'sin_hour'
cos_hour = np.cos(2*np.pi*test_tweets['tweet_created_at_hour']/24.0)
cos_hour.name = 'cos_hour'

# Raw columns that are replaced by the engineered features joined below.
columns_drop = [
    "tweet_topic_ids",
    "tweet_hashtag_count",
    "tweet_url_count",
    "tweet_attachment_class",
    "tweet_created_at_hour",
]

# Engineered feature blocks, joined on test_tweets' row index.
dfs = [
    topic_ids,
    hashtag,
    url,
    attachment,
    sin_hour,
    cos_hour,
]

# `columns=` keyword: the positional axis argument of drop() was removed in pandas 2.0.
test_tweets_final = test_tweets.drop(columns=columns_drop).join(dfs)

test_tweets_final.head()

In [None]:
# Difference in feature-column counts: train minus test, not counting train's target column.
len(train_tweets_final.columns) - len(test_tweets_final.columns) - 1 # virality column
# NOTE(review): an earlier note here said "train is missing 11 columns from test"; that figure came
# from a past run and cannot be derived from the code — re-check it against the current output.

In [None]:
# Columns present in the encoded test table but absent from the encoded train table.
cols_test = set(test_tweets_final.columns) - set(train_tweets_final.columns)
cols_test # displayed so the test-only columns can be inspected

In [None]:
# Add the test-only columns to train as all-zero features.
# Iterate in sorted order: iteration order of a set of strings can change between
# Python sessions, which would change the resulting column order from run to run.
for col in sorted(cols_test):
    train_tweets_final[col] = 0

In [None]:
# Columns present in train but absent from test.
# NOTE: `cols_test` is reused here for the opposite direction of the set difference.
cols_test = set(train_tweets_final.columns) - set(test_tweets_final.columns)
cols_test.remove('virality') # virality is the target; it must not be added to test
len(cols_test)

In [None]:
# Add the train-only columns to test as all-zero features (sorted for a
# deterministic order; set iteration order can change between sessions).
for col in sorted(cols_test):
    test_tweets_final[col] = 0

# Put the test columns in exactly the train order (minus the target), so that
# models fitted on train see the test features in the same positions.
test_tweets_final = test_tweets_final[train_tweets_final.columns.drop('virality')]

In [None]:
print("Train tweets Final Data : ",train_tweets_final.shape)
print("Test tweets Final Data : ",test_tweets_final.shape)

In [None]:
# Build the test feature tables from the TEST vectors, keeping only the columns
# that feature selection retained on the training data.
# (Previously the test ids were concatenated with media_ind_df / text_ind_df,
# which hold the *training* rows, so every test tweet received training features.)
selected_media_cols = list(media_ind_df.columns)
selected_text_cols = list(text_ind_df.columns)
test_tweets_media_final = test_tweets_vectorized_media[['media_id', 'tweet_id'] + selected_media_cols]
test_tweets_text_final = test_tweets_vectorized_text[['tweet_id'] + selected_text_cols]

# Per-tweet mean of the media vectors; numeric_only skips a non-numeric media_id.
media_df = test_tweets_media_final.groupby('tweet_id').mean(numeric_only=True)

# Same 'text_' prefix as the training table; startswith() keeps the cell idempotent.
cols = test_tweets_text_final.columns[test_tweets_text_final.columns.str.startswith('feature_')]
test_tweets_text_final = test_tweets_text_final.rename(columns=dict(zip(cols, 'text_' + cols)))

# join tweets data
# tweet_df = pd.merge(media_df, test_tweets_text_final, on = 'tweet_id', how = 'right')
# tweet_df.fillna(0, inplace=True)

# join users data
# user_df = pd.merge(users_final, user_profile_images_final, on='user_id')

# join tweets data on train_tweets
# tweet_df_final = pd.merge(test_tweets_final, tweet_df, on = 'tweet_id')

# join that with the users data
# p_final_df = pd.merge(tweet_df_final, user_df, left_on = 'tweet_user_id', right_on='user_id')

# p_final_df.shape

In [None]:
#final_df.to_csv("new_train.csv",index=False)
#p_final_df.to_csv("new_test.csv",index=False)

#from IPython.display import FileLink
#FileLink(r'new_train.csv')
#FileLink(r'new_test.csv')

**Data Grouping**

**Training Data**

In [None]:
train_tweets_final.head()

In [None]:
users_final.head()

In [None]:
#final_df = pd.merge(train_tweets_final, users_final, left_on = 'tweet_user_id', right_on='user_id')

**Dimensionality Reduction for Train Tweets Text Final**

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA on the selected text features and plot the cumulative explained
# variance, to judge how many components are worth keeping.
text_features = train_tweets_text_final.drop(columns=['tweet_id'])
pca = PCA().fit(text_features)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
from sklearn.decomposition import PCA

# Reduce the selected text features to 25 principal components.
# Seeded because PCA can pick a randomized SVD solver for a truncated fit, which
# would make the components differ between runs; this fitted `pca` is reused
# later to transform the test table.
pca = PCA(n_components = 25, random_state = 0)

#Training data
pc = pca.fit_transform(train_tweets_text_final.drop(columns=['tweet_id']))
principal_train_tweets_text_final = pd.DataFrame(data = pc).add_prefix('text_')

In [None]:
principal_train_tweets_text_final['tweet_id'] = train_tweets_text_final['tweet_id']

# final_df = pd.merge(final_df, principal_train_tweets_text_final, on = 'tweet_id')

In [None]:
principal_train_tweets_text_final.head()

**Dimensionality Reduction for Train Tweets Media Final**

In [None]:
# Fit a full PCA on the selected media features and plot the cumulative explained
# variance, to judge how many components are worth keeping.
media_features = train_tweets_media_final.drop(columns=['tweet_id', 'media_id'])
pca1 = PCA().fit(media_features)
cumulative_variance = np.cumsum(pca1.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
from sklearn.decomposition import PCA

# Reduce the selected media features to 10 principal components.
# Seeded because PCA can pick a randomized SVD solver for a truncated fit, which
# would make the components differ between runs; this fitted `pca1` is reused
# later to transform the test table.
pca1 = PCA(n_components = 10, random_state = 0)

#Training data
pc1 = pca1.fit_transform(train_tweets_media_final.drop(columns=['tweet_id', 'media_id']))
principal_train_tweets_media_final = pd.DataFrame(data = pc1).add_prefix('media_')

In [None]:
principal_train_tweets_media_final['tweet_id'] = train_tweets_media_final['tweet_id']

In [None]:
media_df = principal_train_tweets_media_final.groupby('tweet_id').mean()

In [None]:
media_df.head()

**Dimensionality Reduction for User Profile Images**

In [None]:
user_profile_images_final.head()

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA on the selected profile-image features and plot the cumulative
# explained variance, to judge how many components are worth keeping.
profile_features = user_profile_images_final.drop(columns=['user_id'])
pca2 = PCA().fit(profile_features)
cumulative_variance = np.cumsum(pca2.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
from sklearn.decomposition import PCA

# Reduce the selected profile-image features to 10 principal components.
# Seeded because PCA can pick a randomized SVD solver for a truncated fit, which
# would make the components differ between runs.
pca2 = PCA(n_components = 10, random_state = 0)

#Training data
pc2 = pca2.fit_transform(user_profile_images_final.drop(columns=['user_id']))
principal_user_profile_images_final = pd.DataFrame(data = pc2).add_prefix('userprofile_')

In [None]:
principal_user_profile_images_final['user_id'] = user_profile_images_final['user_id']

In [None]:
principal_user_profile_images_final.head()

In [None]:
# Assemble the modelling table. The per-tweet media components (media_df) are
# currently not merged in.

# users + profile-image components
user_df = users_final.merge(principal_user_profile_images_final, on='user_id')

# tweets + text components
tweet_df_final = train_tweets_final.merge(principal_train_tweets_text_final, on='tweet_id')

# tweets + their author's user features
final_df = tweet_df_final.merge(user_df, left_on='tweet_user_id', right_on='user_id')

In [None]:
final_df.shape

**Testing Data**

In [None]:
test_tweets_final.head()

In [None]:
#p_final_df = pd.merge(test_tweets_final, users_final, left_on = 'tweet_user_id', right_on='user_id')

**Dimensionality Reduction for Test Tweets Text Final**

In [None]:
# Project the test text features with the PCA fitted on the training data,
# then put the tweet id back.
test_text_features = test_tweets_text_final.drop(columns=['tweet_id'])
pc = pca.transform(test_text_features)
principal_test_tweets_text_final = pd.DataFrame(pc).add_prefix('text_')

principal_test_tweets_text_final['tweet_id'] = test_tweets_text_final['tweet_id']

In [None]:
# p_final_df = pd.merge(p_final_df, principal_test_tweets_text_final, on = 'tweet_id')

**Dimensionality Reduction for Test Tweets Media Final**

In [None]:
# Project the test media features with the PCA fitted on the training data.
test_media_features = test_tweets_media_final.drop(columns=['tweet_id', 'media_id'])
pc1 = pca1.transform(test_media_features)
principal_test_tweets_media_final = pd.DataFrame(pc1).add_prefix('media_')

In [None]:
principal_test_tweets_media_final.head()

In [None]:
principal_test_tweets_media_final['tweet_id'] = test_tweets_media_final['tweet_id']

media_df = principal_test_tweets_media_final.groupby('tweet_id').mean()

In [None]:
# Assemble the test table the same way as the training table. The per-tweet media
# components (media_df) are currently not merged in.

# tweets + text components
tweet_df_final = test_tweets_final.merge(principal_test_tweets_text_final, on='tweet_id')

# tweets + their author's user features
p_final_df = tweet_df_final.merge(user_df, left_on='tweet_user_id', right_on='user_id')

In [None]:
print(final_df.shape , p_final_df.shape)

**Building Model**

In [None]:
# Target, and features with the target and the tweet/user identifiers removed.
y = final_df['virality']
X = final_df.drop(columns=['virality', 'tweet_id', 'user_id'])

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
for label, part in (('Training set shape ', X_train), ('Test set shape ', X_test)):
    print(label, part.shape)

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix as cm

# Baseline: logistic regression with default settings.
lr = LogisticRegression().fit(X_train, y_train)

# Predict on the hold-out split and report accuracy.
y_pred_lr = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_lr)
print('Logistic Regression accuracy score: {0:0.4f}'.format(accuracy))

In [None]:
print("----------Confusion Matrix Logistic Regression---------\n")
print(cm(y_test,y_pred_lr))

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; seeded so the reported accuracy
# is reproducible across runs (same seed as the train/test split).
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train,y_train)

#prediction on the test dataset
y_pred_rfc = rfc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_rfc)
print('Random Forest Model accuracy score: {0:0.4f}'.format(accuracy))

In [None]:
print("----------Confusion Matrix Random Forest---------\n")
print(cm(y_test,y_pred_rfc))

**Light GBM**

In [None]:
from sklearn.metrics import accuracy_score

# LightGBM classifier with default hyper-parameters.
clf = lgb.LGBMClassifier().fit(X_train, y_train)

# Predict on the hold-out split and report accuracy.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

In [None]:
print("----------Confusion Matrix Light GBM---------\n")
print(cm(y_test,y_pred))

In [None]:
print('Precision score: %.2f%%' % (precision_score(y_test, y_pred, average= 'weighted')*100))

In [None]:
# Pair each feature with its LightGBM importance and plot the 50 largest.
importance_pairs = sorted(zip(clf.feature_importances_, X.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
top_50 = feature_imp.sort_values(by="Value", ascending=False).head(50)

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=top_50, palette="Blues_d");

**CatBoost**

In [None]:
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# CatBoost with early stopping; both the training and the hold-out split are
# passed as evaluation sets and progress is logged every 200 iterations.
m = CatBoostClassifier(
    n_estimators=5000,
    random_state=1994,
    eval_metric='Accuracy',
    learning_rate=0.03,
    max_depth=5,
)
m.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=200,
    verbose=200,
)

# Accuracy on the hold-out split.
catboost_predictions = m.predict(X_test)
print(accuracy_score(catboost_predictions, y_test))


In [None]:
# Pair each feature with its CatBoost importance and plot the 50 largest.
importance_pairs = sorted(zip(m.feature_importances_, X.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
top_50 = feature_imp.sort_values(by="Value", ascending=False).head(50)

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=top_50, palette="Blues_d");

**Extra Tree Classifier**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble; seeded so the reported accuracy is reproducible across
# runs (same seed as the train/test split).
etc = ExtraTreesClassifier(n_estimators =700, max_depth =100, random_state=0)
etc.fit(X_train, y_train)

#prediction on the test dataset
y_pred = etc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('ETC Model accuracy score: {0:0.4f}'.format(accuracy))

In [None]:
# Pair each feature with its extra-trees importance and plot the 50 largest.
importance_pairs = sorted(zip(etc.feature_importances_, X.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
top_50 = feature_imp.sort_values(by="Value", ascending=False).head(50)

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=top_50, palette="Blues_d");

**Stack Ensemble**

In [None]:
'''from sklearn.ensemble import StackingClassifier
estimator=[("light gbm",clf),("catboost",m), ("etc",etc)]
sc = StackingClassifier(estimators=estimator)

sc.fit(X_train,y_train)

#prediction on the test dataset
y_pred = sc.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test)
print('Stacking Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))'''

**Model Prediction on Testing Data**

In [None]:
p_final_df.head()

In [None]:
# Feed the model exactly the training feature columns, in training order.
# Selecting by X.columns (instead of dropping the id columns) guards against the
# train and test tables carrying their columns in a different order, and fails
# loudly with a KeyError if a training feature is missing from the test table.
test = p_final_df[X.columns]

# Predict with the CatBoost model and pair every prediction with its tweet id
# (both frames use the default 0..n-1 row index, so they align row by row).
solution = m.predict(test)
solution_df = pd.concat(
    [p_final_df[['tweet_id']], pd.DataFrame(solution, columns = ['virality'])],
    axis=1,
)
solution_df.head()

In [None]:
test_tweets.head()

In [None]:
# join tweets data on train_tweets
sub1 = pd.merge(test_tweets, solution_df, on = 'tweet_id')

In [None]:
sub1 = sub1[['tweet_id','virality']]
sub1.head()

In [None]:
sub1.to_csv("solution20.csv",index=False)

In [None]:
from IPython.display import FileLink
FileLink(r'solution20.csv')