In [1]:
import pandas as pd
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Shorthand object for slicing MultiIndex-ed frames with .loc.
idx = pd.IndexSlice
# Data folder one level above the notebook. Built with os.path so the path
# separator is correct on every OS (a literal backslash only resolves on Windows).
data_fol = os.path.join(os.pardir, "data_files")

In [45]:
from sklearn.model_selection import train_test_split

In [39]:
# Read the engagement table and the three TF-IDF feature tables from data_fol.
# The file names are listed in the same order as the variables they populate.
df_eng, df_bow, df_2g, df_23g = (
    pd.read_pickle(os.path.join(data_fol, pickle_name))
    for pickle_name in (
        "dataset_no_outliers_engagement.pkl",
        "tfidf_BoW_180.pkl",
        'tfidf_2grams_180.pkl',
        'tfidf_23gram_180.pkl',
    )
)

In [40]:
# (rows, columns) of each loaded frame, in load order.
tuple(frame.shape for frame in (df_eng, df_bow, df_2g, df_23g))

((17924, 11), (17924, 180), (17924, 180), (17924, 180))

#### First, preprocess the engagement statistics dataframe

In [4]:
# Preview the first rows of the engagement frame before it is reshaped.
df_eng.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,quote_count,reply_count,retweet_count,favorite_count,user_followers_count,user_verified,text,media_count,media_types,active_engagement,passive_engagement
tid,uid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1333476068192366593,2152363202,0,2,3,16,6070,False,"Teen pregnancy is high, HIV infection rate is ...",0,none,2,19
1364161232270487553,1331332400836726785,0,0,0,0,62,False,Even though it was a charity. Stevens thought...,0,none,0,0
1364161201291153414,150301804,0,0,0,0,1044,False,SCORA\nStanding Committee on Sexual &amp; Repr...,0,none,0,0
1364161184505737217,1317091162159747075,0,0,0,0,15,False,"many females are HIV+, we wish you well, bless...",0,none,0,0
1363439109948149760,1353391442408792066,0,0,2,24,184,False,@TheRustler83 Yep. Imagine if the government w...,0,none,0,26


In [9]:
# Move the index levels back into ordinary columns, then discard the raw
# interaction counters, the user id and the tweet text.
engagement = (
    df_eng
    .reset_index()
    .drop(
        ['quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'uid', 'text'],
        axis="columns",
    )
)
engagement.head()

Unnamed: 0,tid,user_followers_count,user_verified,media_count,media_types,active_engagement,passive_engagement
0,1333476068192366593,6070,False,0,none,2,19
1,1364161232270487553,62,False,0,none,0,0
2,1364161201291153414,1044,False,0,none,0,0
3,1364161184505737217,15,False,0,none,0,0
4,1363439109948149760,184,False,0,none,0,26


In [10]:
# Combine both engagement measures into a single score:
# 25% active engagement plus 75% passive engagement.
engagement = engagement.assign(
    all_engagement=0.25 * engagement['active_engagement'] + 0.75 * engagement['passive_engagement']
)

In [27]:
# Cast the verified flag to integers (False/True become 0/1).
engagement = engagement.astype({'user_verified': int})

In [18]:
# Turn the "none" placeholder into a missing value so that the one-hot
# encoding of media_types does not create a column for it.
engagement = engagement.assign(
    media_types=engagement['media_types'].replace(to_replace="none", value=np.nan)
)

In [23]:
# One-hot encode the media type; rows whose media type is missing get all zeros.
# The dtype is pinned because pandas >= 2.0 returns boolean dummy columns by
# default, whereas the rest of this notebook works with 0/1 uint8 columns.
dummies_df = pd.get_dummies(engagement['media_types'], prefix='media', dtype=np.uint8)
dummies_df

Unnamed: 0,media_animated_gif,media_photo,media_video
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
17919,0,0,0
17920,0,0,0
17921,0,0,0
17922,0,0,0


In [28]:
# Append the one-hot media columns to the right of the existing columns.
engagement = pd.concat((engagement, dummies_df), axis="columns")
engagement.head()

Unnamed: 0,tid,user_followers_count,user_verified,media_count,media_types,active_engagement,passive_engagement,all_engagement,media_animated_gif,media_photo,media_video
0,1333476068192366593,6070,0,0,,2,19,14.75,0,0,0
1,1364161232270487553,62,0,0,,0,0,0.0,0,0,0
2,1364161201291153414,1044,0,0,,0,0,0.0,0,0,0
3,1364161184505737217,15,0,0,,0,0,0.0,0,0,0
4,1363439109948149760,184,0,0,,0,26,19.5,0,0,0


In [30]:
# Remove the original media label and the two component scores, which have
# been encoded as the media_* dummy columns and all_engagement respectively.
engagement = engagement.drop(columns=['media_types', 'active_engagement', 'passive_engagement'])

In [34]:
# Use the `tid` column as the row index again.
engagement = engagement.set_index('tid')

In [35]:
# Preview the preprocessed engagement frame, now indexed by `tid`.
engagement.head()

Unnamed: 0_level_0,user_followers_count,user_verified,media_count,all_engagement,media_animated_gif,media_photo,media_video
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1333476068192366593,6070,0,0,14.75,0,0,0
1364161232270487553,62,0,0,0.0,0,0,0
1364161201291153414,1044,0,0,0.0,0,0,0
1364161184505737217,15,0,0,0.0,0,0,0
1363439109948149760,184,0,0,19.5,0,0,0


In [36]:
# Check column dtypes and non-null counts after preprocessing.
engagement.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17924 entries, 1333476068192366593 to 1343490641230233601
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_followers_count  17924 non-null  Int64  
 1   user_verified         17924 non-null  int32  
 2   media_count           17924 non-null  Int64  
 3   all_engagement        17924 non-null  float64
 4   media_animated_gif    17924 non-null  uint8  
 5   media_photo           17924 non-null  uint8  
 6   media_video           17924 non-null  uint8  
dtypes: Int64(2), float64(1), int32(1), uint8(3)
memory usage: 717.7 KB


In [37]:
# Summary statistics for every column of the preprocessed frame.
engagement.describe()

Unnamed: 0,user_followers_count,user_verified,media_count,all_engagement,media_animated_gif,media_photo,media_video
count,17924.0,17924.0,17924.0,17924.0,17924.0,17924.0,17924.0
mean,16839.21,0.059864,0.031522,8.245746,0.002008,0.023042,0.002957
std,205426.6,0.237241,0.204724,29.160635,0.044772,0.15004,0.054299
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,139.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,716.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,2934.25,0.0,0.0,4.25,0.0,0.0,0.0
max,18042460.0,1.0,4.0,463.25,1.0,1.0,1.0


In [42]:
# Place the bag-of-words TF-IDF columns next to the engagement columns;
# rows are aligned on the index.
engagement_bow = pd.concat((engagement, df_bow), axis="columns")
engagement_bow.head()

Unnamed: 0_level_0,user_followers_count,user_verified,media_count,all_engagement,media_animated_gif,media_photo,media_video,2021,80,access,...,want,way,week,woman,work,world,year,yes,young,youre
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1333476068192366593,6070,0,0,14.75,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161232270487553,62,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161201291153414,1044,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161184505737217,15,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1363439109948149760,184,0,0,19.5,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Place the bigram TF-IDF columns next to the engagement columns;
# rows are aligned on the index.
engagement_2g = pd.concat((engagement, df_2g), axis="columns")
engagement_2g.head()

Unnamed: 0_level_0,user_followers_count,user_verified,media_count,all_engagement,media_animated_gif,media_photo,media_video,260 rape,30 year,40 year,...,virus hiv,wife husbandsall,woman girl,woman hiv,woman living,work hiv,world test,year ago,year old,young people
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1333476068192366593,6070,0,0,14.75,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161232270487553,62,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161201291153414,1044,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161184505737217,15,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1363439109948149760,184,0,0,19.5,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Place the bigram+trigram TF-IDF columns next to the engagement columns;
# rows are aligned on the index.
engagement_23g = pd.concat((engagement, df_23g), axis="columns")
engagement_23g.head()

Unnamed: 0_level_0,user_followers_count,user_verified,media_count,all_engagement,media_animated_gif,media_photo,media_video,260 rape,260 rape victim,30 year,...,wife husbandsall usb,woman girl,woman girl hivaids,woman hiv,woman living,work hiv,world test,world test hiv,year ago,young people
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1333476068192366593,6070,0,0,14.75,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161232270487553,62,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161201291153414,1044,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1364161184505737217,15,0,0,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1363439109948149760,184,0,0,19.5,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
def get_train_test_split(dataframe, y_col, test_size=.33, random_state=42):
    """Split a dataframe into train/test feature and target parts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding both the feature columns and the target column(s).
    y_col : str or list of str
        Column label(s) used as the target; removed from the feature table.
    test_size : float or int, default .33
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split``. The default keeps the
        seed that used to be hard-coded, so existing callers get the same split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``y_col`` is not a column of ``dataframe`` (raised by pandas).
    """
    # drop() returns a new frame, so the caller's dataframe is left untouched.
    features = dataframe.drop(columns=y_col)
    target = dataframe[y_col]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [50]:
def save_train_test_data(save_loc, prefix, dataframe, y_col, test_size=.33):
    """Split ``dataframe`` into train/test parts and pickle each part.

    Four files are written into ``save_loc``:
    ``{prefix}_X_train.pkl``, ``{prefix}_X_test.pkl``,
    ``{prefix}_y_train.pkl`` and ``{prefix}_y_test.pkl``.

    Parameters
    ----------
    save_loc : str
        Output folder. It is created, together with any missing parent
        folders, when it does not exist yet.
    prefix : str
        Text placed at the start of every output file name.
    dataframe : pandas.DataFrame
        Table holding both the feature columns and the target column(s).
    y_col : str or list of str
        Target column label(s), passed on to ``get_train_test_split``.
    test_size : float or int, default .33
        Passed on to ``get_train_test_split``.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_test = get_train_test_split(dataframe, y_col, test_size)

    # exist_ok=True makes repeated runs safe, and makedirs (unlike mkdir)
    # also creates missing intermediate folders.
    os.makedirs(save_loc, exist_ok=True)

    parts = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    for part_name, part in parts.items():
        pd.to_pickle(part, os.path.join(save_loc, "{}_{}.pkl".format(prefix, part_name)))
    

In [51]:
# Persist the train/test split for the bag-of-words feature set.
save_train_test_data(
    save_loc=os.path.join(data_fol, "bow_model"),
    prefix="bow",
    dataframe=engagement_bow,
    y_col="all_engagement",
)

In [53]:
# Persist the train/test split for the bigram feature set.
save_train_test_data(
    save_loc=os.path.join(data_fol, "bigrams_model"),
    prefix="gram2",
    dataframe=engagement_2g,
    y_col="all_engagement",
)

In [54]:
# Persist the train/test split for the bigram+trigram feature set.
save_train_test_data(
    save_loc=os.path.join(data_fol, "bitrigrams_model"),
    prefix="gram23",
    dataframe=engagement_23g,
    y_col="all_engagement",
)