## Store Event Vocabularies for Use in the A Matrices

In [1]:
import numpy  as np
import pandas as pd
import pickle


In [2]:
# Load the pickled training data from the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
pkl_file_path = 'train_w_time_10232023.pkl'

with open(pkl_file_path, 'rb') as file:
    train_data = pd.read_pickle(file)

# NOTE(review): presumably redundant if the pickle already holds a DataFrame — confirm.
train_df = pd.DataFrame(train_data)

# Load the pickled test data the same way; pkl_file_path is reused and ends up
# pointing at the test file.
pkl_file_path = 'test_w_time_10232023.pkl'

with open(pkl_file_path, 'rb') as file:
    test_data = pd.read_pickle(file)

test_df = pd.DataFrame(test_data)

In [3]:
# Stack the train and test frames into a single DataFrame with a fresh
# 0..n-1 index, then list the combined column names.
concatenated_df = pd.concat([train_df, test_df], ignore_index=True)
concatenated_df.columns

Index(['tweet_id', 'lang', 'res_image_vectors', 'vgg_image_vectors',
       'tweet_text', 'label', 'event', 'new_img_id', 'clean_tweet', 'cl_len',
       'text_feats_1', 'text_feats_2', 'pooler', 'sm_last_four_concat',
       'mn_last_four_concat', 'mn_2ndtolast_lyr', 'sm_2ndtolast_lyr',
       'num_friends', 'num_followers', 'folfriend_ratio', 'times_listed',
       'has_url', 'is_verified', 'num_posts', 'num_words', 'text_length',
       'contains_questmark', 'num_questmark', 'contains_exclammark',
       'num_exclammark', 'contains_happyemo', 'contains_sademo',
       'contains_firstorderpron', 'contains_secondorderpron',
       'contains_thirdorderpron', 'num_uppercasechars', 'num_possentiwords',
       'num_negsentiwords', 'num_mentions', 'num_hashtags', 'num_URLs',
       'num_retweets', 'semi_clean', 'reading_ease', 'reading_grade',
       'compound_score', 'neg_score', 'pos_score', 'neutral_score',
       'vgg19_img_vectors', 'Topic20', 'Topic10', 'Topic5', 'caption',
       'c

In [4]:
# Print the raw text and the cleaned token list of the first row, with column
# width truncation disabled so long strings are shown in full.
selected_columns = ['tweet_text', 'clean_tweet']
first_row = concatenated_df[selected_columns].iloc[0]
with pd.option_context('display.max_colwidth', None):
    print(first_row)

tweet_text     Don't need feds to solve the #bostonbombing when we have #4chan!! http://t.co/eXQTPZqqbG
clean_tweet                                              [don, need, feds, solve, bostonbombing, 4chan]
Name: 0, dtype: object


In [5]:
# Number of distinct 'mdy' values (a missing value, if any, counts as one).
concatenated_df['mdy'].nunique(dropna=False)

386

In [6]:
# (rows, columns) of the combined train + test frame.
concatenated_df.shape

(9938, 67)

In [7]:
# Distinct values of the 'event' column.
concatenated_df['event'].unique()

array(['boston', 'malaysia', 'passport', 'sandy', 'sochi', 'bringback',
       'columbianChemicals', 'elephant', 'livr', 'pigFish', 'underwater',
       'eclipse', 'samurai', 'nepal', 'garissa', 'fuji_lenticular',
       'bush_book', 'gandhi_dancing', 'attacks_paris', 'bowie_david',
       'pakistan_explosion', 'refugees', 'protest', 'not_afraid',
       'rio_moon', 'immigrants', 'john_guevara', 'syrian_children',
       'mc_donalds', 'brussels_explosions', 'half_everything',
       'hubble_telescope', 'woman_14', 'north_korea', 'five_headed',
       'burst_kfc', 'black_lion', 'nazi_submarine'], dtype=object)

In [8]:
# Count non-null 'tweet_text' entries for every distinct 'mdy' value.
time_text = concatenated_df.groupby('mdy')
time_counts = time_text['tweet_text'].count()

# Largest and smallest group size, one per line.
print(time_counts.max(), time_counts.min(), sep='\n')

# Total over all groups, displayed as the cell's result.
time_counts.sum()

2324
1


9938

In [9]:
# Group rows by event; event_text is iterated again further down to build the
# per-event token lists and vocabularies.
event_text = concatenated_df.groupby('event')
# Number of rows in each event group.
event_counts = event_text['event'].count()
event_counts

event
attacks_paris           181
black_lion                7
boston                  474
bowie_david              64
bringback               109
brussels_explosions       9
burst_kfc                15
bush_book                25
columbianChemicals      182
eclipse                 200
elephant                 11
five_headed               5
fuji_lenticular         147
gandhi_dancing           13
garissa                  44
half_everything          34
hubble_telescope         17
immigrants               30
john_guevara             10
livr                      2
malaysia                165
mc_donalds                5
nazi_submarine           10
nepal                  1250
north_korea               7
not_afraid               48
pakistan_explosion       34
passport                 41
pigFish                  12
protest                  34
refugees                 53
rio_moon                 26
samurai                 213
sandy                  6073
sochi                   259
syrian_childre

In [10]:
from datetime import datetime
def convert_datetime(date_time):
    """Return ``date_time`` rendered as a compact ``yymmdd`` string.

    The value is produced by ``date_time.strftime('%y%m%d')``, so any object
    exposing a compatible ``strftime`` method is accepted.
    """
    return date_time.strftime('%y%m%d')

In [11]:
# Build per-event token lists and vocabularies.
#
# NOTE: despite the ``*_time_t`` names (kept because later cells and the saved
# pickle use them), the grouping here is by ``event`` -- ``event_text`` is
# ``concatenated_df.groupby('event')`` -- not by timestamp. Keys are the integer
# position of each group in the groupby iteration order.
text_time_t = {}             # group index -> all tokens of that event (duplicates kept)
vocab_time_t = {}            # group index -> sorted unique tokens of that event
all_unique_elements = set()  # union of every event's vocabulary

for time_t, (event_name, group) in enumerate(event_text):
    # Flatten the per-tweet token lists of this event into one list.
    all_elements = [token for elements in group['clean_tweet'] for token in elements]
    unique_elements = set(all_elements)

    text_time_t[time_t] = all_elements
    # Sort so the vocabulary order is reproducible: plain set iteration order
    # for strings changes between kernel sessions (hash randomisation), which
    # would silently reshuffle any index built on top of this list.
    vocab_time_t[time_t] = sorted(unique_elements)
    all_unique_elements.update(unique_elements)

# Leave ``time_t`` equal to the number of groups processed, as the original
# manual counter did.
time_t = len(text_time_t)

# Global vocabulary across all events, in the same deterministic order.
total_vocab = sorted(all_unique_elements)

In [12]:
# For each group: vocabulary size followed by total token count.
for t, vocab in vocab_time_t.items():
    print(len(vocab), len(text_time_t[t]))

254 1585
17 51
962 4039
184 494
233 533
32 89
27 85
130 290
407 1441
325 1579
6 28
18 32
270 1177
64 113
177 459
37 301
45 149
167 270
18 67
16 20
426 1365
27 49
15 81
1060 11120
18 61
147 268
71 289
93 338
62 80
145 270
217 525
57 237
108 2060
4963 42637
800 2043
34 120
106 856
29 101


In [21]:
# Persist the per-group vocabularies (integer group index -> list of unique
# tokens) to a pickle file in the working directory.
results_file = 'event_vocabs.pkl'
with open(results_file, 'wb') as f:
    pickle.dump(vocab_time_t, f)

In [None]:
# Size of the vocabulary pooled across all groups.
len(total_vocab)

7791