## Show the Events with the Highest Vocabulary Size

In [14]:
import numpy  as np
import pandas as pd
import pickle



In [15]:
# Locations of the pickled train / test splits.
TRAIN_PKL_PATH = 'train_w_time_10232023.pkl'
TEST_PKL_PATH = 'test_w_time_10232023.pkl'


def load_pickled_data(pkl_file_path):
    """Read one pickled object from disk with ``pd.read_pickle``.

    NOTE(security): unpickling can execute arbitrary code, so only point this
    at files that come from a trusted source.

    Parameters
    ----------
    pkl_file_path : str
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(pkl_file_path, 'rb') as file:
        return pd.read_pickle(file)


# Load each split once through the shared helper, then wrap it in a DataFrame.
train_data = load_pickled_data(TRAIN_PKL_PATH)
train_df = pd.DataFrame(train_data)

test_data = load_pickled_data(TEST_PKL_PATH)
test_df = pd.DataFrame(test_data)

In [16]:
# Stack the train and test frames (train_df and test_df, built in the loading cell)
# into a single frame; ignore_index=True renumbers the rows from 0.
concatenated_df = pd.concat([train_df, test_df], ignore_index=True)
# Display the column labels of the combined frame.
concatenated_df.columns

Index(['tweet_id', 'lang', 'res_image_vectors', 'vgg_image_vectors',
       'tweet_text', 'label', 'event', 'new_img_id', 'clean_tweet', 'cl_len',
       'text_feats_1', 'text_feats_2', 'pooler', 'sm_last_four_concat',
       'mn_last_four_concat', 'mn_2ndtolast_lyr', 'sm_2ndtolast_lyr',
       'num_friends', 'num_followers', 'folfriend_ratio', 'times_listed',
       'has_url', 'is_verified', 'num_posts', 'num_words', 'text_length',
       'contains_questmark', 'num_questmark', 'contains_exclammark',
       'num_exclammark', 'contains_happyemo', 'contains_sademo',
       'contains_firstorderpron', 'contains_secondorderpron',
       'contains_thirdorderpron', 'num_uppercasechars', 'num_possentiwords',
       'num_negsentiwords', 'num_mentions', 'num_hashtags', 'num_URLs',
       'num_retweets', 'semi_clean', 'reading_ease', 'reading_grade',
       'compound_score', 'neg_score', 'pos_score', 'neutral_score',
       'vgg19_img_vectors', 'Topic20', 'Topic10', 'Topic5', 'caption',
       'c

In [17]:
# Print the raw and cleaned text of the first row with the column-width
# limit lifted for the duration of the print.
selected_columns = ['tweet_text', 'clean_tweet']
first_row = concatenated_df.loc[:, selected_columns].iloc[0]
with pd.option_context('display.max_colwidth', None):
    print(first_row)

tweet_text     Don't need feds to solve the #bostonbombing when we have #4chan!! http://t.co/eXQTPZqqbG
clean_tweet                                              [don, need, feds, solve, bostonbombing, 4chan]
Name: 0, dtype: object


In [18]:
# Number of distinct 'mdy' values; dropna=False keeps a missing value counted
# as its own entry, exactly as len(.unique()) would.
concatenated_df['mdy'].nunique(dropna=False)

386

In [19]:
# (rows, columns) of the combined train + test frame.
concatenated_df.shape

(9938, 67)

In [20]:
# Distinct values of the 'event' column, in order of first appearance.
concatenated_df['event'].unique()

array(['boston', 'malaysia', 'passport', 'sandy', 'sochi', 'bringback',
       'columbianChemicals', 'elephant', 'livr', 'pigFish', 'underwater',
       'eclipse', 'samurai', 'nepal', 'garissa', 'fuji_lenticular',
       'bush_book', 'gandhi_dancing', 'attacks_paris', 'bowie_david',
       'pakistan_explosion', 'refugees', 'protest', 'not_afraid',
       'rio_moon', 'immigrants', 'john_guevara', 'syrian_children',
       'mc_donalds', 'brussels_explosions', 'half_everything',
       'hubble_telescope', 'woman_14', 'north_korea', 'five_headed',
       'burst_kfc', 'black_lion', 'nazi_submarine'], dtype=object)

In [21]:
# Group the rows by their 'mdy' value and count the non-null tweets in each group.
time_text = concatenated_df.groupby('mdy')
time_counts = time_text['tweet_text'].count()

# Largest and smallest per-group counts, one per line.
print(time_counts.max(), time_counts.min(), sep='\n')

# Total across all groups, shown as the cell result.
time_counts.sum()

2324
1


9938

In [22]:
# Group the combined frame by event; this GroupBy object is iterated again
# further down when the per-event vocabularies are built.
event_text = concatenated_df.groupby('event')
# Number of rows per event (non-null 'event' entries within each group).
event_counts = event_text['event'].count()
event_counts

event
attacks_paris           181
black_lion                7
boston                  474
bowie_david              64
bringback               109
brussels_explosions       9
burst_kfc                15
bush_book                25
columbianChemicals      182
eclipse                 200
elephant                 11
five_headed               5
fuji_lenticular         147
gandhi_dancing           13
garissa                  44
half_everything          34
hubble_telescope         17
immigrants               30
john_guevara             10
livr                      2
malaysia                165
mc_donalds                5
nazi_submarine           10
nepal                  1250
north_korea               7
not_afraid               48
pakistan_explosion       34
passport                 41
pigFish                  12
protest                  34
refugees                 53
rio_moon                 26
samurai                 213
sandy                  6073
sochi                   259
syrian_childre

In [23]:
from datetime import datetime
def convert_datetime(date_time):
    """Format a date-like value as a compact ``YYMMDD`` string.

    Parameters
    ----------
    date_time : object with a ``strftime`` method (e.g. ``datetime``)
        The value to format.

    Returns
    -------
    str
        Two-digit year, month and day concatenated, e.g. ``'231023'``.
    """
    return date_time.strftime('%y%m%d')

In [24]:
# List to hold integrated data
events_data = []
all_unique_elements = set()

# Iterate through the grouped data
time_t = 0
for event_n, group in event_text:
    all_elements = []
    unique_elements = set()
    for elements in group['clean_tweet']:
        all_elements.extend(elements)
        unique_elements.update(elements)

    # Create a dictionary for the current time point and add it to the list
    events_data.append({
        'time_t': time_t,
        'text': all_elements,
        'vocab': list(unique_elements),
        'event': event_n,
        'tweet_count': len(group)
    })

    # Update all unique elements
    all_unique_elements.update(unique_elements)

    # Increment time index
    time_t += 1

# Update total vocabulary
total_vocab = list(all_unique_elements)

# Sort integrated_data based on the length of 'vocab' in descending order
events_data.sort(key=lambda x: len(x['vocab']), reverse=True)

# Now integrated_data is sorted based on vocab size descending
for data in events_data[:19]:
    print(f"Time: {data['time_t']}, Event: {data['event']}, Tweet Count: {data['tweet_count']}, Vocab Size: {len(data['vocab'])}")


Time: 33, Event: sandy, Tweet Count: 6073, Vocab Size: 4963
Time: 23, Event: nepal, Tweet Count: 1250, Vocab Size: 1060
Time: 2, Event: boston, Tweet Count: 474, Vocab Size: 962
Time: 34, Event: sochi, Tweet Count: 259, Vocab Size: 800
Time: 20, Event: malaysia, Tweet Count: 165, Vocab Size: 426
Time: 8, Event: columbianChemicals, Tweet Count: 182, Vocab Size: 407
Time: 9, Event: eclipse, Tweet Count: 200, Vocab Size: 325
Time: 12, Event: fuji_lenticular, Tweet Count: 147, Vocab Size: 270
Time: 0, Event: attacks_paris, Tweet Count: 181, Vocab Size: 254
Time: 4, Event: bringback, Tweet Count: 109, Vocab Size: 233
Time: 30, Event: refugees, Tweet Count: 53, Vocab Size: 217
Time: 3, Event: bowie_david, Tweet Count: 64, Vocab Size: 184
Time: 14, Event: garissa, Tweet Count: 44, Vocab Size: 177
Time: 17, Event: immigrants, Tweet Count: 30, Vocab Size: 167
Time: 25, Event: not_afraid, Tweet Count: 48, Vocab Size: 147
Time: 29, Event: protest, Tweet Count: 34, Vocab Size: 145
Time: 7, Event: 

In [25]:
# Number of distinct tokens pooled across all events.
len(total_vocab)

7791

In [26]:
# Assuming integrated_data is already filled as per the previous discussion
df = pd.DataFrame(events_data)

# Adding a new column to count vocab size
df['vocab_size'] = df['vocab'].apply(len)

# Display the top 19 entries
top_19 = df[[ 'event', 'tweet_count', 'vocab_size']].head(19)

# Resetting the index to start from 1
top_19.index = range(1, len(top_19) + 1)

top_19

Unnamed: 0,event,tweet_count,vocab_size
1,sandy,6073,4963
2,nepal,1250,1060
3,boston,474,962
4,sochi,259,800
5,malaysia,165,426
6,columbianChemicals,182,407
7,eclipse,200,325
8,fuji_lenticular,147,270
9,attacks_paris,181,254
10,bringback,109,233
