### Module 2 - Twitter data

This is M2 of 3 modules for the twitter dataset. In this module, we cover the training parts to be used in M3 for prediction and inference for Q29. We extract the phrases, generate entities and finally create the dataset used for prediction.

In [1]:
import numpy as np
import random
import pandas as pd
import orjson as json
import time
from datetime import datetime

import regex as re
import spacy
import pytextrank
import multiprocessing as mp
from multiprocessing import Pool
import pickle
from fuzzywuzzy import fuzz

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

num_cores = 4 #number of cores on your machine
num_partitions = 16 #number of partitions to split dataframe

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x7f9016fd1040>

In [2]:
## Load data
start_time = time.time()
files_tag = ['gohawks', 'gopatriots', 'nfl', 'patriots', 'sb49', 'superbowl']

df = pd.DataFrame()
tweet_id_ls = []
file_tag_ls = []
tweet_text_ls = []
tweet_time_ls = []

for file in files_tag:
    myJSON = []
    print("Reading file : "+file)
    for line in open('./data/ECE219_tweet_data/tweets_#'+file+'.txt','r'):
        data = json.loads(line)
        tweet_id_ls.append(data['tweet']['id_str'])
        file_tag_ls.append(file)
        tweet_text_ls.append(data['tweet']['text'])
        tweet_time_ls.append(data['citation_date'])

tweet_txt = pd.DataFrame({'tweet_id': tweet_id_ls, 'file': file_tag_ls,\
                          'text': tweet_text_ls, 'citation_datetime': tweet_time_ls})

print("done in %0.3fs." % (time.time() - start_time)) 

Reading file : gohawks
Reading file : gopatriots
Reading file : nfl
Reading file : patriots
Reading file : sb49
Reading file : superbowl
done in 90.407s.


In [3]:
def clean(text):
    '''
    Helps remove many HTML artefacts from the crawler's output.
    '''
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"",texter)
    texter = re.sub('&#39;', "\"", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u '," you ", texter)
    texter = re.sub('`',"", texter)
    texter = re.sub(' +', ' ', texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ',texter)
    clean = re.compile('<.*?>')
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(clean, '', texter)
    if texter == "":
        texter = ""
    return texter

def text_preprocessing (text: str):
    text = text.lower()
    text = clean(text)
    return text

### Phrase extraction

In this part, we extract top 3 phrases from each tweet text using text rank after applying basic text cleaning

In [4]:
start_time = time.time()
print("Pre-processing tweet text for phrase extraction..")
tweet_txt['clean_text'] = tweet_txt['text'].apply(lambda x: text_preprocessing(x))
print("done in %0.3fs." % (time.time() - start_time)) 

Pre-processing tweet text for phrase extraction..
done in 108.820s.


In [5]:
def get_phrases (text: str, tweet_id : str, top_n = 3):
    '''
    Given a tweet id and text, returns top 3 phrases
    '''
    try:
        doc = nlp(text)
        phrases_dict = {}
        for phrase in doc._.phrases:
            phrases_dict[(tweet_id, phrase.text)] = phrase.rank

        a = dict(sorted(phrases_dict.items(), key=lambda x: x[1], reverse=True)[:top_n])
        return a

           
    except:
        print('Error for Tweet ID %s' % tweet_id)

In [6]:
phrase_extraction_df = tweet_txt[['tweet_id', 'clean_text']].drop_duplicates()

In [7]:
start_time = time.time()
final_dict = {}
for index, row in phrase_extraction_df.iterrows():
    tmp_dict = get_phrases(row['clean_text'], row['tweet_id'])
    final_dict.update(tmp_dict)
    if(index % 10000 == 0):
        print(index)
        print("done in %0.3fs." % (time.time() - start_time)) 
        output = open('./twitter_files_v3/textrank_phrases_v2.pkl', 'wb')
        pickle.dump(final_dict, output)

0
done in 0.017s.
10000
done in 64.451s.
20000
done in 126.352s.
30000
done in 185.181s.
40000
done in 242.352s.
50000
done in 297.662s.
60000
done in 355.376s.
70000
done in 417.387s.
80000
done in 480.085s.
90000
done in 545.729s.
100000
done in 612.380s.
110000
done in 673.488s.
120000
done in 738.426s.
130000
done in 797.466s.
140000
done in 853.193s.
150000
done in 910.946s.
160000
done in 967.202s.
170000
done in 1030.363s.
180000
done in 1086.957s.
190000
done in 1143.337s.
200000
done in 1209.398s.
210000
done in 1270.400s.
220000
done in 1330.391s.
230000
done in 1397.330s.
240000
done in 1462.670s.
260000
done in 1593.722s.
270000
done in 1663.351s.
280000
done in 1732.864s.
290000
done in 1800.887s.
300000
done in 1867.445s.
310000
done in 1933.838s.
320000
done in 1995.052s.
330000
done in 2056.889s.
340000
done in 2120.953s.
350000
done in 2177.911s.
360000
done in 2234.914s.
370000
done in 2303.198s.
380000
done in 2371.161s.
390000
done in 2436.136s.
400000
done in 2504.

In [12]:
output = open('./twitter_files_v3/textrank_phrases_v2.pkl', 'wb')
pickle.dump(final_dict, output)

### Phrase post-processing

In this part, we process the extracted phrases and do some cleaning on extracted phrases for subsequent tasks

In [13]:
txtrank_phrase_file = open('./twitter_files_v3/textrank_phrases_v2.pkl', "rb")
txtrank_phrases = pickle.load(txtrank_phrase_file)

In [14]:
#### Drop if all numeric
def all_numeric(number_sequence: str):
     return all(var.isdigit() for var in number_sequence.split())

def drop_allNumeric (data):
    data['allNumeric'] = data['clean_phrase'].apply(lambda x: all_numeric(str(x)))
    data = data[data['allNumeric'] == False]
    data = data.drop(['allNumeric'], axis=1)
    return data

In [15]:
def clean(text: str):
    
    # remove punctuation
    text = re.sub('[!"#$%&()*+-/:;<=>?@[\\]^_`{|}~]', '', text)
    
    # remove the from beginning
    if (text.startswith("the")):
        text = text.replace("the", '', 1)
        
    # remove http tokens
    tokens = text.split(' ')
    tokens_filt = [i for i in tokens if not i.startswith('http')]
    text = ' '.join(tokens_filt)
    
    # remove trailing spaces
    text = text.rstrip()
    text = text.lstrip()
    
    return text

def remove_stopwords(data):
    phrase_counts = data.groupby(['clean_phrase']).size().reset_index(name = 'count')
    phrase_counts = phrase_counts.sort_values('count', ascending = False)
    phrase_counts['len'] = phrase_counts['clean_phrase'].apply(lambda x: len(x.split(' ')))
    phrase_counts['single_stopword_tag'] = phrase_counts.apply(lambda row: int(row['clean_phrase'] in stopwords) \
                                                         if (row['len'] == 1) else 0, axis = 1)
    data = pd.merge(data, phrase_counts[['clean_phrase', 'single_stopword_tag']], how = 'left', on = 'clean_phrase')
    data = data[data['single_stopword_tag'] == 0]
    data = data.drop(['single_stopword_tag'], axis = 1)
    return data
    

def clean_phrases (txtrank_phrases):
    '''
    Given the dictionary of textrank, tweet phrases; clean the phrases to get better entities.
    Returns a dataframe `txtrank_dt` with columns - tweet_id, ranking score, phrase and other tweet info
    '''
    
    ## Read phrases
    txtrank_df = pd.DataFrame.from_dict(txtrank_phrases, orient = 'index').reset_index()
    txtrank_df[['tweet_id', 'phrase']] = pd.DataFrame(txtrank_df['index'].tolist(),\
                                                  index=txtrank_df.index)
    txtrank_df = txtrank_df.drop_duplicates()
    
    ## clean phrase
    txtrank_df['clean_phrase'] = txtrank_df['phrase'].apply(lambda x: clean(x))
    
    ##drop single word stopword phrases
    txtrank_df = remove_stopwords(txtrank_df)
    
    ## remove if length of phrase < 2
    txtrank_df['len'] = txtrank_df['clean_phrase'].apply(lambda x: len(x))
    txtrank_df = txtrank_df[txtrank_df['len'] > 2]
    txtrank_df = txtrank_df.drop(['len'], axis=1)
    
    ## get count of phrases and drop if count == 1
    phrase_counts = txtrank_df.groupby(['clean_phrase']).size().reset_index(name = 'count')
    phrase_counts = phrase_counts.sort_values('count', ascending = False)
    txtrank_df = pd.merge(txtrank_df, phrase_counts, how = 'left', on = 'clean_phrase')
    txtrank_df = txtrank_df[txtrank_df['count'] > 1]
    txtrank_df = txtrank_df.drop(['count'], axis=1)
    
    return txtrank_df

In [16]:
start_time = time.time()
print("Post-processing extracted phrases...")
txtrank_df = clean_phrases(txtrank_phrases)
txtrank_df = drop_allNumeric(txtrank_df)
print("done in %0.3fs." % (time.time() - start_time)) 

Post-processing extracted phrases...
done in 76.439s.


### Entity extraction

In this part, we process the phrases to determine whether they are a valid entity

In [17]:
def get_phrase_counts_overall (txtrank_df):
    '''
    Given the text rank dataframe with clean tweet phrases; get the frequency of how many times a phrase was 
    used in the entire tweet dataset to assess popular phrases/entities.
    Returns a dataframe `phrase_counts` with columns - `clean_phrase` (the phrase) and `count` (num of occurences)
    '''
    
    phrase_counts = txtrank_df.groupby(['clean_phrase']).size().reset_index(name = 'count')
    phrase_counts = phrase_counts.sort_values('count', ascending = False)
    
    return phrase_counts

def get_close_entities(allphrases : list, entity : str, threshold = 10):
    '''
    Given a phrase, map other phrases to this phrase using fuzzy text matching
    '''
    close_entities = []
    for phrase in allphrases:
        val = fuzz.ratio(phrase, entity)
        if(val > 85):
            close_entities.append(phrase)
    
    if(len(close_entities) > threshold):
        return close_entities
    else:
        return "Not an entity"

In [18]:
phrase_counts = get_phrase_counts_overall(txtrank_df)
phrase_counts = phrase_counts.sort_values('count', ascending = False)
phrase_counts['entity'] = 'NA'

In [19]:
phrase_counts.head()

Unnamed: 0,clean_phrase,count,entity
154665,sb49,528347,
173591,superbowl,318933,
123949,nfl,235215,
176634,superbowlxlix,185221,
136621,patriots,176941,


In [20]:
start_time = time.time()

tmp = phrase_counts[phrase_counts['count'] > 20]
tmp = tmp.sort_values('count', ascending = False)
prospect_entities = list(tmp['clean_phrase'])

entity_dict = {}
entity_key = 1
counter = 0
ind = 0
for phrase in prospect_entities:
    counter +=1
    if(counter % 1000 == 0):
        counter = 0
        ind += 1
        print("Completed for: ", ind)
        print("done in %0.3fs." % (time.time() - start_time)) 
        
    entity_val = list(phrase_counts[phrase_counts['clean_phrase'] == phrase]['entity'])[0]
    
    if(entity_val == 'NA'):
        allphrases = list(phrase_counts[phrase_counts['entity'] == 'NA']['clean_phrase'])
        close_entities = get_close_entities(allphrases, phrase)
        if(close_entities == 'Not an entity'):
            phrase_counts.loc[ (phrase_counts['clean_phrase'] == phrase), 'entity'] = 'Not an entity'
        else:
            phrase_counts.loc[ (phrase_counts['clean_phrase'].isin(close_entities)), 'entity'] = phrase
            entity_dict[entity_key] = phrase
            entity_key = entity_key+1

Completed for:  1
done in 518.509s.
Completed for:  2
done in 1021.617s.
Completed for:  3
done in 1576.296s.
Completed for:  4
done in 2119.950s.
Completed for:  5
done in 2653.637s.
Completed for:  6
done in 3188.805s.
Completed for:  7
done in 3717.095s.
Completed for:  8
done in 4245.117s.
Completed for:  9
done in 4768.835s.
Completed for:  10
done in 5289.821s.
Completed for:  11
done in 5802.139s.
Completed for:  12
done in 6289.340s.
Completed for:  13
done in 6772.206s.
Completed for:  14
done in 7280.914s.


In [21]:
output = open('./twitter_files_v3/entities_v2.pkl', 'wb')
pickle.dump(entity_dict, output)
output.close()

In [22]:
output = open('./twitter_files_v3/clean_phrase_to_entity_v2.pkl', 'wb')
pickle.dump(phrase_counts, output)
output.close()

### Get data for prediction tasks

In [23]:
data = pd.merge(txtrank_df, phrase_counts, how = 'left', on = 'clean_phrase')
## merge tweet text and time
data = pd.merge(data, tweet_txt, how = 'left', on = 'tweet_id')
output = open('./twitter_files_v3/prediction_data_v2.pkl', 'wb')
pickle.dump(data, output)
output.close()

In [24]:
entity_dict

{1: 'superbowl',
 2: 'superbowlxlix',
 3: 'patriots',
 4: 'seahawks',
 5: 'gohawks',
 6: 'patriotswin nfl',
 7: 'katyperry',
 8: 'tom brady',
 9: 'seattle',
 10: 'halftime',
 11: 'football',
 12: 'pats',
 13: 'superbowlcommercials',
 14: 'gopats',
 15: 'superbowlsunday',
 16: 'seattleseahawks',
 17: 'touchdown',
 18: 'commercials',
 19: 'new england',
 20: 'superbowl2015',
 21: 'marshawn lynch',
 22: 'katy',
 23: 'patsnation',
 24: 'new england patriots',
 25: 'missyelliott',
 26: 'budweiser',
 27: 'this game',
 28: 'sb49 superbowl',
 29: 'patriotsnation',
 30: 'russell wilson',
 31: 'katyperry superbowl',
 32: 'packers',
 33: 'wilson',
 34: 'people',
 35: 'halftime show',
 36: 'pete carroll',
 37: 'america',
 38: 'chris matthews',
 39: 'beastmode',
 40: 'dangerusswilson',
 41: 'allyouneedisecuador',
 42: 'nflplayoffs',
 43: 'lenny kravitz',
 44: 'bill belichick',
 45: 'last year',
 46: 'next year',
 47: 'los',
 48: 'tom',
 49: 'national anthem',
 50: 'belichick',
 51: 'interception',
