In [1]:
import boto3
import zipfile
import json
import io
import spacy
import pandas as pd
import spacy_fastlang
import matplotlib.pyplot as plt
from spacy.language import Language

In [2]:
s3_host = 'https://obj.umiacs.umd.edu'
access_key_id = "xxxxx"
secret_access_key = "xxxxx"

s3 = boto3.client('s3', 
                  endpoint_url=s3_host, 
                  aws_access_key_id=access_key_id, 
                  aws_secret_access_key=secret_access_key)

buntain = 'buntain'

In [3]:
model_dict = {'en': spacy.load('en_core_web_sm'), 'weaker': spacy.load('xx_ent_wiki_sm'), 'better':spacy.load('xx_sent_ud_sm'), 
              'zh': spacy.load('zh_core_web_sm'), 'ja': spacy.load('ja_core_news_sm'), 'es': spacy.load('es_core_news_sm'), 
              'ru': spacy.load('ru_core_news_sm'), 'fr': spacy.load('fr_core_news_sm'), 'ro': spacy.load('ro_core_news_sm'),
              'nl': spacy.load('nl_core_news_sm'), 'pl': spacy.load('pl_core_news_sm'), 'pt': spacy.load('pt_core_news_sm'),
              'fi': spacy.load('fi_core_news_sm'), 'de': spacy.load('de_core_news_sm'), 'ca': spacy.load('ca_core_news_sm')}
model_dict['better'].add_pipe("language_detector")

<spacy_fastlang.LanguageDetector at 0x2d39e6990>

In [4]:
def process(folder_prefix):
# def process():
    entity_dict = dict()
    response = s3.list_objects_v2(Bucket=buntain, Prefix=folder_prefix)
    i = 0
    count = 0
    files = []
    skipped = 0
    for obj in response.get('Contents', []):
        i += 1
        object_key = obj['Key']
        if object_key.endswith('.zip'):
            files.append(object_key)
    print(f"number of files: {i}")
    for file in files:
        zip_object = s3.get_object(Bucket=buntain, Key=file)
        zip_contents = zip_object['Body'].read()
        zip_file = zipfile.ZipFile(io.BytesIO(zip_contents), 'r')
        for file_info in zip_file.infolist():
            with zip_file.open(file_info) as json_file:
                file_name = file_info.filename                    
                if not file_name.endswith("-tweet.json"):
                    continue
                try:
                    json_data = json_file.read().decode('utf-8')
                except: 
                    print("this is a text file")
                parsed_data = json.loads(json_data)
                for ind_data in parsed_data:
                    count += 1    
                    raw_text = ind_data['tweet']['tweet_text']
                    if 'tweet_language' not in ind_data['tweet']:
                        NER = model_dict['better']
                        ner_text = NER(raw_text)
                        if ner_text._.language not in entity_dict:
                            for word in ner_text.ents:
                                tuple = (word.text, word.label_, lang)
                                if tuple in entity_dict:
                                    entity_dict[tuple]+=1
                                else:
                                    entity_dict[tuple]=1
                            continue
                        else:
                            lang = NER._.language
                    lang = ind_data['tweet']['tweet_language'] 
                    if lang in model_dict:
                        NER = model_dict[lang]
                    else:
                        NER = model_dict['better']
                    ner_text = NER(raw_text)
                    if (len(ner_text.ents) == 0):
                        skipped+=1
                    for word in ner_text.ents:
                        tuple = (word.text, word.label_, lang)
                        if tuple in entity_dict:
                            entity_dict[tuple]+=1
                        else:
                            entity_dict[tuple]=1
                        # print(word.text, word.label_)
    print(f"how many lines: {count}")
    print(f"how many skipped: {skipped}")

    return entity_dict
    # return pd.to_datetime(dates)

In [5]:
folder_prefix_lst = ['twitter.tmrc/August_2022/TMRC14_APAC_1/', 'twitter.tmrc/August_2022/TMRC14_APAC_2/', 'twitter.tmrc/October_2022/TMRC15_APAC_3/']
dict_lst = []
for f in folder_prefix_lst:
    dict_lst.append(process(f))


number of files: 1000
how many lines: 66288
how many skipped: 65261
number of files: 1000
how many lines: 274408
how many skipped: 105225
number of files: 568
how many lines: 4418374
how many skipped: 2828901


In [7]:
# dict to df
# print(len(dict_lst))
df_lst = []
for d in dict_lst:
    # print(keys)
    if len(d.keys()) == 0:
        entity_df = pd.DataFrame()
    else:
        keys = list(d.keys())
        key_tuples = list(zip(*keys))  # Unpack tuples into separate lists

        # Create a DataFrame
        entity_df = pd.DataFrame({
            'word': key_tuples[0],
            'type': key_tuples[1],
            'lang': key_tuples[2],
            'count': list(d.values())
        })
    df_lst.append(entity_df)

In [8]:
if  len(df_lst[0]) > 0:
    print(df_lst[0].sort_values(by='count', ascending=False))

                                 word      type lang  count
85                                  #  CARDINAL   en    114
30                         RT @ntsana       ORG   en     50
104                              tysm    PERSON   en     27
67                                hai       GPE   en     23
6                                Army       ORG   en     14
...                               ...       ...  ...    ...
626   RT @misterj0423: Dive Into You✨       ORG   en      1
623                          aman sii      MISC   es      1
622    pesa ya shopping imeishia hapo    PERSON   en      1
621                            Mutura       GPE   en      1
1483                         the days      DATE   en      1

[1484 rows x 4 columns]


In [9]:
print(df_lst[1].sort_values(by='count', ascending=False))

                       word      type lang  count
165                       #  CARDINAL   en  37930
6                  Pakistan       GPE   en  27865
30                  Kashmir       LOC   en  25989
43                    China       GPE   en  11739
77                    India       GPE   en  11479
...                     ...       ...  ...    ...
56511               Laigroo       ORG   en      1
56510            Rakhchikri       ORG   en      1
56509   Khalistan Terrorist       ORG   en      1
56508           22-Year-Old      DATE   en      1
128324       Rajasthan BSTC       GPE   en      1

[128325 rows x 4 columns]


In [10]:
print(df_lst[2].sort_values(by='count', ascending=False))
# print(df_lst[2]["count"].sum())

                         word      type lang   count
58                   Pakistan       GPE   en  191550
75                          #  CARDINAL   en  167046
73                      India       GPE   en   70518
596                   Kashmir       LOC   en   63426
244                    Indian      NORP   en   52916
...                       ...       ...  ...     ...
324401   @ApKaBhai420 @_Zehar       PER   es       1
324400      @ApKaBhai420 Acha       ORG   en       1
324399  @loverofrapunzel Week       ORG   en       1
324397  @ApKaBhai420 Icecream       ORG   en       1
710778           Nadia Sister       ORG   en       1

[710779 rows x 4 columns]


In [11]:
# number of files: 1000
# how many lines: 66288
# how many skipped: 31320
# number of files: 1000
# how many lines: 274408
# how many skipped: 86602
# number of files: 568
# how many lines: 4418374
# how many skipped: 361433
print(f"first files total count: {df_lst[0]['count'].sum()}")
print(f"first files total count: {df_lst[1]['count'].sum()}")
print(f"first files total count: {df_lst[2]['count'].sum()}")


first files: 2294
first files: 666637
first files: 4552895


In [12]:
i = 0
name = ['TMRC14_APAC_1', 'TMRC14_APAC_2', 'TMRC15_APAC_3']
for time_df in df_lst:
    time_df.to_csv(f"~/Coding/buntain/named_entity_data/{name[i]}.csv", index=False)
    i+=1