In [1]:
import boto3
import zipfile
import json
import io
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from io import BytesIO

In [2]:
s3_host = 'https://obj.umiacs.umd.edu'
access_key_id = "xxxxx"
secret_access_key = "xxxxx"

s3 = boto3.client('s3', 
                  endpoint_url=s3_host, 
                  aws_access_key_id=access_key_id, 
                  aws_secret_access_key=secret_access_key)

bucket = 'twitter.ei'

In [3]:
model_dict = {'en': spacy.load('en_core_web_sm'), 'weaker': spacy.load('xx_ent_wiki_sm'), 'better':spacy.load('xx_sent_ud_sm'), 
              'pt': spacy.load('pt_core_news_sm'), 'zh': spacy.load('zh_core_web_sm'), 'es': spacy.load('es_core_news_sm'),
              'ko': spacy.load('ko_core_news_sm'), 'ja': spacy.load('ja_core_news_sm'), 'fr': spacy.load('fr_core_news_sm'),
              'ru': spacy.load('ru_core_news_sm')}

In [4]:
def process(file):
    # dates = []
    zip_object = s3.get_object(Bucket=bucket, Key=file)
    zip_contents = zip_object['Body'].read()
    zip_file = zipfile.ZipFile(io.BytesIO(zip_contents), 'r')
    for file_info in zip_file.infolist():
        with zip_file.open(file_info) as csv_file:
            df = None
            try:
                if df == None:
                    df = pd.read_csv(csv_file)
                else:
                    df = pd.concat([df, pd.read_csv(csv_file)], axis=0)
            except:
                print(f"{csv_file.filename} is not a csv file")
    return df





In [5]:
def helper(row, entity_dict):
    lang = row['tweet_language'] 
    raw_text = row['tweet_text']
    if type(raw_text) != str:   ## one entry that has "nan" as the tweet text
        print("this is not a string")
        print(raw_text)
        raw_text = str(raw_text)
    if lang in model_dict:
        NER = model_dict[lang]
    else:
        NER = model_dict['better']
    ner_text = NER(raw_text)
    for word in ner_text.ents:
        tuple = (word.text, word.label_, lang)
        if tuple in entity_dict:
            entity_dict[tuple]+=1
        else:
            entity_dict[tuple]=1

In [6]:
folder_prefix_lst = ["2019_08/china_082019_1/china_082019_1_tweets_csv_unhashed.zip",
                     "2019_08/china_082019_2/china_082019_2_tweets_csv_unhashed.zip",
                     "2019_08/china_082019_3/china_082019_3_tweets_csv_unhashed.zip",
                     "2020_05/china_052020/china_052020_tweets_csv_unhashed.zip", 
                     "2020_09/thailand_092020/thailand_092020_tweets_csv_unhashed.zip"]
dict_lst = []

for f in folder_prefix_lst:
    print(f)
    df = process(f)
    entity_dict = dict()
    df.apply(lambda row: helper(row, entity_dict), axis=1)
    dict_lst.append(entity_dict)



2019_08/china_082019_3/china_082019_3_tweets_csv_unhashed.zip


  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)


this is not a string
nan


In [7]:
# dict to df
df_lst = []
for d in dict_lst:
    keys = list(d.keys())
    key_tuples = list(zip(*keys))  # Unpack tuples into separate lists

    # Create a DataFrame
    entity_df = pd.DataFrame({
        'word': key_tuples[0],
        'type': key_tuples[1],
        'lang': key_tuples[2],
        'count': list(d.values())
    })
    df_lst.append(entity_df)

In [8]:
df_lst[0] = df_lst[0].sort_values(by='count', ascending=False)
print(df_lst[0])

                             word         type lang   count
18                             香港          GPE   zh  169678
106                           郭文贵       PERSON   zh  140805
1                              中国          GPE   zh   70395
67                             美国          GPE   zh   53575
570                         Likes       PERSON   en   26110
...                           ...          ...  ...     ...
203634                        9IG     QUANTITY   zh       1
203632             RT @jentikhati          ORG   en       1
203631  Best International Artist  WORK_OF_ART   en       1
203630                        一证人     CARDINAL   zh       1
457604              co/cPPww3A3fJ       PERSON   zh       1

[457605 rows x 4 columns]


In [27]:
df_lst[1] = df_lst[1].sort_values(by='count', ascending=False)
print(df_lst[1])

                             word      type lang  count
11                       Thailand       GPE   en     23
18                              #  CARDINAL   en     16
27                           Thai      NORP   en     10
12                  Queen Suthida    PERSON   en     10
44   Grand Royal Barge Procession       ORG   en      8
..                            ...       ...  ...    ...
97                         Madrid       GPE   en      1
98                     last night      TIME   en      1
99                          Spoti    PERSON   en      1
100                             🌲    PERSON   en      1
216           New Eastern Outlook      NORP   en      1

[217 rows x 4 columns]


In [28]:
df_lst[2].sort_values(by='count', ascending=False)
print(df_lst[2])

IndexError: list index out of range

In [None]:
df_lst[3].sort_values(by='count', ascending=False)
print(df_lst[3])

In [None]:
df_lst[4].sort_values(by='count', ascending=False)
print(df_lst[4])

In [9]:
title = ["china_082019_1", "china_082019_2", "china_082019_3", "china_052020", "thailand_092020"]

i = 0
for time_df in df_lst:
    time_df.to_csv(f"~/Coding/buntain/named_entity_data/twitterrei_{title[i]}.csv", index=False)
    i+=1