In [1]:
import csv
import io
import json
import os
import zipfile

import boto3
import matplotlib.pyplot as plt
import pandas as pd
import spacy

In [2]:
## s3 host to access data from UMIACS
s3_host = 'https://obj.umiacs.umd.edu'

## Credentials are taken from the environment so that real keys never have to be
## typed into (and saved with) the notebook. The "xxxxx" placeholders are only a
## fallback for when the variables are unset; they will not authenticate.
access_key_id = os.environ.get("UMIACS_S3_ACCESS_KEY_ID", "xxxxx")
secret_access_key = os.environ.get("UMIACS_S3_SECRET_ACCESS_KEY", "xxxxx")

## client shared by the loader functions below
s3 = boto3.client('s3',
                  endpoint_url=s3_host,
                  aws_access_key_id=access_key_id,
                  aws_secret_access_key=secret_access_key)


In [3]:
## processes tmrc dataset
def process_tmrc(folder_prefix, client=None):
    """Count characters, words and tweets for one TMRC campaign folder.

    Every ``.zip`` object under ``folder_prefix`` in the ``twitter.tmrc`` bucket
    is downloaded, and every archive member whose name ends in ``-tweet.json``
    is parsed as a JSON list of records carrying ``record["tweet"]["tweet_text"]``.

    Parameters
    ----------
    folder_prefix : str
        Key prefix used to list the campaign's objects.
    client : optional
        Object offering ``get_paginator`` and ``get_object`` the way a boto3 S3
        client does. Defaults to the notebook-level ``s3`` client.

    Returns
    -------
    tuple
        ``(char_count, word_count, sentence_count)``. Words are obtained by
        splitting on a single space, and ``sentence_count`` goes up by one per
        tweet.

    Raises
    ------
    json.JSONDecodeError
        If a ``-tweet.json`` member decodes as UTF-8 but is not valid JSON.
    """
    if client is None:
        client = s3
    bucket = "twitter.tmrc"

    char_count = 0
    word_count = 0
    sentence_count = 0

    ## collect all the filenames to be processed; one list_objects_v2 response
    ## holds at most 1000 keys, so walk every page instead of only the first
    files = []
    paginator = client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=folder_prefix):
        for obj in page.get('Contents', []):
            object_key = obj['Key']
            if object_key.endswith('.zip'):
                files.append(object_key)

    ##  process each file
    for file in files:
        zip_object = client.get_object(Bucket=bucket, Key=file)
        zip_contents = zip_object['Body'].read()
        with zipfile.ZipFile(io.BytesIO(zip_contents), 'r') as zip_file:
            for file_info in zip_file.infolist():
                file_name = file_info.filename
                ## we are only interested in the tweet file
                if not file_name.endswith("-tweet.json"):
                    continue
                with zip_file.open(file_info) as json_file:
                    raw_bytes = json_file.read()
                try:
                    json_data = raw_bytes.decode('utf-8')
                except UnicodeDecodeError:
                    ## skip the member; falling through would re-parse the text
                    ## of the previously decoded file and count it twice
                    print(f"skipping {file_name}: not valid UTF-8 text")
                    continue
                parsed_data = json.loads(json_data)
                for ind_data in parsed_data:
                    tweet = ind_data["tweet"]["tweet_text"]
                    char_count += len(tweet)
                    word_count += len(tweet.split(" "))
                    sentence_count += 1
    return (char_count, word_count, sentence_count)

In [4]:
## counting helper to use apply
def helper(tweet, counts):
    """Fold one tweet into the running totals held in ``counts``.

    ``tweet`` is indexed with ``"tweet_text"``; ``counts`` is a mapping with the
    keys ``char_count``, ``word_count`` and ``sentence_count`` and is updated in
    place. Nothing is returned.
    """
    text = tweet["tweet_text"]
    ## one entry has "nan" as the tweet text, so anything that is not a plain
    ## str is stringified before it is measured
    if type(text) is not str:
        text = str(text)
    counts['char_count'] += len(text)
    ## words are the pieces between single spaces
    pieces = text.split(" ")
    counts['word_count'] += len(pieces)
    ## every tweet counts as one sentence
    counts["sentence_count"] += 1


In [5]:
def getdf_twitterei(file, client=None):
    """Load every CSV inside one Twitter EI zip archive into a single DataFrame.

    The archive is fetched from the ``twitter.ei`` bucket. Each member is read
    with ``pd.read_csv``; members that cannot be parsed are reported and
    skipped, and the remaining frames are stacked row-wise.

    Parameters
    ----------
    file : str
        Key of the zip archive in the bucket.
    client : optional
        Object offering ``get_object`` the way a boto3 S3 client does.
        Defaults to the notebook-level ``s3`` client.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id`` (copied from ``tweetid``), ``tweet_language`` and
        ``tweet_text``.

    Raises
    ------
    ValueError
        If no member of the archive could be read as CSV.
    KeyError
        If the combined data lacks one of the expected columns.
    """
    if client is None:
        client = s3
    zip_object = client.get_object(Bucket='twitter.ei', Key=file)
    zip_contents = zip_object['Body'].read()

    ## gather one frame per member and concatenate once at the end, so an
    ## archive holding several CSVs keeps all of them rather than only the last
    frames = []
    with zipfile.ZipFile(io.BytesIO(zip_contents), 'r') as zip_file:
        for file_info in zip_file.infolist():
            if file_info.is_dir():
                continue
            with zip_file.open(file_info) as csv_file:
                try:
                    frames.append(pd.read_csv(csv_file))
                except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError):
                    ## the name lives on the ZipInfo; the opened stream has no
                    ## `filename` attribute
                    print(f"{file_info.filename} is not a csv file")

    if not frames:
        raise ValueError(f"no readable csv file found in {file}")

    df = pd.concat(frames, axis=0, ignore_index=True)
    df["tweet_id"] = df["tweetid"]
    return df[["tweet_id", "tweet_language", "tweet_text"]]

def process_twitterei(df):
    """Count characters, words and tweets in a Twitter EI DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet_text`` column. Values that are not ``str``
        (for example a missing text read as NaN) are stringified first.

    Returns
    -------
    tuple
        ``(char_count, word_count, sentence_count)``. Words are obtained by
        splitting on a single space, and ``sentence_count`` goes up by one per
        row. An empty frame yields ``(0, 0, 0)``.
    """
    char_count = 0
    word_count = 0
    sentence_count = 0
    ## Walk the text column directly instead of df.apply(..., axis=1) with a
    ## side-effecting callback: apply builds a Series for every row, and how
    ## often pandas invokes the callback (e.g. on an empty frame) is not
    ## something the totals should depend on.
    for raw_text in df["tweet_text"]:
        if not isinstance(raw_text, str):   ## one entry that has "nan" as the tweet text
            raw_text = str(raw_text)
        char_count += len(raw_text)
        word_count += len(raw_text.split(" "))
        sentence_count += 1
    return (char_count, word_count, sentence_count)




In [6]:
## TMRC campaigns: folder prefixes, each handed to process_tmrc
tmrc_folder_prefix_lst = [
    "August_2022/TMRC14_APAC_1/",
    "August_2022/TMRC14_APAC_2/",
    "October_2022/TMRC15_APAC_3/",
]

## Twitter EI campaigns: despite the "prefix" in the name, each entry is the
## full key of one zip archive, handed to getdf_twitterei
twitterei_folder_prefix_lst = [
    "2019_08/china_082019_1/china_082019_1_tweets_csv_unhashed.zip",
    "2019_08/china_082019_2/china_082019_2_tweets_csv_unhashed.zip",
    "2019_08/china_082019_3/china_082019_3_tweets_csv_unhashed.zip",
    "2020_05/china_052020/china_052020_tweets_csv_unhashed.zip",
    "2020_09/thailand_092020/thailand_092020_tweets_csv_unhashed.zip",
]

## maps campaign_name -> the count tuple computed for that campaign
campaign_data_dict = {}


In [7]:
## run the TMRC counter over every campaign folder and store its result
for f in tmrc_folder_prefix_lst:
    ## the campaign name is the second "/"-separated component of the prefix
    name = f.split('/')[1]
    print(name)
    campaign_data_dict.update({name: process_tmrc(f)})
    print(campaign_data_dict[name])



TMRC14_APAC_1
(8844962, 1206149, 66251)
TMRC14_APAC_2
(34977662, 4429453, 274207)
TMRC15_APAC_3
(15545966, 2021695, 131046)


In [8]:
## load each Twitter EI archive, count it, and store the result
for f in twitterei_folder_prefix_lst:
    ## the campaign name is the second "/"-separated component of the key
    name = f.split('/')[1]
    print(name)
    df = getdf_twitterei(f)
    campaign_data_dict.update({name: process_twitterei(df)})
    print(campaign_data_dict[name])

china_082019_1


  df = pd.read_csv(csv_file)


(162553854, 22376563, 1898108)
china_082019_2


  df = pd.read_csv(csv_file)


(151516180, 22048287, 1701257)
china_082019_3


  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)


(198223929, 22934200, 2875334)
china_052020
(33048639, 1861761, 348608)
thailand_092020
(1937278, 126841, 21385)


In [11]:
## show every campaign name followed by its stored counts, one per line
for k, v in campaign_data_dict.items():
    print(k, v, sep="\n")

TMRC14_APAC_1
(8844962, 1206149, 66251)
TMRC14_APAC_2
(34977662, 4429453, 274207)
TMRC15_APAC_3
(15545966, 2021695, 131046)
china_082019_1
(162553854, 22376563, 1898108)
china_082019_2
(151516180, 22048287, 1701257)
china_082019_3
(198223929, 22934200, 2875334)
china_052020
(33048639, 1861761, 348608)
thailand_092020
(1937278, 126841, 21385)


In [10]:
## running totals per data source; the _c / _w / _s suffixes follow the order
## of the stored tuples (index 0, 1, 2)
tmrc_c = tmrc_w = tmrc_s = 0
twitterei_c = twitterei_w = twitterei_s = 0

for k, v in campaign_data_dict.items():
    ## names starting with "TMRC" go to the TMRC totals, everything else to TwitterEI
    if not k.startswith("TMRC"):
        twitterei_c, twitterei_w, twitterei_s = (twitterei_c + v[0],
                                                 twitterei_w + v[1],
                                                 twitterei_s + v[2])
        continue
    tmrc_c, tmrc_w, tmrc_s = tmrc_c + v[0], tmrc_w + v[1], tmrc_s + v[2]

print("TMRC")
print(f"{tmrc_c}, {tmrc_w}, {tmrc_s}")
print("TwitterEI")
print(f"{twitterei_c}, {twitterei_w}, {twitterei_s}")
print("ALL")
print(f"{tmrc_c + twitterei_c}, {tmrc_w + twitterei_w}, {tmrc_s + twitterei_s}")

TMRC
59368590, 7657297, 471504
TwitterEI
547279880, 69347652, 6844692
ALL
606648470, 77004949, 7316196
