# Parse raw JSON tweet files
Tweets are parsed with the `tweet-preprocessor` library (imported as `preprocessor`): https://github.com/s/preprocessor

In [None]:
!pip install tweet-preprocessor &> /dev/null

In [None]:
# Mount Google Drive at /content/drive so the project data on the drive
# can be accessed through the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Sanity check: list the contents of the project's data directory on the mounted drive.
!ls "drive/Shareddrives/AV_Twitter_Project/Topic Modeling/data"

'History of Automated Vehicle Technology - History of Automated Vehicle Technology.csv'
 Link_Events_10_day
'old data'
 opinion_modeling


In [None]:
import os
import glob
from datetime import datetime
import json
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import preprocessor as p

In [None]:
# Project root used as the base for every data path in this notebook.
# NOTE: the path is relative, so it only resolves when the process working
# directory is the parent of the `drive` mount point (Drive is mounted at /content/drive).
cwd = "drive/Shareddrives/AV_Twitter_Project/Topic Modeling"

In [None]:
# Collect the raw JSON event files and print them with their index.
data_path = os.path.join(
    cwd, "data/Link_Events_10_day/raw_json/text",
)
# glob returns paths in arbitrary, filesystem-dependent order; sort them so the
# printed indices and the later processing order are reproducible across runs.
file_paths = sorted(glob.glob(os.path.join(data_path, "*.json")))
if not file_paths:
    # Fail with an explicit message instead of a bare IndexError on file_paths[0].
    raise FileNotFoundError(f"no .json files found in {data_path}")
file_path = file_paths[0]

for i, path in enumerate(file_paths):
    print(f"{i}: {os.path.basename(path)}")

0: confused_tesla_incident_in_february.json
1: google_lexus_suv.json
2: introduction_of_tesla_autopilot.json
3: las_vegas_autonomous_shuttle_crash.json
4: march_1st_2019_highway_crash_fatality.json
5: tesla_model_s_fatal_crash_in_china.json
6: tesla_model_x_fatality.json
7: mcity_driverless_shuttle_launches_on_um_north_campus.json
8: uber_self_driving_volvo_crash.json
9: world_first_production_car_to_offer_level_3_automation.json
10: university_of_michigan_mcity.json
11: first_automated_vehicle_crash_fatality_takes_place.json


In [None]:
def remove_newline(text):
    """Insert a single space in front of every newline character in ``text``.

    Despite the name, newline characters are kept: each one is preceded by
    an extra space, and everything else is returned unchanged.
    """
    segments = text.split("\n")
    return " \n".join(segments)

def match_to_list(match):
    """Return the ``.match`` attribute of every entry in ``match`` as a list.

    ``match`` is an indexable, sized collection of preprocessor match
    objects, or ``None``; ``None`` yields an empty list.
    """
    if match is None:
        return []
    return [match[idx].match for idx in range(len(match))]

def process_item(item):
    """Turn one raw tweet entry into a flat record dict.

    Args:
        item: indexable sequence read positionally as
            [0] text, [1] receiver, [2] author,
            [3] date string in "%Y-%m-%d %H:%M:%S" format,
            [4] depth string (a "depth = " prefix is stripped),
            [5] parent id.

    Returns:
        dict with keys parent_id, text, tokens, author, receiver, date,
        depth, followed by one comma-separated string per entity type
        reported by ``p.parse`` (emojis, hashtags, mentions, numbers,
        reserved, smileys, urls).

    Raises:
        ValueError: if the date string does not match the expected format.
    """
    raw_text, receiver, author = item[0], item[1], item[2]
    timestamp = datetime.strptime(item[3], "%Y-%m-%d %H:%M:%S")
    depth = item[4].replace("depth = ", "")
    parent_id = item[5]

    text = remove_newline(raw_text)
    parsed = p.parse(text)

    # One joined string per entity type, in the order the columns are emitted.
    entity_fields = (
        "emojis", "hashtags", "mentions", "numbers",
        "reserved", "smileys", "urls",
    )
    entities = {}
    for field in entity_fields:
        entities[field] = ", ".join(match_to_list(getattr(parsed, field)))

    record = {
        "parent_id": parent_id,
        "text": text,
        "tokens": p.tokenize(text),
        "author": author,
        "receiver": receiver,
        "date": timestamp,
        "depth": depth,
    }
    record.update(entities)
    return record

def process_json(raw_json):
    """Convert a mapping of tweet id -> raw item into a list of flat records.

    Args:
        raw_json: mapping whose keys become ``tweet_id`` and whose values
            are passed to ``process_item``.

    Returns:
        list of dicts, one per entry in the mapping's iteration order; each
        dict has ``tweet_id`` first, followed by the fields returned by
        ``process_item``.
    """
    return [
        {"tweet_id": tweet_id, **process_item(item)}
        for tweet_id, item in raw_json.items()
    ]

In [None]:
# process all json files
save_path = os.path.join(cwd, "data/opinion_modeling", "raw_csv")
# makedirs(exist_ok=True) also creates missing parent directories and is
# safe to re-run, unlike an exists() check followed by mkdir().
os.makedirs(save_path, exist_ok=True)

for i, file_path in enumerate(file_paths):
    # Swap only the trailing extension; str.replace would also rewrite a
    # ".json" occurring in the middle of the file name.
    file_name = os.path.splitext(os.path.basename(file_path))[0] + ".csv"
    print(f"processing {i}: {file_name}")

    # Decode explicitly instead of relying on the platform default encoding.
    # Assumes the files are UTF-8 (the standard JSON encoding) - confirm if
    # the raw dumps were produced differently.
    with open(file_path, encoding="utf-8") as f:
        raw_json = json.load(f)

    processed_data = process_json(raw_json)
    df_data = pd.DataFrame(processed_data)

    df_data.to_csv(os.path.join(save_path, file_name), index=False)

processing 0: confused_tesla_incident_in_february.csv
processing 1: mcity_driverless_shuttle_launches_on_um_north_campus.csv
processing 2: las_vegas_autonomous_shuttle_crash.csv
processing 3: introduction_of_tesla_autopilot.csv
processing 4: university_of_michigan_mcity_.csv
processing 5: world_first_production_car_to_offer_level_3_automation.csv
processing 6: google_lexus_suv.csv
processing 7: tesla_model_x_fatality.csv
processing 8: march_1st_2019_highway_crash_fatality.csv
processing 9: tesla_model_s_fatal_crash_in_china.csv
processing 10: first_automated_vehicle_crash_fatality__takes_place.csv
processing 11: uber_self_driving_volvo_crash.csv
