In [40]:
import numpy as np
from dataclasses import dataclass
import re
from pathlib import Path
import ijson
import pandas as pd
import textdistance

In [18]:
# Relative path of the JSON file that the parsing cell below opens.
filename = "../data/twitter-data-small.json"

In [19]:
@dataclass
class json_handler:
    """
    Helpers for normalising and comparing the location strings read from the JSON input.

    ``location_dict`` maps text to find (keys) to its replacement (values).
    It carries no type annotation, so ``@dataclass`` leaves it as a plain
    class attribute rather than turning it into an instance field.
    """
    location_dict = {
        "Australian Capital Territory": "act",
        "New South Wales": "nsw",
        "Victoria": "vic",
        "Sydney": "syd",
        "Melbourne": "mel",
        "Hobart": "hob",
        "Brisbane": "bri",
        "Queensland": "qld",
        "Tasmania": "tas",

        " - ": " ",
        ", ": " "
    }

    @classmethod
    def location(cls, string):
        """
        Normalise a location name.

        Every key of ``location_dict`` is applied in insertion order as a
        (case-sensitive) regular-expression pattern and replaced by its value,
        immediately repeated words are collapsed into one, and the result is
        lower-cased with surrounding whitespace removed.

        Args:
            string: the raw location text.

        Returns:
            The normalised location string.
        """
        location = string
        for key, value in cls.location_dict.items():
            location = re.sub(key, value, location)

        # Collapse a word that is immediately followed by itself ("syd syd" -> "syd").
        location = re.sub(r'\b(\w+)\b\s+\b\1\b', r'\1', location)

        return location.lower().strip()

    @staticmethod
    def location_distance(text: str, target: str) -> float:
        """
        Score how closely ``text`` matches ``target``.

        This is a static method: it uses no class state, and declaring it as a
        classmethod without a ``cls`` parameter made it impossible to call
        with two strings.

        Args:
            text: the string to compare.
            target: the string to compare against.

        Returns:
            The score produced by ``textdistance.jaro_winkler(text, target)``.
        """
        return textdistance.jaro_winkler(text, target)

In [20]:
@dataclass
class Twitter:
    """
    Record for one parsed JSON item: its id, its author id and its location.

    All three fields default to ``None`` and are filled in by the parsing loop.
    """
    _id: str = None
    author: str = None
    locat: str = None

    def __repr__(self):
        # One-line, pipe-separated summary in field order.
        return "{} | {} | {}".format(self._id, self.author, self.locat)

In [30]:
# (prefix, event) pair that the parsing loop treats as the start of a new item.
# NOTE(review): "FRIST_LINE" is presumably a typo for "FIRST_LINE"; the name is
# kept because other cells reference it.
FRIST_LINE = "item"
START = "start_map"

# Parser prefixes of the three values extracted from each item.
TWITTER_ID = "item._id"
AUTHOR_ID = "item.data.author_id"
LOCATION = "item.includes.places.item.full_name"

# Event name required by the field checks before a value is accepted.
HAS_VALUE = "string"

In [33]:
def start_json_item(prefix: str, event: str) -> bool:
    """Return True when (prefix, event) is the pair that opens a new item."""
    return (prefix, event) == (FRIST_LINE, START)
    
def is_twitter_id(prefix: str, event: str, value: object) -> bool:
    """
    Tell whether a parser event carries the item's id.

    Args:
        prefix: prefix reported by the parser for this event.
        event: event name reported by the parser.
        value: value attached to the event; may be of any type.

    Returns:
        True when ``prefix`` equals ``TWITTER_ID``, ``event`` equals
        ``HAS_VALUE`` and ``value`` is not None; False otherwise.
    """
    return prefix == TWITTER_ID and event == HAS_VALUE and value is not None

def is_author_id(prefix: str, event: str, value: object) -> bool:
    """
    Tell whether a parser event carries the item's author id.

    Args:
        prefix: prefix reported by the parser for this event.
        event: event name reported by the parser.
        value: value attached to the event; may be of any type.

    Returns:
        True when ``prefix`` equals ``AUTHOR_ID``, ``event`` equals
        ``HAS_VALUE`` and ``value`` is not None; False otherwise.
    """
    return prefix == AUTHOR_ID and event == HAS_VALUE and value is not None

def is_location(prefix: str, event: str, value: object) -> bool:
    """
    Tell whether a parser event carries the item's place name.

    Args:
        prefix: prefix reported by the parser for this event.
        event: event name reported by the parser.
        value: value attached to the event; may be of any type.

    Returns:
        True when ``prefix`` equals ``LOCATION``, ``event`` equals
        ``HAS_VALUE`` and ``value`` is not None; False otherwise.
    """
    return prefix == LOCATION and event == HAS_VALUE and value is not None

In [34]:
# Stream the JSON file event by event and collect one Twitter record per item.
# The file is opened in binary mode so that decoding is left to ijson instead
# of depending on the platform's default text encoding.
with open(filename, "rb") as f:
    # A single parser is created and iterated; the original built a second,
    # unused parser over the same file handle.
    parser = ijson.parse(f)
    current_chunk = []

    for prefix, event, value in parser:

        # A new item starts: append an empty record that later events fill in.
        if start_json_item(prefix, event):
            current_chunk.append(Twitter())

        elif is_twitter_id(prefix, event, value):
            current_chunk[-1]._id = np.int64(value)

        elif is_author_id(prefix, event, value):
            current_chunk[-1].author = np.int64(value)

        elif is_location(prefix, event, value):
            current_chunk[-1].locat = json_handler.location(value)

    # One row per record, one column per Twitter field.
    jdf = pd.DataFrame([item.__dict__ for item in current_chunk])

In [35]:
# Display the assembled frame (pandas abbreviates the middle rows).
jdf

Unnamed: 0,_id,author,locat
0,1412193387575316482,836119507173154816,australia
1,1412195752344883200,1399941819950006272,australia
2,1412189452361891840,3022979040,nsw australia
3,1412189999055790082,558259110,nsw australia
4,1412190244280012802,1158755742,nsw australia
...,...,...,...
710,1412198117932371969,1348502962050535428,canberra act
711,1412198454407794689,137315172,canberra act
712,1412185329184821253,3306424254,canberra act
713,1412190755452424209,7598552,braddon canberra


In [36]:
# List the distinct values of the normalised location column.
jdf.locat.unique()

array(['australia', 'nsw australia', 'central coast nsw',
       'nelson bay corlette nsw', 'eveleigh syd', 'kirribilli syd',
       'macquarie park syd', 'picton nsw', 'syd nsw', 'bathurst nsw',
       'byron bay nsw', 'cambewarra village nsw', 'coffs harbour nsw',
       'gloucester nsw', 'helensburgh nsw', 'kempsey nsw', 'manilla nsw',
       'murwillumbah', 'newcastle nsw', 'queanbeyan nsw', 'tamworth nsw',
       'toronto ontario', 'tweed heads nsw', 'whitton nsw',
       'wollongong nsw', 'torquay jan juc vic', 'vic australia',
       'shepparton mooroopna vic', 'gisborne vic', 'mel vic',
       'melton vic', 'pakenham vic', 'scoresby mel', 'sunbury vic',
       'sunshine mel', 'windsor mel', 'anglesea vic', 'ballarat vic',
       'bendigo vic', 'geelong vic', 'kilmore vic', 'mansfield vic',
       'wangaratta vic', 'warrnambool vic', 'gold coast qld',
       'sunshine coast qld', 'qld australia', 'rockhampton qld',
       'cairns qld', 'airlie beach cannonvale qld', 'toowoomba q

In [25]:
# Column dtypes, non-null counts and memory usage of the frame.
jdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     715 non-null    int64 
 1   author  715 non-null    int64 
 2   locat   715 non-null    object
dtypes: int64(2), object(1)
memory usage: 16.9+ KB


In [26]:
# Last five rows of the frame.
jdf.tail()

Unnamed: 0,_id,author,locat
710,1412198117932371969,1348502962050535428,canberra act
711,1412198454407794689,137315172,canberra act
712,1412185329184821253,3306424254,canberra act
713,1412190755452424209,7598552,braddon canberra
714,1412197569585848320,1397010827048194048,fyshwick canberra


In [27]:
# NOTE(review): scratch arithmetic — 20 * 1024 * 1024 (20 MiB in bytes) divided
# by 2.08; what 2.08 measures is not recorded in the notebook — confirm or delete.
20 * 1024 * 1024 / 2.08

10082461.538461538

In [28]:
# NOTE(review): scratch ratio; neither 17 nor 720 is explained in the
# notebook — confirm or delete.
17 / 720

0.02361111111111111

In [29]:
# NOTE(review): scratch arithmetic — 10082461 is the truncated result of the
# 20 MiB / 2.08 cell, scaled by 0.025 and divided by 1024; the purpose is not
# recorded in the notebook — confirm or delete.
10082461 * 0.025 / 1024 

246.15383300781252