# Key Words
json, reading from file, json lines, dictionary, collections, counter, dictionary items, pandas counting, filling in missing values, horizontal bar plots

# Reading JSON Lines

In [13]:
%reset -f
import json

# NOTE: this is an absolute, machine-specific path; point it at the local copy of the dataset.
path="/home/ruslan/Desktop/Applied_Data_Science_With_Python/Python For Data Analysis/Chapter 2/bitly_usagov/bitly_usagov.txt"

# The file is in JSON Lines format: every line is a separate JSON object,
# so it has to be parsed line by line rather than with a single json.load().
# Source 1: https://stackoverflow.com/questions/12451431/loading-and-parsing-a-json-file-with-multiple-json-objects
# Source 2: https://jsonlines.org/
# The context manager closes the file handle as soon as parsing is done
# (a bare open() inside the comprehension would leave it open).
with open(path) as json_lines_file:
    records = [json.loads(line) for line in json_lines_file]

# Get the time zones from the records.
# Not every record has a 'tz' field, so records without one are skipped.
list_time_zone = [rec['tz'] for rec in records if 'tz' in rec]

# Dictionary of Time Zone Counts

In [14]:
# Count the number of time zones using a function
# Return a dictionary of timezone:count

def count_time_zones(list_time_zone):
    """Count how many times each time zone occurs.

    Parameters
    ----------
    list_time_zone : iterable
        Time zone values (hashable), one entry per occurrence.

    Returns
    -------
    dict
        Maps each distinct time zone to the number of times it was seen.
    """
    tally = {}

    for zone in list_time_zone:
        # get() yields 0 the first time a zone is seen, so one expression
        # covers both the "new key" and the "existing key" case.
        tally[zone] = tally.get(zone, 0) + 1

    return tally

# Build the time zone -> count dictionary from the extracted time zones.
dict_time_zone_count = count_time_zones(list_time_zone)

# Print the full dictionary of counts.
print(dict_time_zone_count)

{'America/New_York': 1251, 'America/Denver': 191, 'America/Sao_Paulo': 33, 'Europe/Warsaw': 16, '': 521, 'America/Los_Angeles': 382, 'Asia/Hong_Kong': 10, 'Europe/Rome': 27, 'Africa/Ceuta': 2, 'Europe/Madrid': 35, 'Asia/Kuala_Lumpur': 3, 'Asia/Nicosia': 1, 'Europe/London': 74, 'Pacific/Honolulu': 36, 'America/Chicago': 400, 'Europe/Malta': 2, 'Europe/Lisbon': 8, 'Europe/Paris': 14, 'Europe/Copenhagen': 5, 'America/Mazatlan': 1, 'Europe/Dublin': 3, 'Europe/Brussels': 4, 'America/Vancouver': 12, 'Europe/Amsterdam': 22, 'Europe/Prague': 10, 'Europe/Stockholm': 14, 'America/Anchorage': 5, 'Asia/Bangkok': 6, 'Europe/Berlin': 28, 'America/Rainy_River': 25, 'Europe/Budapest': 5, 'Asia/Tokyo': 37, 'Europe/Vienna': 6, 'America/Phoenix': 20, 'Asia/Jerusalem': 3, 'Asia/Karachi': 3, 'America/Bogota': 3, 'America/Indianapolis': 20, 'America/Montreal': 9, 'Asia/Calcutta': 9, 'Europe/Skopje': 1, 'Asia/Beirut': 4, 'Australia/NSW': 6, 'Chile/Continental': 6, 'America/Halifax': 4, 'America/Edmonton': 6,

# First n counts using Pure Python

In [15]:
# Get top n of the counts

def top_n(dict_time_zone_count,n=10):
    """Return the n largest counts as (count, time zone) pairs.

    Parameters
    ----------
    dict_time_zone_count : dict
        Maps time zone -> count.
    n : int, optional
        Number of pairs to return (default 10). Values <= 0 give an empty list.

    Returns
    -------
    list of tuple
        (count, time zone) pairs sorted in descending order; ties on the count
        are ordered by time zone, also descending, because whole tuples are compared.
    """
    # Put the count first so that tuple comparison sorts by count.
    value_key_pairs = sorted(
        ((count, tz) for tz, count in dict_time_zone_count.items()),
        reverse=True,
    )
    # Slice by the requested n (previously hard-coded to 10, ignoring the argument).
    # Clamp at 0 so a negative n does not slice from the end of the list.
    return value_key_pairs[:max(n, 0)]


# Print the ten largest counts as (count, time zone) pairs.
print(top_n(dict_time_zone_count,10))
    

[(1251, 'America/New_York'), (521, ''), (400, 'America/Chicago'), (382, 'America/Los_Angeles'), (191, 'America/Denver'), (74, 'Europe/London'), (37, 'Asia/Tokyo'), (36, 'Pacific/Honolulu'), (35, 'Europe/Madrid'), (33, 'America/Sao_Paulo')]


# First n Counts using Counter from Collections

In [16]:
# Use the Counter class from the collections module

from collections import Counter

# Counter tallies the occurrences of every time zone in a single call.
time_zone_counts = Counter(list_time_zone)

# most_common(10) gives the ten highest counts as (time zone, count) pairs.
time_zone_counts.most_common(10)

[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]

# First n Counts using Pandas

In [17]:
import pandas as pd

# Build a DataFrame from the list of parsed records.
frame_records = pd.DataFrame(records)

# Count how often each value occurs in the 'tz' column.
# value_counts() leaves out missing (NaN) entries by default, so records
# without a 'tz' value are not part of this tally.
time_zone_counts = frame_records['tz'].value_counts()

# Last expression of the cell: display the resulting Series.
time_zone_counts

America/New_York             1251
                              521
America/Chicago               400
America/Los_Angeles           382
America/Denver                191
                             ... 
America/St_Kitts                1
America/Tegucigalpa             1
Africa/Lusaka                   1
America/Argentina/Cordoba       1
America/La_Paz                  1
Name: tz, Length: 97, dtype: int64

# Filling the missing and NA values

In [18]:
# Label the two kinds of absent time zones explicitly:
#   - entries with no 'tz' value at all (NaN in the frame) become 'Missing'
#   - entries whose 'tz' is an empty string become 'Unknown'
clean_time_zone = (
    frame_records['tz']
    .fillna('Missing')
    .replace('', 'Unknown')
)

# Recount with the relabelled values included.
time_zone_counts = clean_time_zone.value_counts()

time_zone_counts

America/New_York             1251
Unknown                       521
America/Chicago               400
America/Los_Angeles           382
America/Denver                191
                             ... 
Africa/Casablanca               1
America/Argentina/Cordoba       1
America/Monterrey               1
Europe/Volgograd                1
America/La_Paz                  1
Name: tz, Length: 98, dtype: int64