In [55]:
import time
from shapely.geometry import shape, MultiPolygon, Point, Polygon
import geopandas as gpd
import json
import pandas as pd

import util

In [56]:
# Capture the wall-clock start time; the last cell subtracts it from
# time.time() to report how long the whole notebook took to run.
start_time = time.time()

# Language-code reference data: https://datahub.io/core/language-codes
# (presumably where ./data/language-codes_csv.csv was downloaded from -- confirm)

In [57]:
# Build the {language code: language name} dict (e.g. 'en' -> 'English')
# from the language-codes CSV, using the project-local helper in util.
lang_map = util.get_language_code_map('./data/language-codes_csv.csv')

In [58]:
# Sanity check: show the loaded mapping of two-letter codes to language names.
lang_map

{'aa': 'Afar',
 'ab': 'Abkhazian',
 'ae': 'Avestan',
 'af': 'Afrikaans',
 'ak': 'Akan',
 'am': 'Amharic',
 'an': 'Aragonese',
 'ar': 'Arabic',
 'as': 'Assamese',
 'av': 'Avaric',
 'ay': 'Aymara',
 'az': 'Azerbaijani',
 'ba': 'Bashkir',
 'be': 'Belarusian',
 'bg': 'Bulgarian',
 'bh': 'Bihari languages',
 'bi': 'Bislama',
 'bm': 'Bambara',
 'bn': 'Bengali',
 'bo': 'Tibetan',
 'br': 'Breton',
 'bs': 'Bosnian',
 'ca': 'Catalan; Valencian',
 'ce': 'Chechen',
 'ch': 'Chamorro',
 'co': 'Corsican',
 'cr': 'Cree',
 'cs': 'Czech',
 'cu': 'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic',
 'cv': 'Chuvash',
 'cy': 'Welsh',
 'da': 'Danish',
 'de': 'German',
 'dv': 'Divehi; Dhivehi; Maldivian',
 'dz': 'Dzongkha',
 'ee': 'Ewe',
 'el': 'Greek, Modern (1453-)',
 'en': 'English',
 'eo': 'Esperanto',
 'es': 'Spanish; Castilian',
 'et': 'Estonian',
 'eu': 'Basque',
 'fa': 'Persian',
 'ff': 'Fulah',
 'fi': 'Finnish',
 'fj': 'Fijian',
 'fo': 'Faroese',
 'fr': 'French',
 'fy'

In [59]:
#########################################
# TODO: placeholder loader -- to be filled in / replaced with stream/chunk
# processing. In the multi-core setting this block will change, and its
# output may no longer be a single dictionary; adjust the downstream cells
# once the JSON stream/chunk parsing code is completed.
with open('./data/smallTwitter.json', mode='r', encoding='utf-8') as tweets_file:
    # Read the whole file and parse it in one go (small sample dataset).
    twitter_data = json.loads(tweets_file.read())

########################################

In [60]:
# twitter_data

In [61]:
# Extract the key fields (id, language code, coordinates) from every tweet
# record, collected column-wise.
# NOTE: the access pattern may change a little depending on how the JSON
# chunk is handed to each processor by the loading code above; this will
# probably become a function once the input format is finalized.
tweet_rows = twitter_data['rows']
rows_dict = {
    'tweet_id': [row['id'] for row in tweet_rows],
    'language': [row['doc']['metadata']['iso_language_code']
                 for row in tweet_rows],
    'coordinates': [row['doc']['coordinates'] for row in tweet_rows],
}
# Whatever replaces the extraction above should still produce a dataframe.
df_tweets = pd.DataFrame(rows_dict)

# Clean the frame with the project helper. Judging by the displayed result
# it maps language codes to names via lang_map and keeps plain coordinate
# lists -- confirm against util.process_df.
df_tweets = util.process_df(df_tweets, lang_map)

# Debug aid: insert an artificial row with an invalid language code.
# df_tweets.loc[df_tweets.shape[0]] = ['temp', 'zz', {
#     'type': 'Point', 'coordinates': [151.211, 33]}]

# Tally of language codes that could not be matched (see util.count_unmatched).
unmathced = util.count_unmatched(df_tweets)

In [62]:
# Inspect the cleaned tweets: one row per tweet with id, language name and coordinates.
df_tweets

Unnamed: 0,tweet_id,language,coordinates
227,1212162687049883648,English,"[151.211, -33.86]"
891,1212166639598395394,English,"[151.20797, -33.86751]"
1082,1212167754888953856,English,"[151.27053, -33.9005]"
1095,1212167790028705793,English,"[151.17834389, -33.93467816]"
1301,1212168948503863296,English,"[151.072247, -33.8473998]"
1339,1212169394958323712,English,"[151.20797, -33.86751]"
2093,1212173497167794180,English,"[151.0655, -33.81967]"
2358,1212175152949022720,English,"[151.2102003, -33.85999135]"
3073,1212179185520627712,English,"[151.20797, -33.86751]"
3317,1212180379873898497,English,"[151.20797, -33.86751]"


In [63]:
# Inspect the result of util.count_unmatched; an empty defaultdict is shown for this sample.
unmathced

defaultdict(int, {})

In [64]:
############################################################
# TODO -- TO BE CHANGED.
# Either resolve the boundary issues while keeping the geopandas approach,
# or switch to a plain for-loop approach (probably the easier option).
# The grid location should end up as an extra column on 'df_tweets'.
# Keep the resulting frame name (tweets_with_cells) and the 'cells_id'
# column name unchanged for convenience; the later cells rely on them.

# Tweet-to-grid matching: load the Sydney grid definition.
with open('./data/sydGrid.json', mode='r', encoding='utf-8') as grid_file:
    syd_grid = json.load(grid_file)

# One id and one polygon (built from the first coordinate ring) per feature.
grid_features = syd_grid['features']
syd_grid_id = [feature['properties']['id'] for feature in grid_features]
syd_grid_coorindates = [
    Polygon(feature['geometry']['coordinates'][0])
    for feature in grid_features
]

# Grid cells as a GeoDataFrame: 'cells_id' plus the 'geometry' column.
geodata = gpd.GeoDataFrame(
    {'cells_id': syd_grid_id}, geometry=syd_grid_coorindates)

# Tweets as a GeoDataFrame with one Point per tweet.
coords = list(map(Point, df_tweets['coordinates']))
gdf_locations = gpd.GeoDataFrame(df_tweets, geometry=coords)

# The spatial join requires the rtree or pygeos package (installable with
# pip). rtree was not working for some reason; pygeos works but emits
# compatibility warnings against the shapely package. On Windows, shapely
# was installed independently in order to install geopandas. In the Linux
# environment geopandas and all of its dependencies will be installed via
# conda or pip, which should resolve the compatibility issue. For more
# information, see https://github.com/geopandas/geopandas/issues/2355
#
# Left-join each tweet to the cell it lies within, then discard the tweets
# that did not match any cell.
tweets_with_cells = gpd.sjoin(
    gdf_locations, geodata, how='left', predicate='within'
).dropna(subset=['cells_id'])
############################################################

In [65]:
# Inspect the matched tweets. NOTE(review): index_right / cells_id display as
# floats (e.g. 19.0) -- presumably a side effect of the left join producing
# NaN for unmatched rows before dropna; confirm.
tweets_with_cells

Unnamed: 0,tweet_id,language,coordinates,geometry,index_right,cells_id
227,1212162687049883648,English,"[151.211, -33.86]",POINT (151.21100 -33.86000),4.0,19.0
891,1212166639598395394,English,"[151.20797, -33.86751]",POINT (151.20797 -33.86751),4.0,19.0
1082,1212167754888953856,English,"[151.27053, -33.9005]",POINT (151.27053 -33.90050),0.0,23.0
1095,1212167790028705793,English,"[151.17834389, -33.93467816]",POINT (151.17834 -33.93468),4.0,19.0
1301,1212168948503863296,English,"[151.072247, -33.8473998]",POINT (151.07225 -33.84740),5.0,18.0
1339,1212169394958323712,English,"[151.20797, -33.86751]",POINT (151.20797 -33.86751),4.0,19.0
2358,1212175152949022720,English,"[151.2102003, -33.85999135]",POINT (151.21020 -33.85999),4.0,19.0
3073,1212179185520627712,English,"[151.20797, -33.86751]",POINT (151.20797 -33.86751),4.0,19.0
3317,1212180379873898497,English,"[151.20797, -33.86751]",POINT (151.20797 -33.86751),4.0,19.0
3885,1212183787771170817,English,"[151.0355168, -33.8392224]",POINT (151.03552 -33.83922),9.0,14.0


In [66]:
# final formatting
# count the total # of tweets in each cell (columns: cells_id, '#Total Tweets')
df_total_tweets = util.count_total_tweets(tweets_with_cells)

# count # of occurrences of each language in each cell
df_language_counts = util.count_language_counts(tweets_with_cells)

# flatten language counts into a dict keyed by cell id
lang_counts = util.flatten_language_counts(df_language_counts)

# make df with the top-10 column in the format shown in the assignment spec
df_top10 = util.df_format_top_10(lang_counts)

# final output df format: one row per cell with its total and its top-10 languages
df_final = df_total_tweets.merge(df_top10, on='cells_id')
# df_final.to_csv('./results/output.csv', index=False)

In [67]:
# Inspect the per-cell tweet totals (cells_id, '#Total Tweets').
df_total_tweets

Unnamed: 0,cells_id,#Total Tweets
0,14.0,1
1,18.0,2
2,19.0,10
3,23.0,1


In [68]:
# Inspect the per-cell, per-language counts (cells_id, language, size).
df_language_counts

Unnamed: 0,cells_id,language,size
0,14.0,English,1
1,18.0,English,2
2,19.0,English,10
3,23.0,English,1


In [69]:
# Inspect the flattened counts: a dict keyed by cell id holding (language, count) tuples.
lang_counts

{14.0: [[('English', 1)]],
 18.0: [[('English', 2)]],
 19.0: [[('English', 10)]],
 23.0: [[('English', 1)]]}

In [70]:
# Inspect the formatted 'Top 10 languages & tweets' column per cell.
df_top10

Unnamed: 0,cells_id,Top 10 languages & tweets
0,14.0,"[(English, 1)]"
1,18.0,"[(English, 2)]"
2,19.0,"[(English, 10)]"
3,23.0,"[(English, 1)]"


In [71]:
# Inspect the final table: per-cell totals merged with the top-10 language column.
df_final

Unnamed: 0,cells_id,#Total Tweets,Top 10 languages & tweets
0,14.0,1,"[(English, 1)]"
1,18.0,2,"[(English, 2)]"
2,19.0,10,"[(English, 10)]"
3,23.0,1,"[(English, 1)]"


In [72]:
# NOTE: despite its name, end_time holds the elapsed seconds since start_time
# (set in the first code cell after the imports), not a timestamp.
end_time = time.time() - start_time
print(f'{end_time} secs')
## faster when run as a script: roughly 0.5-0.7 secs

1.660745620727539 secs
