According to the assignment spec, you can ignore tweets that:
- have 'null' or 'undefined' ('und') language
- have no location information
- are made outside of the Grid.

In [55]:
import json
import pandas as pd
import numpy as np

# language code -> language name mapping

In [56]:
# https://datahub.io/core/language-codes

lang_map = pd.read_csv('./data/language-codes_csv.csv')
lang_map.loc[lang_map.shape[0]] = ['in', 'Indonesian']
lang_map.loc[lang_map.shape[0]] = ['iw', 'Former Hebrew']

lang_map = {code:name for code,name in zip(lang_map['alpha2'], lang_map['English'])}
lang_map

{'aa': 'Afar',
 'ab': 'Abkhazian',
 'ae': 'Avestan',
 'af': 'Afrikaans',
 'ak': 'Akan',
 'am': 'Amharic',
 'an': 'Aragonese',
 'ar': 'Arabic',
 'as': 'Assamese',
 'av': 'Avaric',
 'ay': 'Aymara',
 'az': 'Azerbaijani',
 'ba': 'Bashkir',
 'be': 'Belarusian',
 'bg': 'Bulgarian',
 'bh': 'Bihari languages',
 'bi': 'Bislama',
 'bm': 'Bambara',
 'bn': 'Bengali',
 'bo': 'Tibetan',
 'br': 'Breton',
 'bs': 'Bosnian',
 'ca': 'Catalan; Valencian',
 'ce': 'Chechen',
 'ch': 'Chamorro',
 'co': 'Corsican',
 'cr': 'Cree',
 'cs': 'Czech',
 'cu': 'Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic',
 'cv': 'Chuvash',
 'cy': 'Welsh',
 'da': 'Danish',
 'de': 'German',
 'dv': 'Divehi; Dhivehi; Maldivian',
 'dz': 'Dzongkha',
 'ee': 'Ewe',
 'el': 'Greek, Modern (1453-)',
 'en': 'English',
 'eo': 'Esperanto',
 'es': 'Spanish; Castilian',
 'et': 'Estonian',
 'eu': 'Basque',
 'fa': 'Persian',
 'ff': 'Fulah',
 'fi': 'Finnish',
 'fj': 'Fijian',
 'fo': 'Faroese',
 'fr': 'French',
 'fy'

# Open twitter json file

In [57]:
with open('./data/smallTwitter.json', 'r', encoding='utf-8') as f:
    small_twitter_data = json.load(f)

In [58]:
len(small_twitter_data['rows'])

4999

# extract key attributes

In [59]:
rows_dict = {'tweet_id': [], 
             'language': [],
#              'geo': [],
             'coordinates': []}

for i, tweet in enumerate(small_twitter_data['rows']):
    rows_dict['tweet_id'].append(tweet['id'])
    rows_dict['language'].append(tweet['doc']['metadata']['iso_language_code'])
#     rows_dict['geo'].append(tweet['doc']['geo'])
    rows_dict['coordinates'].append(tweet['doc']['coordinates'])
    
# coordiantes: (long, lat). this is what we want as it is in the same format as coordinates in sysGrid.json
# there is also 'geo' attribute in the json, which is in (lat,long) format

In [60]:
df_tweets = pd.DataFrame(rows_dict)

In [61]:
df_tweets.head()

Unnamed: 0,tweet_id,language,coordinates
0,1212161512334336000,es,
1,1212161512770437121,en,
2,1212161513361793024,en,
3,1212161513580105733,en,
4,1212161514112770048,en,


# cleaning

In [62]:
# drop any rows with None/nan language or coordinates
df_tweets = df_tweets.dropna(subset=['language', 'coordinates'])

# drop undefined or null language rows
df_tweets = df_tweets[df_tweets['language']!='und']
df_tweets = df_tweets[df_tweets['language']!='null']

In [63]:
df_tweets

Unnamed: 0,tweet_id,language,coordinates
227,1212162687049883648,en,"{'type': 'Point', 'coordinates': [151.211, -33..."
891,1212166639598395394,en,"{'type': 'Point', 'coordinates': [151.20797, -..."
1082,1212167754888953856,en,"{'type': 'Point', 'coordinates': [151.27053, -..."
1095,1212167790028705793,en,"{'type': 'Point', 'coordinates': [151.17834389..."
1301,1212168948503863296,en,"{'type': 'Point', 'coordinates': [151.072247, ..."
1339,1212169394958323712,en,"{'type': 'Point', 'coordinates': [151.20797, -..."
2093,1212173497167794180,en,"{'type': 'Point', 'coordinates': [151.0655, -3..."
2358,1212175152949022720,en,"{'type': 'Point', 'coordinates': [151.2102003,..."
3073,1212179185520627712,en,"{'type': 'Point', 'coordinates': [151.20797, -..."
3317,1212180379873898497,en,"{'type': 'Point', 'coordinates': [151.20797, -..."


In [64]:
# extract (long,lat) coordinates 
df_tweets['coordinates'] = df_tweets['coordinates'].apply(lambda x: x['coordinates'])
# map language code to language name. if language code is not in our mapping dict, replace it with None 
df_tweets['language'] = df_tweets['language'].apply(lambda x: lang_map[x] if x in lang_map else None)

# remove rows where language or coordinates is None.
df_tweets = df_tweets.dropna(subset=['language', 'coordinates'])

id, language, coordinates should be all key info we need... (don't even need tweet id maybe?)

In [65]:
df_tweets

Unnamed: 0,tweet_id,language,coordinates
227,1212162687049883648,English,"[151.211, -33.86]"
891,1212166639598395394,English,"[151.20797, -33.86751]"
1082,1212167754888953856,English,"[151.27053, -33.9005]"
1095,1212167790028705793,English,"[151.17834389, -33.93467816]"
1301,1212168948503863296,English,"[151.072247, -33.8473998]"
1339,1212169394958323712,English,"[151.20797, -33.86751]"
2093,1212173497167794180,English,"[151.0655, -33.81967]"
2358,1212175152949022720,English,"[151.2102003, -33.85999135]"
3073,1212179185520627712,English,"[151.20797, -33.86751]"
3317,1212180379873898497,English,"[151.20797, -33.86751]"


# CODE THAT DETERMINES THE CELL ID FOR EACH TWEET

## Hannan can fill this part in

My rough idea:

- for each row (these tweets all have both valid language and location info already) in `df_tweets`,
    - process the coordinates (e.g. [151.03, -33.88]) and determine which cell/grid-location id it should belong to, using the `sysGrid.json` file.
    
So after this section of code chunk, we will have another column named `cell` with values A1, A2, ..., D4.

# Count # of languages and # of tweets for each language, in each cell

Previous section should have added an extra column with cell id for each tweet.

I'll create a mock column for now and also some additional rows to add simulate other languages.

In [66]:
df_tweets.loc[df_tweets.shape[0]] = [1, 'Chinese', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'Chinese', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'Chinese', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'Chinese', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'German', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'German', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'German', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'French', [151.211, -33.86]]
df_tweets.loc[df_tweets.shape[0]] = [1, 'Malay', [151.211, -33.86]]

In [67]:
df_tweets

Unnamed: 0,tweet_id,language,coordinates
227,1212162687049883648,English,"[151.211, -33.86]"
891,1212166639598395394,English,"[151.20797, -33.86751]"
1082,1212167754888953856,English,"[151.27053, -33.9005]"
1095,1212167790028705793,English,"[151.17834389, -33.93467816]"
1301,1212168948503863296,English,"[151.072247, -33.8473998]"
1339,1212169394958323712,English,"[151.20797, -33.86751]"
2093,1212173497167794180,English,"[151.0655, -33.81967]"
2358,1212175152949022720,English,"[151.2102003, -33.85999135]"
3073,1212179185520627712,English,"[151.20797, -33.86751]"
3317,1212180379873898497,English,"[151.20797, -33.86751]"


In [68]:
df_tweets['cell'] = ['A1']*3+['D4']*5+['B3']*7+['B2']*9

In [69]:
df_tweets

Unnamed: 0,tweet_id,language,coordinates,cell
227,1212162687049883648,English,"[151.211, -33.86]",A1
891,1212166639598395394,English,"[151.20797, -33.86751]",A1
1082,1212167754888953856,English,"[151.27053, -33.9005]",A1
1095,1212167790028705793,English,"[151.17834389, -33.93467816]",D4
1301,1212168948503863296,English,"[151.072247, -33.8473998]",D4
1339,1212169394958323712,English,"[151.20797, -33.86751]",D4
2093,1212173497167794180,English,"[151.0655, -33.81967]",D4
2358,1212175152949022720,English,"[151.2102003, -33.85999135]",D4
3073,1212179185520627712,English,"[151.20797, -33.86751]",B3
3317,1212180379873898497,English,"[151.20797, -33.86751]",B3


In [84]:
# count # of languages in each cell
df_results = pd.DataFrame(df_tweets.groupby(['cell']).size()).reset_index().rename(columns={0:'#Total Tweets'})
df_results

Unnamed: 0,cell,#Total Tweets
0,A1,3
1,B2,9
2,B3,7
3,D4,5


In [143]:
# count # of occurences for each language in each cell
pd.DataFrame(df_tweets.groupby(['cell', 'language'], as_index=False).size())

Unnamed: 0,cell,language,size
0,A1,English,3
1,B2,Chinese,4
2,B2,French,1
3,B2,German,3
4,B2,Malay,1
5,B3,English,7
6,D4,English,5


# thoughts
- This notebook is quick prototype only, will need to move code into a script.
    - Feel free to criticize/change/suggest imporvements to any part of my code/thoughts.
    - Need to incorporate MPI style code after.
- Can probably use MPI to split the twitter dataset into evenly-sized chunks (e.g. if twitter data size is 1000 and we have 10 processors, each processor can process 100 rows/tweets)?
    - Need to research if 1-node, 8-cores and 2-nodes, 4-cores each need to be handled differently.
- Each processor can follow an identical procedure (shown in this notebook), and then at the end, we could aggregate all the final dataframe language counts into the format shown in the assignment spec?
    - Will just need to assign different section of the twitter dataset to each processor, and then aggregate at the end?

In [142]:
df_cell_lang_size = df_tweets.groupby(['cell', 'language'], as_index = False).size().astype('str')
df_cell_lang_size['lang-size'] = df_cell_lang_size['language'] + '-' + df_cell_lang_size['size']
df_cell_lang_size

Unnamed: 0,cell,language,size,lang-size
0,A1,English,3,English-3
1,B2,Chinese,4,Chinese-4
2,B2,French,1,French-1
3,B2,German,3,German-3
4,B2,Malay,1,Malay-1
5,B3,English,7,English-7
6,D4,English,5,English-5


In [144]:
pd.DataFrame(df_cell_lang_size.groupby(['cell'])['lang-size'].apply(','.join))

Unnamed: 0_level_0,lang-size
cell,Unnamed: 1_level_1
A1,English-3
B2,"Chinese-4,French-1,German-3,Malay-1"
B3,English-7
D4,English-5
