In [1]:
import pandas as pd
import numpy as np

import re

In [58]:
df = pd.read_csv("../data/processed/twitter-data-small.csv")
sal_df = pd.read_csv("../data/processed/sal.csv")

In [59]:
state_location = dict(zip([s.lower() for s in ['Australian Capital Territory', 
                                               'New South Wales', 
                                               'Northern Territory', 
                                               'Queensland', 
                                               'South Australia', 
                                               'Tasmania', 'Victoria', 
                                               'Western Australia']], 
                         [s.lower() for s in ['ACT', 'NSW', 
                                              'NT', 'QLD', 'SA', 
                                              'TAS', 'VIC', 'WA']]))

city_location = dict(zip([s.lower() for s in ['Canberra', 'Sydney', 'Darwin', 'Brisbane', 'Adelaide', 'Hobart', 'Melbourne', 'Perth']],
                         [s.lower() for s in ['CAN', 'SYD', 'DAR', 'BRI', 'ADE', 'HOB', 'MEL', 'PER']]))

In [60]:
state_location, city_location

({'australian capital territory': 'act',
  'new south wales': 'nsw',
  'northern territory': 'nt',
  'queensland': 'qld',
  'south australia': 'sa',
  'tasmania': 'tas',
  'victoria': 'vic',
  'western australia': 'wa'},
 {'canberra': 'can',
  'sydney': 'syd',
  'darwin': 'dar',
  'brisbane': 'bri',
  'adelaide': 'ade',
  'hobart': 'hob',
  'melbourne': 'mel',
  'perth': 'per'})

In [61]:
df.head()

Unnamed: 0.1,Unnamed: 0,_id,author,location,gcc
0,0,1412193949943402498,1400239891590520835,picton nsw,1gsyd
1,1,1412194010312019986,1400239891590520835,picton nsw,1gsyd
2,2,1412198081760694273,1400239891590520835,picton nsw,1gsyd
3,3,1412185070224351232,113000840,melton vic,2gmel
4,4,1412184979849678849,1199031211,perth wa,5gper


In [62]:
def normalise_location(location: str) -> str:
    text = location.lower()
    
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r' - ', '', text)
    
    for key, value in state_location.items():
        text = re.sub(key, value, text)

    return text

In [63]:
df.location = df.agg(lambda x: normalise_location(x.location), axis=1)

In [64]:
sal_df.head()

Unnamed: 0,location,gcc,location_x
0,abbotsbury,1gsyd,abbotsbury syd
1,abbotsford nsw,1gsyd,abbotsford nsw syd
2,acacia gardens,1gsyd,acacia gardens syd
3,agnes banks,1gsyd,agnes banks syd
4,airds,1gsyd,airds syd


In [65]:
import textdistance

In [66]:
string1 = df.location[0]

In [67]:
INVALID_LOCATION = ['act australia', 
                'nsw australia', 
                'nt australia', 
                'qld Australia', 
                'sa australia', 
                'tas australia', 'vic australia', 
                'wa australia', 'australia']

def is_state_location(location):
    if location in INVALID_LOCATION:
        return True
    return False

In [68]:
def location_similarity(location, sal_df, threshold = 0.9) -> float:
    df = sal_df[['gcc', 'location_x']].copy()
    df['similarity'] = df.location_x.apply(lambda x: textdistance.jaro_winkler(location, x)) 
    rdf = df[df.similarity > threshold]
    if rdf.empty:
        return np.nan
    return rdf.loc[df.similarity.idxmax()]['gcc']

In [69]:
pd.merge(df, sal_df, left_on='location', right_on='location')

Unnamed: 0.1,Unnamed: 0,_id,author,location,gcc_x,gcc_y,location_x
0,0,1412193949943402498,1400239891590520835,picton nsw,1gsyd,1gsyd,picton nsw syd
1,1,1412194010312019986,1400239891590520835,picton nsw,1gsyd,1gsyd,picton nsw syd
2,2,1412198081760694273,1400239891590520835,picton nsw,1gsyd,1gsyd,picton nsw syd
3,3,1412185070224351232,113000840,melton vic,2gmel,2gmel,melton vic mel
4,4,1412184979849678849,1199031211,perth wa,5gper,5gper,perth wa per
...,...,...,...,...,...,...,...
83,83,1412196248912760835,1351649162,canberra act,8acte,8acte,canberra act cte
84,84,1412197623805603840,1027167886148689920,canberra act,8acte,8acte,canberra act cte
85,85,1412198117932371969,1348502962050535428,canberra act,8acte,8acte,canberra act cte
86,86,1412198454407794689,137315172,canberra act,8acte,8acte,canberra act cte


In [84]:
pd.DataFrame(df.value_counts('author'), columns=['tweet_count']).reset_index()

Unnamed: 0,author,tweet_count
0,51378153,32
1,30839139,7
2,1199031211,4
3,1147292655673434112,4
4,14156860,3
5,1400239891590520835,3
6,1348502962050535428,2
7,157568648,2
8,1002522913235300352,2
9,3306424254,2


In [38]:
%%time
df['gcc'] = df.agg(lambda x: location_similarity(x.location, sal_df=sal_df), axis=1)

CPU times: user 51.9 s, sys: 6 µs, total: 51.9 s
Wall time: 51.9 s


In [34]:
sal_df

Unnamed: 0,location,gcc,location_x
0,abbotsbury,1gsyd,abbotsbury syd
1,abbotsford nsw,1gsyd,abbotsford nsw syd
2,acacia gardens,1gsyd,acacia gardens syd
3,agnes banks,1gsyd,agnes banks syd
4,airds,1gsyd,airds syd
...,...,...,...
3389,christmas island,9oter,christmas island ter
3390,home island,9oter,home island ter
3391,jervis bay,9oter,jervis bay ter
3392,norfolk island,9oter,norfolk island ter


In [39]:
df

Unnamed: 0.1,Unnamed: 0,_id,author,location,gcc
8,8,1412191471680118784,226175896,central coast nsw,
9,9,1412192672547147778,835809553589526530,central coast nsw,
10,10,1412192775479595008,1211985166886817796,central coast nsw,
11,11,1412193056258805762,1289467509364547584,central coast nsw,
17,17,1412196428869369856,1008365059381256196,central coast nsw,
...,...,...,...,...,...
710,710,1412198117932371969,1348502962050535428,canberra act,8acte
711,711,1412198454407794689,137315172,canberra act,8acte
712,712,1412185329184821253,3306424254,canberra act,8acte
713,713,1412190755452424209,7598552,braddon canberra,8acte


In [33]:
df

Unnamed: 0.1,Unnamed: 0,_id,author,location,gcc
8,8,1412191471680118784,226175896,central coast nsw,
9,9,1412192672547147778,835809553589526530,central coast nsw,
10,10,1412192775479595008,1211985166886817796,central coast nsw,
11,11,1412193056258805762,1289467509364547584,central coast nsw,
17,17,1412196428869369856,1008365059381256196,central coast nsw,
...,...,...,...,...,...
710,710,1412198117932371969,1348502962050535428,canberra act,8acte
711,711,1412198454407794689,137315172,canberra act,8acte
712,712,1412185329184821253,3306424254,canberra act,8acte
713,713,1412190755452424209,7598552,braddon canberra,


In [117]:
sal_df['similarity'] = sal_df.agg(lambda x: location_similarity(x.location, string1), axis=1)
sal_df

Unnamed: 0,location,ste,gcc,sal,similarity
0,abbotsbury,1,1gsyd,1,0.531481
1,abbotsford nsw,1,1gsyd,1,0.493386
2,acacia gardens,1,1gsyd,1,0.504233
3,agnes banks,1,1gsyd,1,0.535354
4,airds,1,1gsyd,1,0.533333
...,...,...,...,...,...
15335,christmas island,9,9oter,9,0.595569
15336,home island,9,9oter,9,0.519360
15337,jervis bay,9,9oter,9,0.448148
15338,norfolk island,9,9oter,9,0.493386


In [136]:
sal_df[sal_df.similarity > 0.98].empty

True

In [119]:
df.location[5]

'nsw australia'

In [120]:
is_state_location(df.location[5])

True

In [164]:
'Macquarie Park Syd'.lower() in list(sal_df.location)

False

In [175]:
'Darwin'.lower() in list(sal_df.location)

True

In [162]:
'central coast nsw' in list(sal_df.location)

False

In [190]:
sal_df[sal_df['location_x'].apply(lambda x: textdistance.jaro_winkler(x, 'darwin')) > 0.9]

Unnamed: 0,location,gcc,location_x
14915,darwin,7gdar,darwin dar


In [149]:
sal_df.location.unique()

array(['abbotsbury', 'abbotsford nsw', 'acacia gardens', ...,
       'jervis bay', 'norfolk island', 'west island'], dtype=object)

In [100]:
rdf1 = pd.read_csv("../data/result/task1.csv")

In [101]:
rdf1

Unnamed: 0,gcc,_id
0,1gsyd,4
1,2gmel,35
2,1gsyd,4
3,3gbri,6
4,4gade,12
5,5gper,1537
6,8acte,568
7,1gsyd,3
8,2gmel,38
9,3gbri,9


In [102]:
rdf1.groupby('gcc').sum().reset_index()

Unnamed: 0,gcc,_id
0,1gsyd,21
1,2gmel,139
2,3gbri,59
3,4gade,147
4,5gper,6959
5,6ghob,1
6,7gdar,1
7,8acte,2531


In [109]:
df[['author', 'gcc']].groupby(['author']).nunique().reset_index()

Unnamed: 0,author,gcc
0,14156860,1
1,15011950,1
2,30839139,1
3,42571871,1
4,51378153,1
5,51404228,1
6,74676960,1
7,113000840,1
8,137315172,1
9,157568648,1
