In [15]:
#
# Get Data from Countries REST API
# June 10th, 2019 by Ali Jabbari
#

In [16]:
import requests
import json
import pandas as pd

import re
import pickle
import csv

from pandas.io.json import json_normalize    

In [17]:
data_path = '/mnt/c/users/alija/Datasets/'

**Constructing Country Currencies (En/Fr) Database**

In [18]:
def get_info(fields):
    currencies_list = []
    country_list = []
    base_url = "https://restcountries.eu/rest/v2/all?fields="
    try:
        fields_str = ';'.join(fields)
        url = base_url + fields_str
        response = (requests.get(url).text)
        response_json = json.loads(response)
        for item in response_json:
            currencies_list.extend(item['currencies'])
            country_list.append(item['name'])
        return response_json
    
    except Exception as e:
        raise e

In [19]:
data = get_info(['name','currencies', 'alpha3Code', 'translations'])
df = json_normalize(data, meta=['name', 'alpha3Code', 'translations'], record_path=['currencies'], record_prefix='currency_')

df.head()

Unnamed: 0,currency_code,currency_name,currency_symbol,name,alpha3Code,translations
0,AFN,Afghan afghani,؋,Afghanistan,AFG,"{'br': 'Afeganistão', 'pt': 'Afeganistão', 'nl..."
1,EUR,Euro,€,Åland Islands,ALA,"{'br': 'Ilhas de Aland', 'pt': 'Ilhas de Aland..."
2,ALL,Albanian lek,L,Albania,ALB,"{'br': 'Albânia', 'pt': 'Albânia', 'nl': 'Alba..."
3,DZD,Algerian dinar,د.ج,Algeria,DZA,"{'br': 'Argélia', 'pt': 'Argélia', 'nl': 'Alge..."
4,USD,United State Dollar,$,American Samoa,ASM,"{'br': 'Samoa Americana', 'pt': 'Samoa America..."


In [20]:
df_trans = df['translations'].apply(pd.Series)
df = pd.concat([df, df_trans], axis = 1).drop('translations', axis = 1)

df1 = df[['alpha3Code', 'name', 'fr', 'currency_code', 'currency_symbol', 'currency_name']]
df1.rename(columns={'name':'country_name_en', 'fr':'country_name_fr', 'currency_name':'currency_name_en'}, inplace=True)
#df1['country_name_fr'] = df1['country_name_fr'].apply(lambda x: str(x)[:-1])

df1.head()

Unnamed: 0,alpha3Code,country_name_en,country_name_fr,currency_code,currency_symbol,currency_name_en
0,AFG,Afghanistan,Afghanistan,AFN,؋,Afghan afghani
1,ALA,Åland Islands,Åland,EUR,€,Euro
2,ALB,Albania,Albanie,ALL,L,Albanian lek
3,DZA,Algeria,Algérie,DZD,د.ج,Algerian dinar
4,ASM,American Samoa,Samoa américaines,USD,$,United State Dollar


In [21]:
df1[df1['country_name_en']=='Cook Islands']

Unnamed: 0,alpha3Code,country_name_en,country_name_fr,currency_code,currency_symbol,currency_name_en
61,COK,Cook Islands,Îles Cook,NZD,$,New Zealand dollar
62,COK,Cook Islands,Îles Cook,CKD,$,Cook Islands dollar


In [22]:
df2 = pd.read_csv('../../DATA/devises.csv', names=['county_name_fr', 'currency_name_fr', 'currency_code', 'code_num'], delimiter='\t')
df2['currency_code'] = df2['currency_code'].apply(lambda x: str(x)[:-1])
df2 = df2[['currency_code', 'currency_name_fr']]
df2.head(10)

Unnamed: 0,currency_code,currency_name_fr
0,AFN,Afghani
1,ZAR,Rand
2,ALL,Lek
3,DZD,Dinar algérien
4,EUR,Euro
5,EUR,Euro
6,AOA,Kwanza
7,XCD,Dollar des Caraïbes orientales
8,na,Pas de devise universelle
9,XCD,Dollar des Caraïbes orientales


In [23]:
result = pd.merge(df1, df2, on='currency_code')
result = result.drop_duplicates().reset_index()
result.sort_values(by ='alpha3Code')

Unnamed: 0,index,alpha3Code,country_name_en,country_name_fr,currency_code,currency_symbol,currency_name_en,currency_name_fr
87,1811,ABW,Aruba,Aruba,AWG,ƒ,Aruban florin,Florin arubais
0,0,AFG,Afghanistan,Afghanistan,AFN,؋,Afghan afghani,Afghani
60,1644,AGO,Angola,Angola,AOA,Kz,Angolan kwanza,Kwanza
61,1645,AIA,Anguilla,Anguilla,XCD,$,East Caribbean dollar,Dollar des Caraïbes orientales
1,1,ALA,Åland Islands,Åland,EUR,€,Euro,Euro
37,1261,ALB,Albania,Albanie,ALL,L,Albanian lek,Lek
2,36,AND,Andorra,Andorre,EUR,€,Euro,Euro
261,2124,ARE,United Arab Emirates,Émirats arabes unis,AED,د.إ,United Arab Emirates dirham,Dirham UAE
85,1809,ARG,Argentina,Argentine,ARS,$,Argentine peso,Peso Argentin
86,1810,ARM,Armenia,Arménie,AMD,,Armenian dram,Dram Armenien


In [24]:
out_df = result[['alpha3Code','country_name_en','country_name_fr','currency_code','currency_symbol','currency_name_en','currency_name_fr']]
out_df.head()

Unnamed: 0,alpha3Code,country_name_en,country_name_fr,currency_code,currency_symbol,currency_name_en,currency_name_fr
0,AFG,Afghanistan,Afghanistan,AFN,؋,Afghan afghani,Afghani
1,ALA,Åland Islands,Åland,EUR,€,Euro,Euro
2,AND,Andorra,Andorre,EUR,€,Euro,Euro
3,AUT,Austria,Autriche,EUR,€,Euro,Euro
4,BEL,Belgium,Belgique,EUR,€,Euro,Euro


In [25]:
out_df.to_csv('currencies_en_fr.csv', index=False)

** Keywords population **

In [26]:
df = out_df
cur_set = set(df['currency_name_fr'])

In [27]:
cur_list = [str(x)[:-1] for x in cur_set]
cur_alias = ['Dollar américain', 'Renminbi', 'monnaie unique européenne', 'monnaie européenne']
cur_list += cur_alias

In [28]:
prefix = ['Nouveau']
cur_roots = []
suffix = []
for cur in cur_list:
    kw = cur.split()
    if kw[0] in prefix:
        cur_roots.append(kw[1])
        suffix.append(' '.join(kw[2:]))
    else:
        cur_roots.append(kw[0])
        suffix.append(' '.join(kw[1:]))

In [29]:
roots_vars = [''.join([cur,'s']) for cur in cur_roots]
suffix_vars = [sx if sx.startswith(("des ","d'",'de ', 'du ')) or sx.endswith(('s','x')) or sx.isupper() or len(sx)==0 else ''.join([sx, 's']) for sx in suffix]
cur_pl = [' '.join([a,b]) if len(b) > 0 else ''.join([a,b]) for a,b in zip(roots_vars, suffix_vars)]

In [30]:
cur_vars = cur_list + cur_pl + cur_roots + roots_vars
cur_vars.sort()

In [31]:
cur_codes = list(set(df['currency_code']))
cur_symbs = list(set(df['currency_symbol']))

**Load/Extract Sentences in Text Corpus**

In [32]:
# split text into sentences
def extractSentences(txtsrc):
    sentences =[]

    with open(txtsrc, "rb") as infile:
        text = pickle.load(infile)

    for item in text:
        sentences.extend(re.split(r' *[\.\?!][\'"\)\]]* *', item))

    with open("allsents.txt", "wb") as outfile:
        pickle.dump(sentences, outfile)
    
    return sentences

# Load sentences from file (e.g. 'allsents.txt')
def loadPickled(pickle_file):
    with open(pickle_file, 'rb') as fp:
        array = pickle.load(fp)
    return array

# Find corresponding elements in list
def findAll(lst, predicate):
    return [i for i in lst if predicate(i)]

#  Examine the generated annotations
def examAnns(anns):
    for item in anns:
        print(item[0], ':', [item[0][a:b] for (a,b,c) in item[1]['entities']])

In [33]:
%%time

sentences = loadPickled(data_path + 'allsents.txt')

CPU times: user 19.2 s, sys: 44.6 s, total: 1min 3s
Wall time: 1min 4s


In [34]:
sentences = [x for x in sentences if x]

** Find Mentions in sentences **

In [35]:
import annotator
cur_codes = [' USD ', ' GBP ', ' EUR ', ' CHF ', ' CNY ', ' KR ', ' JPY ', ' AUD ', ' CAD ']
cur_symbs = ['$', '€', '¥', '£']
curs = [' '+s.upper()+' ' for s in cur_vars] # spaces to avoid words containing currency related terms
keywords = set(cur_codes + cur_symbs + curs)

In [None]:
keywords

In [None]:
# A duré 2 jours avec pickle 15-17 juin

label = 'CUR'
annotations = annotator.annotateText(sentences, keywords, label)

In [None]:
%%time
with open(data_path +label+ "_annotations.txt", "wb") as outfile:
    pickle.dump(annotations, outfile)

In [None]:
# CUR_annotations = loadPickled(data_path+'CUR_annotations.txt')