In [1]:
# Libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import spacy
from spacy import displacy
import re

In [2]:
nlp = spacy.load('en_core_web_sm')

## Sample Cleaning

In [3]:
def remove_urls(text):
    return re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)

def remove_numeric(text):
    return re.sub(r'\d+', '', text) 

def remove_extra_spaces(text):

    lines = text.split("\n")
    non_empty_lines = [line.strip() for line in lines if line.strip() != ""]

    return (' '.join(non_empty_lines))

def remove_text_between_brackets(text):
    return re.sub("[\(\[].*?[\)\]]", "", text)
    # return re.sub(r'\([^)]*\)', '', text)

In [4]:
def get_sample_text_from_file():
        with open(os.path.join('..','Data','train','0725_20180410_nFCT10YSBS_1.txt')) as reader:
                text = reader.read()
                text = remove_extra_spaces(text)
                # text = remove_numeric(text)
                text = remove_urls(text)
                text = remove_text_between_brackets(text)

        return text

## Create training data using PhraseMatcher 

In [5]:
df_train = pd.read_csv(os.path.join('..','Data','train_labels.csv'))
df_train.head(10)

Unnamed: 0,doc_id,phrase,ric
0,0749_20180406_nL4N1RI40U_1,Romanian leu,RON
1,0749_20180406_nL4N1RI40U_1,Polish zloty,PLN
2,0749_20180406_nL4N1RI40U_1,Serbian dinar,RSD
3,0749_20180406_nL4N1RI40U_1,CZK,CZK
4,0749_20180406_nL4N1RI40U_1,CZK,CZK
5,0749_20180406_nL4N1RI40U_1,HUF,HUF
6,0749_20180406_nL4N1RI40U_1,dinar,RSD
7,0749_20180406_nL4N1RI40U_1,leu,RON
8,0749_20180406_nL4N1RI40U_1,forint,HUF
9,0749_20180406_nL4N1RI40U_1,zloty,PLN


In [6]:
df_lexicon = pd.read_csv(os.path.join('..','Data','lexicon.csv'))
df_lexicon.head(10)

Unnamed: 0,phrase,ric
0,leu,RON
1,Romanian leu,RON
2,Romania's leu,RON
3,Polish Zloty,PLN
4,zloty,PLN
5,Polish zloty,PLN
6,Zloty,PLN
7,Poland's currency,PLN
8,PLN,PLN
9,Serbian dinar,RSD


In [7]:
df_lexicon.query("ric=='CNY'")

Unnamed: 0,phrase,ric
102,Chinese yuan,CNY
103,yuan,CNY
104,Chinese currency 's,CNY
105,CNY,CNY
106,Chinese Yuan,CNY
107,Yuan,CNY
108,China's yuan,CNY
109,Chinese currency,CNY
110,Cny,CNY
111,China's yuan,CNY


In [8]:
df_lexicon.query("ric=='CAD'")

Unnamed: 0,phrase,ric
96,Canada's currency,CAD
97,CAD,CAD
98,Canada 's currency,CAD
99,loonie,CAD
100,Canadian dollar,CAD
101,Canadian Dollar,CAD


In [9]:
df_lexicon.query("ric=='USD'")

Unnamed: 0,phrase,ric
27,U.S. counterpart,USD
28,U.S. dlr,USD
29,US Dollar,USD
30,dollars,USD
31,U.S. currency,USD
32,cent,USD
33,U.S. dollar,USD
34,US cents,USD
35,dollar's,USD
36,USD,USD


In [10]:
# Group the lexicon by ric to get the phrases for each currency
df_all_phrases = df_lexicon.groupby('ric')['phrase'].agg(lambda x: list(x)).reset_index()

In [11]:
df_all_phrases.head()

Unnamed: 0,ric,phrase
0,AED,[UAE dirham]
1,AGN,"[Argentina's peso, Argentina peso]"
2,ALL,[all]
3,ARS,"[Argentine peso, Argentina’s peso]"
4,AUD,"[AUD, Aussie dollar, Australian, Australian do..."


In [12]:
# Now we will create patterns for matching texts in the corpus with the phrases for each currency type. 

In [13]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Create matcher for each currency
for index, row in df_all_phrases.iterrows():
    patterns = []

    # phrase_patterns = [ [ {'ORTH': phrase } , {'POS':{"IN": ["PROPN", "NOUN"]}} , {'DEP': {"NOT_IN": ["pobj"]}} ] for phrase in row['phrase'] ]
    phrase_patterns = [ [ {'ORTH': phrase , 'DEP': {"NOT_IN": ["POBJ"]},  'POS':{"IN": ["PROPN", "NOUN"]} } ]  for phrase in row['phrase'] ]
    # phrase_patterns.append({'POS':{"IN": ["PROPN", "NOUN"]}})
    # phrase_patterns['POS'] = {"IN": ["PROPN", "NOUN"]}
    # phrase_patterns['DEP'] = {"NOT_IN": ["POBJ"]}
    print(phrase_patterns)
    matcher.add(row['ric'],None,*phrase_patterns)

OS': {'IN': ['PROPN', 'NOUN']}}]]
[[{'ORTH': 'CHF', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Swiss francs', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'SWISS FRANC', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'franc', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Swiss currency', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Swiss franc', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Swiss Franc', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}]]
[[{'ORTH': 'Chile peso', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Chilean peso', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': "Chile's peso", 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'CLP', 'DEP': {'NOT_IN': ['POBJ']}, 'POS': {'IN': ['PROPN', 'NO

In [14]:
sample_text = 'The unilateral repudiation of the multilateral agreement with Iran and the threat by the US to impose sanctions of countries that continue to do business with Iran carries a much broader threat to the Euro and Euro pairs.'

# matcher = Matcher(nlp.vocab)

# test_pat = [{'ORTH': 'EURO'}], [{'ORTH': 'euro'}], [{'ORTH': 'Euro'},{'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Eur'}], [{'ORTH': 'EUR'}]
# test_pat2 = [[{'ORTH': 'EURO'}, {'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'euro'}, {'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Euro'}, {'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'Eur'}, {'POS': {'IN': ['PROPN', 'NOUN']}}], [{'ORTH': 'EUR'}, {'POS': {'IN': ['PROPN', 'NOUN']}}]]
# matcher.add('euro',None,*patterns)
doc = nlp(sample_text)

for match_id, start, end in matcher(doc):
    print(f"Matches for {nlp.vocab.strings[match_id]} in text : {doc[start:end]}")


Matches for USD in text : US
Matches for EUR in text : Euro
Matches for EUR in text : Euro


In [15]:
def parse_train_data(text):
    doc = nlp(text)
    detections = [(doc[start:end].start_char, doc[start:end].end_char, nlp.vocab.strings[idx] ) for idx, start, end in matcher(doc)]
    return (doc.text, {'entities': detections})

In [16]:
parse_train_data(sample_text)

('The unilateral repudiation of the multilateral agreement with Iran and the threat by the US to impose sanctions of countries that continue to do business with Iran carries a much broader threat to the Euro and Euro pairs.',
 {'entities': [(89, 91, 'USD'), (201, 205, 'EUR'), (210, 214, 'EUR')]})

In [18]:
sample_text = get_sample_text_from_file()
parse_train_data(sample_text)

iffs rather overshadowed what was a weaker than expected US payroll. USD/INR crept upwards thereafter, and saw minor attempts to break 65.000 handle as European session started, but settled between 64.9500-65.0000 at last look. 1M NDFs dipped to a 2-week low of 65.0100 before bouncing back to 65.1600 at last sight. Market Psychology USD/CNH - USD/CNH may move up again on renewed trade fears. Based on current situation, we still see some potential tests of the next support at 6.2740 on USD/CNH. For USD/CNY, further tests of the support at 6.2800 may come in, unless trade tensions escalate again. USD/SGD - We see key support at 1.3073 likely to hold as US-China trade tensions ratchet up. USD/IDR - Trade tariff rhetoric likely to weigh on sentiment which should keep the minor support at 13753 intact, and a break of 13800 remains key for bulls to take charge. USD/INR - The pair may test support at 64.720, but we do not see risk to strong support at 64.6200. 09 Apr 2018 08:21 GMT - Asia Dai

In [23]:
sample_text[1533:1538]

'Cable'