# Process data into preferred format
* NER Dataset 
* WNUT17 dataset (W-NUT 2017 emerging entities shared task, CoNLL format)

## Final format of training data
* In a text file
* Each line is a tab-separated pair of sequences: the space-separated words of a sentence, then the space-separated tags for those words

## Import dependencies

In [16]:
import os
import re
import pandas as pd
import numpy as np

### NER Dataset

In [2]:
# Load the token-level corpus (columns seen below: 'Sentence #', 'Word', 'POS', 'Tag').
data = pd.read_csv('./ner_dataset.csv', encoding='latin1')
# Forward-fill missing cells from the nearest preceding row.
# `DataFrame.ffill()` replaces `fillna(method="ffill")`, which is deprecated in
# current pandas releases.
# NOTE(review): presumably 'Sentence #' is only populated on the first token of
# each sentence in the raw CSV - confirm against the file.
data = data.ffill()
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
# Gather the distinct labels present in the 'Tag' column, count them and show them.
unique_tags = set(data["Tag"].values)
tags = [*unique_tags]
n_tags = len(tags)
print("Tags: ", tags)

Tags:  ['I-eve', 'I-tim', 'I-art', 'B-org', 'B-art', 'B-geo', 'O', 'B-gpe', 'B-per', 'I-org', 'I-per', 'I-geo', 'B-nat', 'I-gpe', 'B-eve', 'B-tim', 'I-nat']


In [4]:
# Fold the 'gpe' entity type into 'geo'; labels without 'gpe' come back unchanged.
data['Tag'] = data['Tag'].apply(lambda label: label.replace('gpe', 'geo'))

In [5]:
def agg_func(input):
    """Collapse one sentence's rows into a single ``words<TAB>tags`` string.

    ``input`` is the sub-frame holding the rows of one sentence. Its 'Word'
    values and its 'Tag' values are each joined with single spaces, and the
    two resulting strings are joined with a tab.

    The parameter name is kept from the original lambda for compatibility,
    even though it shadows the ``input`` builtin.
    """
    words = ' '.join(input['Word'].values.tolist())
    labels = ' '.join(input['Tag'].values.tolist())
    return '\t'.join([words, labels])


# sort=False keeps the sentences in the order they appear in `data`. The default
# sort orders the group labels as strings, which would place "Sentence: 10"
# directly after "Sentence: 1" and scramble the corpus order.
grouped_data = (
    data.groupby('Sentence #', sort=False)
    .apply(agg_func)
    .reset_index()
    .rename(columns={0: 'Sentence'})['Sentence']
)

In [7]:
# Spot-check the aggregated "words<TAB>tags" string stored under index label 0.
grouped_data[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .\tO O O O O O B-geo O O O O O B-geo O O O O O B-geo O O O O O'

In [8]:
# Write one "words<TAB>tags" line per sentence, newline-separated, with no
# trailing newline after the last sentence.
# - A single write() replaces writelines(): writelines() expects an iterable of
#   strings, so handing it one joined string made it iterate character by character.
# - The encoding is explicit so the output does not depend on the platform default.
with open('ner_dataset.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(grouped_data))

### WNUT 17 Dataset

In [9]:
# Location of the WNUT17 training file, anchored at the current working directory.
file = os.path.join(
    os.getcwd(),
    os.path.join('emerging_entities_17', 'wnut17train.conll'),
)

In [10]:
# Parse the CoNLL-style file into `df`: a list of sentences, each a list of
# (word, tag) tuples taken from the first and last tab-separated field of a line.
df = []
sentence = []
# Explicit encoding so decoding does not depend on the platform default.
# NOTE(review): the corpus is assumed to be UTF-8 - confirm against the file.
with open(file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank or whitespace-only line (e.g. a lone tab) ends the current
            # sentence. Skipping it also avoids the empty ('', '') token that
            # splitting the raw text produced at the end of the file.
            if sentence:
                df.append(sentence)
                sentence = []
            continue
        fields = line.rstrip('\n').split('\t')
        sentence.append((fields[0], fields[-1]))
# Flush the last sentence when the file does not end with a separator line.
if sentence:
    df.append(sentence)

In [11]:
def tag_edit(input):
    """Rename the entity type inside a tag string.

    The table below is scanned top to bottom. The first entry whose source
    name occurs in ``input`` has every occurrence replaced by its target
    name, and the result is returned straight away. A string containing
    none of the source names is returned unchanged.
    """
    replacements = (
        ('group', 'org'),
        ('corporation', 'org'),
        ('location', 'geo'),
        ('product', 'art'),
        ('creative-work', 'art'),
        ('person', 'per'),
    )
    for source_name, target_name in replacements:
        if source_name in input:
            return input.replace(source_name, target_name)
    return input
# Rebuild every sentence with its tags passed through tag_edit; words are kept as-is.
df = [
    [(pair[0], tag_edit(pair[-1])) for pair in sentence]
    for sentence in df
]

In [12]:
# Display the parsed sentences: a list of lists of (word, tag) tuples.
df

[[('@paulwalk', 'O'),
  ('It', 'O'),
  ("'s", 'O'),
  ('the', 'O'),
  ('view', 'O'),
  ('from', 'O'),
  ('where', 'O'),
  ('I', 'O'),
  ("'m", 'O'),
  ('living', 'O'),
  ('for', 'O'),
  ('two', 'O'),
  ('weeks', 'O'),
  ('.', 'O'),
  ('Empire', 'B-geo'),
  ('State', 'I-geo'),
  ('Building', 'I-geo'),
  ('=', 'O'),
  ('ESB', 'B-geo'),
  ('.', 'O'),
  ('Pretty', 'O'),
  ('bad', 'O'),
  ('storm', 'O'),
  ('here', 'O'),
  ('last', 'O'),
  ('evening', 'O'),
  ('.', 'O')],
 [('From', 'O'),
  ('Green', 'O'),
  ('Newsfeed', 'O'),
  (':', 'O'),
  ('AHFA', 'B-org'),
  ('extends', 'O'),
  ('deadline', 'O'),
  ('for', 'O'),
  ('Sage', 'O'),
  ('Award', 'O'),
  ('to', 'O'),
  ('Nov', 'O'),
  ('.', 'O'),
  ('5', 'O'),
  ('http://tinyurl.com/24agj38', 'O')],
 [('Pxleyes', 'B-org'),
  ('Top', 'O'),
  ('50', 'O'),
  ('Photography', 'O'),
  ('Contest', 'O'),
  ('Pictures', 'O'),
  ('of', 'O'),
  ('August', 'O'),
  ('2010', 'O'),
  ('...', 'O'),
  ('http://bit.ly/bgCyZ0', 'O'),
  ('#photography', 'O')],


In [13]:
# Serialise each sentence as "<space-joined words>\t<space-joined tags>".
written = []
for sentence in df:
    words = ' '.join(pair[0] for pair in sentence)
    labels = ' '.join(pair[-1] for pair in sentence)
    written.append('\t'.join((words, labels)))

In [14]:
# Display the serialised "words<TAB>tags" strings, one per sentence.
written

["@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening .\tO O O O O O O O O O O O O O B-geo I-geo I-geo O B-geo O O O O O O O O",
 'From Green Newsfeed : AHFA extends deadline for Sage Award to Nov . 5 http://tinyurl.com/24agj38\tO O O O B-org O O O O O O O O O O',
 'Pxleyes Top 50 Photography Contest Pictures of August 2010 ... http://bit.ly/bgCyZ0 #photography\tB-org O O O O O O O O O O O',
 'today is my last day at the office .\tO O O O O O O O O',
 "4Dbling 's place til monday , party party party . &lt; 3\tB-per O O O O O O O O O O O",
 "watching the VMA pre-show again lol it was n't even a good show the first time ... so bored !\tO O B-art O O O O O O O O O O O O O O O O O",
 '27 followers ! 30 followers is my goal for today !\tO O O O O O O O O O O',
 "This is the 2nd hospital ive been in today , but ive just seen a doctor who was an older version of justin :' )\tO O O O O O O O O O O O O O O O O O O O O 

In [15]:
# Write one "words<TAB>tags" line per sentence, newline-separated, with no
# trailing newline after the last sentence.
# - The encoding is explicit: a platform default that is not UTF-8 can fail on
#   non-ASCII characters in the text.
# - A single write() replaces writelines(), which was iterating the joined
#   string character by character.
# - The entries of `written` are already strings, so no str() conversion is needed.
with open('wnut17train_conll.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(written))