# Data Processing
### Process data into preferred format
* NER Dataset 
* WNUT17 dataset from CoNLL 2017

### Retrieve lists of words and tags
* Words are stored as a list of words in a text file
* Tags are stored as a list of tags in a text file

## Final format of training data
* In a text file
* Each line is a pair of 2 sequences: sequence of words - sequence of tags

## import dependencies

In [1]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### NER Dataset

In [2]:
# Load the NER corpus; the CSV is read as Latin-1.
data = pd.read_csv('./data/ner_dataset.csv', encoding='latin1')
# Forward-fill missing cells so every row carries the last value seen above
# it (the later group-by on "Sentence #" relies on that column being filled).
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")`.
data = data.ffill()
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
# inspect tags: collect the distinct labels found in the "Tag" column
unique_tags = set(data["Tag"].values)
tags = list(unique_tags)
n_tags = len(tags)
print("Tags: ", tags)

Tags:  ['B-geo', 'I-art', 'I-tim', 'B-per', 'O', 'I-per', 'B-tim', 'I-nat', 'B-eve', 'I-geo', 'B-gpe', 'B-art', 'I-gpe', 'I-eve', 'B-org', 'I-org', 'B-nat']


In [4]:
# simplify tags to match with WNUT 17 dataset
def _gpe_to_geo(tag):
    """Return ``tag`` with 'gpe' rewritten to 'geo'; other tags pass through."""
    if 'gpe' not in tag:
        return tag
    return re.sub('gpe', 'geo', tag)


data['Tag'] = data['Tag'].apply(_gpe_to_geo)

In [5]:
# concatenate sentences together
def agg_func(input):
    """Collapse one sentence group into [space-joined words, space-joined tags].

    The parameter keeps its original name (``input``) so the callable's
    signature is unchanged.
    """
    sentence_words = input['Word'].values.tolist()
    sentence_tags = input['Tag'].values.tolist()
    return [' '.join(sentence_words), ' '.join(sentence_tags)]


# One [words, tags] pair per "Sentence #" group; the unnamed result column (0)
# is renamed to 'Sentence' and selected as a Series.
grouped_data = (
    data.groupby('Sentence #')
    .apply(agg_func)
    .reset_index()
    .rename(columns={0: 'Sentence'})
    ['Sentence']
)

**Split train and test sets**

In [22]:
# Random split of the sentence pairs: 20% held out for validation.
# No seed is passed, so the split differs between runs.
train_set, val_set = train_test_split(
    grouped_data,
    test_size=0.2,
)

**Save data**

In [13]:
# Each element of train_set / val_set is a [words, tags] pair of
# space-joined strings; texts and labels go to parallel files, one
# sentence per line.
#
# File names follow one scheme, ner_<split>_<text|label>.txt, so the labels
# of a split sit next to its text file. Previously the training labels were
# written to 'ner_val_label.txt' and the validation labels to
# 'ner_label_dataset.txt'.
# The handle is named `out_file` so it does not rebind the notebook-level
# `file` variable, and the loop variable no longer reuses the name `data`.

# save training words and labels
with open('ner_train_text.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[0] for pair in train_set))
with open('ner_train_label.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[1] for pair in train_set))

# save validation words and labels
with open('ner_val_text.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[0] for pair in val_set))
with open('ner_val_label.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[1] for pair in val_set))

**Retrieve list of words**

In [6]:
# retrieve and lower words and tags
# Set comprehensions deduplicate the lower-cased values directly.
words = list({word.lower() for word in data['Word']})
tags = list({tag.lower() for tag in data['Tag']})

### WNUT 17 Dataset

In [7]:
# path to data: <current working directory>/emerging_entities_17/wnut17train.conll
wnut_dir = os.path.join(os.getcwd(), 'emerging_entities_17')
file = os.path.join(wnut_dir, 'wnut17train.conll')

In [8]:
# read data
with open(file, 'r') as f:
    raw_text = f.read()

# split paragraph: chunks are separated by a line that holds only a tab
paragraphs = raw_text.split('\n\t\n')

# tokenize each chunk into lines, then keep the first and last tab-separated
# field of every line as a (word, tag) pair
df = [
    [
        (fields[0], fields[-1])
        for fields in (line.split('\t') for line in paragraph.split('\n'))
    ]
    for paragraph in paragraphs
]

In [9]:
# simplify tags to match with NER dataset tags
def tag_edit(input):
    """Map a WNUT17 entity tag onto the NER dataset's tag names.

    The entity names below are checked in order; the first one contained in
    ``input`` is replaced by its NER counterpart and the result is returned.
    A tag containing none of them is returned unchanged.
    """
    # Order matters: only the first matching name is rewritten.
    replacements = (
        ('group', 'org'),
        ('corporation', 'org'),
        ('location', 'geo'),
        ('product', 'art'),
        ('creative-work', 'art'),
        ('person', 'per'),
    )
    for wnut_name, ner_name in replacements:
        if wnut_name in input:
            return re.sub(wnut_name, ner_name, input)
    return input
# Rewrite the tag (last item) of every (word, tag) pair, sentence by sentence.
df = [
    [(token[0], tag_edit(token[-1])) for token in sentence]
    for sentence in df
]

In [10]:
# concatenate sentences: one [space-joined words, space-joined tags] pair per sentence
written = [
    [
        ' '.join(token[0] for token in sentence),
        ' '.join(token[-1] for token in sentence),
    ]
    for sentence in df
]

**Split train and test sets**

In [20]:
# Random split of the WNUT17 sentence pairs: 20% held out for validation.
# No seed is passed, so the split differs between runs.
train_set, val_set = train_test_split(
    written,
    test_size=0.2,
)

**Save data**

In [21]:
# Each element of train_set / val_set is a [words, tags] pair of
# space-joined strings; texts and tags go to parallel files, one sentence
# per line.
#
# The validation split needs its own file names: reusing the training names
# would truncate and overwrite the training files written just above.
# The handle is named `out_file` so it does not rebind `file`, which holds
# the path of the WNUT17 source file.

# save training words and tags
with open('wnut17train_conll_text.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[0] for pair in train_set))
with open('wnut17train_conll_target.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[-1] for pair in train_set))

# save validation words and tags
with open('wnut17val_conll_text.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[0] for pair in val_set))
with open('wnut17val_conll_target.txt', 'w') as out_file:
    out_file.write('\n'.join(pair[-1] for pair in val_set))

**Retrieve words and tags**

In [11]:
# retrieve words and tags
# `written` holds [space-joined words, space-joined tags] pairs. Tokens are
# lower-cased before merging because the NER vocabulary already in `words`
# and `tags` was lower-cased when it was built; merging the raw WNUT17 values
# would leave entries that differ only by case (e.g. 'o' and 'O').
text = [token.lower() for pair in written for token in pair[0].split()]
words.extend(set(text))

text = [tag.lower() for pair in written for tag in pair[-1].split()]
tags.extend(set(text))

In [21]:
# update set of words and tags: drop duplicates introduced by merging the
# two datasets (resulting order is whatever the set iteration yields)
words, tags = (list(set(vocab)) for vocab in (words, tags))

In [22]:
# save words and tags, one entry per line
for filename, entries in (('words.txt', words), ('tags.txt', tags)):
    with open(filename, 'w') as file:
        file.write('\n'.join(entries))