# Process data into preferred format
* NER Dataset 
* CorNell Conference
* UPenn POS Dataset

## import dependencies

In [95]:
import os
import re
import pandas as pd
import numpy as np

### NER Dataset

In [96]:
data = pd.read_csv('./ner_dataset.csv', encoding = 'latin1')
data = data.fillna(method="ffill")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [97]:
tags = list(set(data["Tag"].values))
n_tags = len(tags);
print("Tags: ", tags)

Tags:  ['O', 'B-per', 'B-tim', 'I-nat', 'B-geo', 'I-per', 'I-org', 'I-art', 'B-nat', 'B-org', 'B-gpe', 'I-geo', 'I-gpe', 'B-eve', 'I-tim', 'B-art', 'I-eve']


In [99]:
data['Tag'] = data['Tag'].apply(lambda x : re.sub('gpe', 'geo', x) if 'gpe' in x else x)

In [123]:
agg_func = lambda input : ';'.join([','.join([w,t]) for w, p, t in zip(input['Word'].values.tolist(), input['POS'].values.tolist(), input['Tag'].values.tolist())])
grouped_data = data.groupby('Sentence #').apply(agg_func).reset_index().rename(columns = {0 : 'Sentence'})['Sentence']

In [138]:
grouped_data

0        Thousands,O;of,O;demonstrators,O;have,O;marche...
1        Iranian,B-geo;officials,O;say,O;they,O;expect,...
2        Helicopter,O;gunships,O;Saturday,B-tim;pounded...
3        They,O;left,O;after,O;a,O;tense,O;hour-long,O;...
4        U.N.,B-geo;relief,O;coordinator,O;Jan,B-per;Eg...
                               ...                        
47954    Opposition,O;leader,O;Mir,O;Hossein,B-per;Mous...
47955    On,O;Thursday,B-tim;,,O;Iranian,B-geo;state,O;...
47956    Following,O;Iran,B-geo;'s,O;disputed,O;June,B-...
47957    Since,O;then,O;,,O;authorities,O;have,O;held,O...
47958    The,O;United,B-org;Nations,I-org;is,O;praising...
Name: Sentence, Length: 47959, dtype: object

In [125]:
with open('ner_dataset.txt', 'w') as file:
    file.writelines('\n'.join(grouped_data))

### WNUT 17 Dataset

In [104]:
file = os.path.join(os.getcwd(), 'emerging_entities_17', 'wnut17train.conll')

In [118]:
with open(file, 'r') as f:
    df = f.read()
    df = df.split('\n\t\n') # split pargraph
    df = [d.split('\n') for d in df] # tokenize
    df = [[(w.split('\t')[0], w.split('\t')[-1]) for w in d] for d in df] # split word and tag

In [120]:
def tag_edit(input):
    if 'group' in input:
        return re.sub('group', 'org', input)
    elif  'corporation' in input:
        return re.sub('corporation', 'org', input)
    elif 'location' in input:
        return re.sub('location', 'geo', input)
    elif 'product' in input:
        return re.sub('product', 'art', input)
    elif 'creative-work' in input:
        return re.sub('creative-work', 'art', input)
    elif 'person' in input:
        return re.sub('person', 'per', input)
    return input
df = [[(w[0], tag_edit(w[-1])) for w in d] for d in df]

In [126]:
df

[[('@paulwalk', 'O'),
  ('It', 'O'),
  ("'s", 'O'),
  ('the', 'O'),
  ('view', 'O'),
  ('from', 'O'),
  ('where', 'O'),
  ('I', 'O'),
  ("'m", 'O'),
  ('living', 'O'),
  ('for', 'O'),
  ('two', 'O'),
  ('weeks', 'O'),
  ('.', 'O'),
  ('Empire', 'B-geo'),
  ('State', 'I-geo'),
  ('Building', 'I-geo'),
  ('=', 'O'),
  ('ESB', 'B-geo'),
  ('.', 'O'),
  ('Pretty', 'O'),
  ('bad', 'O'),
  ('storm', 'O'),
  ('here', 'O'),
  ('last', 'O'),
  ('evening', 'O'),
  ('.', 'O')],
 [('From', 'O'),
  ('Green', 'O'),
  ('Newsfeed', 'O'),
  (':', 'O'),
  ('AHFA', 'B-org'),
  ('extends', 'O'),
  ('deadline', 'O'),
  ('for', 'O'),
  ('Sage', 'O'),
  ('Award', 'O'),
  ('to', 'O'),
  ('Nov', 'O'),
  ('.', 'O'),
  ('5', 'O'),
  ('http://tinyurl.com/24agj38', 'O')],
 [('Pxleyes', 'B-org'),
  ('Top', 'O'),
  ('50', 'O'),
  ('Photography', 'O'),
  ('Contest', 'O'),
  ('Pictures', 'O'),
  ('of', 'O'),
  ('August', 'O'),
  ('2010', 'O'),
  ('...', 'O'),
  ('http://bit.ly/bgCyZ0', 'O'),
  ('#photography', 'O')],


In [128]:
written = [';'.join([','.join([w[0], w[-1]]) for w in d]) for d in df]

In [129]:
written

["@paulwalk,O;It,O;'s,O;the,O;view,O;from,O;where,O;I,O;'m,O;living,O;for,O;two,O;weeks,O;.,O;Empire,B-geo;State,I-geo;Building,I-geo;=,O;ESB,B-geo;.,O;Pretty,O;bad,O;storm,O;here,O;last,O;evening,O;.,O",
 'From,O;Green,O;Newsfeed,O;:,O;AHFA,B-org;extends,O;deadline,O;for,O;Sage,O;Award,O;to,O;Nov,O;.,O;5,O;http://tinyurl.com/24agj38,O',
 'Pxleyes,B-org;Top,O;50,O;Photography,O;Contest,O;Pictures,O;of,O;August,O;2010,O;...,O;http://bit.ly/bgCyZ0,O;#photography,O',
 'today,O;is,O;my,O;last,O;day,O;at,O;the,O;office,O;.,O',
 "4Dbling,B-per;'s,O;place,O;til,O;monday,O;,,O;party,O;party,O;party,O;.,O;&lt;,O;3,O",
 "watching,O;the,O;VMA,B-art;pre-show,O;again,O;lol,O;it,O;was,O;n't,O;even,O;a,O;good,O;show,O;the,O;first,O;time,O;...,O;so,O;bored,O;!,O",
 '27,O;followers,O;!,O;30,O;followers,O;is,O;my,O;goal,O;for,O;today,O;!,O',
 "This,O;is,O;the,O;2nd,O;hospital,O;ive,O;been,O;in,O;today,O;,,O;but,O;ive,O;just,O;seen,O;a,O;doctor,O;who,O;was,O;an,O;older,O;version,O;of,O;justin,O;:',O;),O"

In [144]:
for d in written:
    print(type(d))
    break

<class 'str'>


In [146]:
with open('wnut17train_conll.txt', 'w') as file:
    file.writelines('\n'.join([str(d) for d in written]))