In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [2]:
import pandas as pd


In [3]:
cols = ['WORDS', 'POP', 'POP_2', 'NE'] 
train_pdf = pd.read_csv('./../data/CONLL_ENG_NER_2003/train_conll2003.txt', header=None, delim_whitespace=True)
train_pdf.columns = cols
print("Train shape")
train_pdf.shape

test_pdf = pd.read_csv('./../data/CONLL_ENG_NER_2003/test_conll2003.txt', header=None , delim_whitespace=True)
test_pdf.columns = cols
print("Test shape")
test_pdf.shape



Train shape


(204567, 4)

Test shape


(46666, 4)

In [4]:
train_pdf.head()
test_pdf.head()

Unnamed: 0,WORDS,POP,POP_2,NE
0,-DOCSTART-,-X-,-X-,O
1,EU,NNP,B-NP,B-ORG
2,rejects,VBZ,B-VP,O
3,German,JJ,B-NP,B-MISC
4,call,NN,I-NP,O


Unnamed: 0,WORDS,POP,POP_2,NE
0,-DOCSTART-,-X-,-X-,O
1,SOCCER,NN,B-NP,O
2,-,:,O,O
3,JAPAN,NNP,B-NP,B-LOC
4,GET,VB,B-VP,O


### Filter out the Document starter tokens

In [5]:
train_pdf['WORDS'] = train_pdf['WORDS'].apply(lambda x : str(x).strip())
train_nodoc_pdf = train_pdf.loc[~train_pdf['WORDS'].isin(['-DOCSTART-'])]
train_nodoc_pdf.head()
print("Train shape")
train_nodoc_pdf.shape


Unnamed: 0,WORDS,POP,POP_2,NE
1,EU,NNP,B-NP,B-ORG
2,rejects,VBZ,B-VP,O
3,German,JJ,B-NP,B-MISC
4,call,NN,I-NP,O
5,to,TO,B-VP,O


Train shape


(203621, 4)

In [6]:
test_pdf['WORDS'] = test_pdf['WORDS'].apply(lambda x : str(x).strip())
test_nodoc_pdf = test_pdf.loc[~test_pdf['WORDS'].isin(['-DOCSTART-'])]
test_nodoc_pdf.head()
print("Test shape")
test_nodoc_pdf.shape


Unnamed: 0,WORDS,POP,POP_2,NE
1,SOCCER,NN,B-NP,O
2,-,:,O,O
3,JAPAN,NNP,B-NP,B-LOC
4,GET,VB,B-VP,O
5,LUCKY,NNP,B-NP,O


Test shape


(46435, 4)

### Check the non-words in train set

In [8]:
test_nodoc_pdf.loc[test_nodoc_pdf['POP']==':']

Unnamed: 0,WORDS,POP,POP_2,NE
2,-,:,O,O
331,:,:,O,O
423,-,:,O,O
625,:,:,O,O
692,-,:,O,O
...,...,...,...,...
45989,-,:,O,O
46163,-,:,O,O
46182,:,:,O,O
46208,:,:,O,O


### Check the Parts of speech types

In [9]:
test_nodoc_pdf.loc[test_nodoc_pdf['POP'].str.len()==1]['POP'].unique()

array([':', ',', '.', 'O', '(', ')', '$'], dtype=object)

In [10]:
test_nodoc_pdf.loc[test_nodoc_pdf['POP'].str.len()==2]['POP'].unique()

array(['NN', 'VB', 'IN', 'DT', 'CD', 'JJ', 'CC', 'TO', 'RB', 'RP', "''",
       'WP', 'MD', 'LS', 'FW', 'EX', 'UH'], dtype=object)

In [11]:
test_nodoc_pdf.loc[test_nodoc_pdf['POP'].str.len()==3]['POP'].unique()

array(['NNP', 'VBD', 'VBP', 'PRP', 'VBG', 'NNS', 'JJS', 'WRB', 'WDT',
       'VBN', 'POS', 'VBZ', 'JJR', 'SYM', 'RBS', 'RBR', 'WP$', 'PDT'],
      dtype=object)

### Check the Name entity types

In [12]:
test_nodoc_pdf.loc[test_nodoc_pdf['NE'].str.len()>0]['NE'].unique()

array(['O', 'B-LOC', 'B-PER', 'I-PER', 'I-LOC', 'B-MISC', 'I-MISC',
       'B-ORG', 'I-ORG'], dtype=object)

### Seperate necessary columns

In [7]:
train_cola_pdf = train_nodoc_pdf[['WORDS', 'NE']]
train_cola_pdf.head()

Unnamed: 0,WORDS,NE
1,EU,B-ORG
2,rejects,O
3,German,B-MISC
4,call,O
5,to,O


In [9]:
test_cola_pdf = test_nodoc_pdf[['WORDS', 'NE']]
test_cola_pdf.head()

Unnamed: 0,WORDS,NE
1,SOCCER,O
2,-,O
3,JAPAN,B-LOC
4,GET,O
5,LUCKY,O


In [14]:
train_cola_pdf.to_csv('../data/CONLL_ENG_NER_2003/ner_only/train_cola.csv',index=False, encoding='utf-8')


In [15]:
test_cola_pdf.to_csv('../data/CONLL_ENG_NER_2003/ner_only/test_cola.csv',index=False, encoding='utf-8')