In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the UTF-8 NER dataset; the path is relative to the notebook's
# working directory.
data = pd.read_csv(filepath_or_buffer="../data/ner_dataset_utf8.csv")

In [3]:
# Inspect a window of rows that straddles a sentence boundary: only the
# first token of a sentence carries a 'Sentence: N' label, the remaining
# rows of that sentence have an empty 'Sentence #' value.
first_row, n_rows = 56460, 10
data.iloc[first_row:first_row + n_rows]

Unnamed: 0,Sentence #,Word,POS,Tag
56460,,300,CD,O
56461,,°C,NNS,O
56462,,.,.,O
56463,Sentence: 2557,The,DT,O
56464,,Russians,NNS,B-gpe
56465,,use,VBP,O
56466,,a,DT,O
56467,,pencil,NN,O
56468,,.,.,O
56469,Sentence: 2558,Krygyz,JJ,B-gpe


In [6]:
def sentence_string_to_int(data, column, replace='Sentence: '):
    """
    Convert a string column of the form '<prefix><integer>' to integers.

    Inputs: dataframe, name of the string column to convert, and the text
            prefix to strip (matched literally, not as a regular
            expression).

    Outputs: a new integer Series aligned with ``data``. Rows whose value
             is missing (no prefix / no number) are set to 0. The input
             dataframe is left unmodified; assign the result back to the
             column if the conversion should be stored.
    """
    # regex=False keeps the prefix literal on every pandas version; older
    # releases otherwise interpret it as a regular expression.
    stripped = data[column].str.replace(replace, "", regex=False)

    # Missing entries stay NaN through .str.replace; map them to "0" so the
    # whole column can be cast to int.
    return stripped.fillna("0").astype(int)

In [5]:
# Replace the 'Sentence: N' labels in the 'Sentence #' column with the
# integer N. Only the first token of each sentence carries a label, so
# every other row becomes 0.
data['Sentence #'] = sentence_string_to_int(data, column='Sentence #', replace='Sentence: ')

In [10]:
def find_sentence_breaks(data, column_in, column_out='sentence_break'):
    """
    Flag the rows that are immediately followed by the start of a sentence.

    Inputs: dataframe, name of a numeric column that is > 0 only on the
            first row of each sentence, and an optional name for the
            column to create.

    Outputs: the same dataframe (modified in place) with a new boolean
             column containing True if the row holds the final token of a
             sentence that is followed by another sentence, otherwise
             False. The last row is always False because no sentence
             starts after it.
    """
    starts_sentence = np.asarray(data[column_in] > 0)

    # Row i ends a sentence when row i + 1 starts one, so shift the start
    # flags up by one row. Filling a pre-sized array keeps the padding
    # False in the final slot (np.insert(..., -1, ...) would place it
    # before the last element) and also works for empty or one-row frames.
    sentence_break = np.zeros(len(starts_sentence), dtype=bool)
    sentence_break[:-1] = starts_sentence[1:]
    data[column_out] = sentence_break

    return data

In [11]:
# Mark, in a new boolean 'sentence_break' column, each row that is
# immediately followed by the first token of the next sentence.
data = find_sentence_breaks(data, column_in='Sentence #', column_out='sentence_break')

In [12]:
# Blank out the Word and Tag of every row flagged as a sentence break so
# the exported file has a separator between sentences. The stated reason
# is that Flair's Corpus object prefers (requires?) data in the CONLL3
# format.
# NOTE(review): the flagged rows hold real tokens (the last token of a
# sentence), so this overwrites them rather than adding a separator row --
# confirm that is intended.
is_break = data['sentence_break']
for token_column in ('Word', 'Tag'):
    data[token_column] = np.where(is_break, ' ', data[token_column])

In [13]:
# Spot-check the transformed frame, including the new sentence_break column.
data.head(n=30)

Unnamed: 0,Sentence #,Word,POS,Tag,sentence_break
0,1,Thousands,NNS,O,False
1,0,of,IN,O,False
2,0,demonstrators,NNS,O,False
3,0,have,VBP,O,False
4,0,marched,VBN,O,False
5,0,through,IN,O,False
6,0,London,NNP,B-geo,False
7,0,to,TO,O,False
8,0,protest,VB,O,False
9,0,the,DT,O,False


In [14]:
# Keep only the token and its NER tag, renamed to 'text' and 'ner'.
export_names = {'Word': 'text', 'Tag': 'ner'}
df_out = data[list(export_names)].rename(columns=export_names)

In [15]:
# Write the two-column, tab-separated export next to the source CSV.
# NOTE(review): to_csv writes a header row ('text', 'ner') by default --
# confirm the downstream reader expects it.
df_out.to_csv(path_or_buf='../data/ner_dataset_utf8.txt', sep='\t', index=False)

In [16]:
# Print the current working directory; the relative '../data/...' paths used
# in this notebook are resolved against it.
!pwd

/Users/jason/svn/flair-custom
