In [2]:
from nltk.stem.porter import PorterStemmer
from typing import List
import pandas as pd
ps = PorterStemmer()

In [16]:
def load_to_pandas(filename: str):
    tokens = []
    poses = []
    bios = []
    with open(filename, 'r') as f:
        for line in f:
            s: List[str] = line.split()
            t, p, b = ('', '', '')
            if len(s) == 3:
                t, p, b = s
            elif len(s) == 2:
                t, p = s
            tokens.append(t)
            poses.append(p)
            if filename.endswith('-chunk'):
                bios.append(b)
        print(len(bios))
    return pd.DataFrame.from_dict({'token':tokens, 'pos':poses, 'bio':bios} if bios else {'token':tokens, 'pos':poses})

PHASE = {
    'training': {
        'in_name': 'WSJ_02-21.pos-chunk',
        'out_name': 'training.feature'
    },
    'development': {
        'in_name': 'WSJ_24.pos',
        'out_name': 'test.feature'
    },
    'test': {
        'in_name': 'WSJ_23.pos',
        'out_name': 'test-real.feature'
    }
}
CURRENT_PHASE = 'training'
training = load_to_pandas(PHASE[CURRENT_PHASE]['in_name'])
print(training.shape)
training.sample()

1979760
(1979760, 3)


Unnamed: 0,token,pos,bio
1612458,British,JJ,I-NP


In [10]:
feature_in = training.iloc[:, :2]
feature_in['stem'] = feature_in['token'].apply(ps.stem).shift(1)
# feature_in['stem_ps'] = feature_in
# feature_in['stem-bi'] = feature_in['stem'].shift(1)
# feature_in['stem-bi'] = feature_in[['stem-bi', 'stem']]
check_point = feature_in.copy()
feature_in.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,token,pos,stem
0,,,
1,The,DT,
2,economy,NN,the


In [11]:
headers: List[str] = check_point.columns.tolist()[:3]
print(headers)
check_point.head(15)

['token', 'pos', 'stem']


Unnamed: 0,token,pos,stem
0,,,
1,The,DT,
2,economy,NN,the
3,'s,POS,economi
4,temperature,NN,'s
5,will,MD,temperatur
6,be,VB,will
7,taken,VBN,be
8,from,IN,taken
9,several,JJ,from


In [12]:
feature_in = check_point.copy()
MEMORY_SPAN = 1
headers = ['pos']
for i in range(MEMORY_SPAN):
    for header in headers:
        n_headers = [header + '_' + str(i + 1), header + '_-' + str(i + 1)]
        feature_in[n_headers[0]] = check_point[header].shift(-i - 1, fill_value='@@')
        feature_in[n_headers[1]] = check_point[header].shift(+i + 1, fill_value='@@')
        feature_in[header + '_bi-' + str(i + 1)] = feature_in[[n_headers[0], header]]\
            .apply(lambda x: '+'.join(x), axis=1)
        feature_in[header + '_bi+' + str(i + 1)] = feature_in[[header, n_headers[1]]]\
            .apply(lambda x: '+'.join(x), axis=1)
        feature_in['tri'] = feature_in[[n_headers[0], header, n_headers[1]]].apply(lambda x: '+'.join(x), axis=1)
feature_in.head(3)     

Unnamed: 0,token,pos,stem,pos_1,pos_-1,pos_bi-1,pos_bi+1,tri
0,,,,DT,@@,DT+,+@@,DT++@@
1,The,DT,,NN,,NN+DT,DT+,NN+DT+
2,economy,NN,the,POS,DT,POS+NN,NN+DT,POS+NN+DT


In [13]:
# feature_in['bio - 1'] = training['bio'].shift(1, fill_value='')
feature_in['bio - 1'] = 'prev=@@'
if CURRENT_PHASE == 'training':
    feature_in['bio'] = training['bio']
feature_in.head(3)

Unnamed: 0,token,pos,stem,pos_1,pos_-1,pos_bi-1,pos_bi+1,tri,bio - 1
0,,,,DT,@@,DT+,+@@,DT++@@,prev=@@
1,The,DT,,NN,,NN+DT,DT+,NN+DT+,prev=@@
2,economy,NN,the,POS,DT,POS+NN,NN+DT,POS+NN+DT,prev=@@


In [14]:
feature_in[feature_in['token'] == ''] = ''
feature_in

Unnamed: 0,token,pos,stem,pos_1,pos_-1,pos_bi-1,pos_bi+1,tri,bio - 1
0,,,,,,,,,
1,The,DT,,NN,,NN+DT,DT+,NN+DT+,prev=@@
2,economy,NN,the,POS,DT,POS+NN,NN+DT,POS+NN+DT,prev=@@
3,'s,POS,economi,NN,NN,NN+POS,POS+NN,NN+POS+NN,prev=@@
4,temperature,NN,'s,MD,POS,MD+NN,NN+POS,MD+NN+POS,prev=@@
...,...,...,...,...,...,...,...,...,...
34195,here,RB,them,IN,PRP,IN+RB,RB+PRP,IN+RB+PRP,prev=@@
34196,with,IN,here,PRP,RB,PRP+IN,IN+RB,PRP+IN+RB,prev=@@
34197,us,PRP,with,.,IN,.+PRP,PRP+IN,.+PRP+IN,prev=@@
34198,.,.,us,,PRP,+.,.+PRP,+.+PRP,prev=@@


In [9]:
feature_in.to_csv(PHASE[CURRENT_PHASE]['out_name'], sep='\t', header=False, index=False)
print('saved file ' + PHASE[CURRENT_PHASE]['out_name'])


saved file test.feature


Run this manually:
```bash
sed -i 's/\t\t\t\t*$//g' *.feature
```
