# POS Tagging for Formality Transfer
This will learn how to tag parts of speech and tag on the sequences. The parts of speech and the sequence will be fed into two separate encoders and then concatenated. 

In [2]:
import sklearn_crfsuite
import nltk
import re 

from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

## Load Data

In [3]:
BASE_PATH = '../../Data'

FORMAL_PATH_TRAIN = '{}/Supervised Data/Entertainment_Music/S_Formal_EM_Train.txt'.format(BASE_PATH)
INFORMAL_PATH_TRAIN = '{}/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt'.format(BASE_PATH)

FORMAL_PATH_HOLDOUT = '{}/Supervised Data/Entertainment_Music/S_Formal_EM_ValTest.txt'.format(BASE_PATH)
INFORMAL_PATH_HOLDOUT = '{}/Supervised Data/Entertainment_Music/S_Informal_EM_ValTest.txt'.format(BASE_PATH)

POS_TRAIN_PATH = BASE_PATH + '/POS Data/pos.train.txt'
POS_TEST_PATH = BASE_PATH + '/POS Data/pos.test.txt'

In [7]:
formal = open(FORMAL_PATH_TRAIN).read()
informal = open(INFORMAL_PATH_TRAIN).read()

formal_holdout = open(FORMAL_PATH_HOLDOUT).read()
informal_holdout = open(INFORMAL_PATH_HOLDOUT).read()

In [46]:
if_corpus = [seq.split() for seq in informal.split('\n')]

if_holdout = [seq.split() for seq in informal_holdout.split('\n')]

## Load POS Data

In [10]:
from nltk.corpus import treebank

train_data = treebank.tagged_sents()[:3000]  
test_data = treebank.tagged_sents()[3000:]

print('{} training sequences'.format(len(train_data)))
print('{} testing sequence'.format(len(test_data)))

3000 training sequences
914 testing sequence


## Define Features

In [59]:
def word2features(sent, i, corpus=False):
    if not corpus:
        word = sent[i][0]

        features = {
            'prefix3': word[:3],
            'prefix2': word[:2],
            'prefix1': word[:1],
            'suffix1': word[-1:],
            'suffix2': word[-2:],
            'suffix3': word[-3:],
            'prev_word': '' if i == 0 else sent[i-1][0],
            'next_word': '' if i == len(sent) - 1 else sent[i+1][0],
            'first': i == 0,
            'last': i == len(sent) - 1
        }
    else:
        word = sent[i]
        features = {
            'prefix3': word[:3],
            'prefix2': word[:2],
            'prefix1': word[:1],
            'suffix1': word[-1:],
            'suffix2': word[-2:],
            'suffix3': word[-3:],
            'prev_word': '' if i == 0 else sent[i-1],
            'next_word': '' if i == len(sent) - 1 else sent[i+1],
            'first': i == 0,
            'last': i == len(sent) - 1
        }
    
    return features

def sent2features(sent, corpus=False):
    return [word2features(sent, i, corpus) for i in range(len(sent))]

def sent2pos(sent):
    return [pos for _, pos in sent]

## Split Data

In [31]:
X_train = [sent2features(seq) for seq in train_data] 
y_train = [sent2pos(seq) for seq in train_data]

X_test = [sent2features(seq) for seq in train_data]
y_test = [sent2pos(seq) for seq in train_data]

### Train

In [52]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [53]:
y_pred = crf.predict(X_test)

In [54]:
sorted_labels = sorted(
    crf.classes_,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           #      1.000     1.000     1.000        13
           $      1.000     1.000     1.000       469
           ,      1.000     1.000     1.000      3780
           .      1.000     1.000     1.000      2983
           :      1.000     1.000     1.000       482
          ''      1.000     1.000     1.000       602
          RB      0.980     0.974     0.977      2277
          VB      0.990     0.988     0.989      2004
         VBD      0.989     0.986     0.987      2139
         VBG      0.976     0.988     0.982      1154
         VBN      0.976     0.986     0.981      1612
         VBP      0.985     0.987     0.986      1144
         RBR      0.982     0.915     0.947       118
         RBS      1.000     0.966     0.982        29
         VBZ      0.997     0.997     0.997      1767
          CC      0.995     1.000     0.997      1762
          CD      0.997     0.998     0.998      2338
          MD      1.000    

## Predict on Data

In [65]:
if_input = [sent2features(seq, True) for seq in if_corpus]
if_hol_pos = [sent2features(seq, True) for seq in if_holdout]

In [66]:
train_preds = crf.predict(if_input)
holdout_preds = crf.predict(if_hol_pos)

## Save Data

In [68]:
with open('S_Informal_EM_Train_POS.txt', 'w') as f:
    for seq in train_preds:
        f.write(' '.join(seq) + '\n')

with open('S_Informal_EM_ValTest_POS.txt', 'w') as f:
    for seq in holdout_preds:
        f.write(' '.join(seq) + '\n')