## CRF model

In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support
import re
import pandas as pd
import numpy as np
import subprocess

In [2]:
data = pd.read_csv('../../../data/Task1.csv')
data = data.rename(columns={'id':'Sentence #'})
data = data.drop('Unnamed: 0',axis=1)
data = data.fillna(method="ffill")

  data = data.fillna(method="ffill")


In [3]:
def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r')
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

vec_model = loadGloveModel('../Task2/glove/glove.6B.300d.txt')

Loading Glove Model
400000  words loaded!


In [4]:
class SentenceGetter(object):  
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["words"].values.tolist(),
                                                           s["labels"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

        
words = list(set(data["words"].values))
tags = ['O','B_INC','INC','B_EXC','EXC']
# tags = list(set(data["labels"].values))
n_words = len(words)
n_tags = len(tags)

getter = SentenceGetter(data)
sentences = getter.sentences
emb_dim = len(vec_model['the'])

In [5]:
def word2features(sent, i):
    word = sent[i][0]
    features = {}
    if word in vec_model:
        feat = vec_model[word]
    else:
        feat = np.zeros(emb_dim)
    
    for i in range(emb_dim):
        features[str(i)] = feat[i]


    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [6]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
vec_model.clear()

In [20]:
crf = CRF(algorithm='lbfgs',
          max_iterations=200,
          c1=0.1,
          c2=0.1,
          all_possible_transitions=False)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [28]:
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
predictions = crf.predict(X_test)

In [29]:
pred = crf.predict(X_test)

In [30]:
tags = ['O','B_INC','INC','B_EXC','EXC']
tag2idx = {t: i for i, t in enumerate(tags)}

def write_results(result,file,ign):
    idx2tag = {i: w for w, i in tag2idx.items()}
    with open(file,'w+') as f:
        for i,lis in enumerate(result):
            line = ""
            for el in lis:
                tag = el
                if tag in ['O',ign,'B_'+ign]:
                    line += "O "
                elif tag[0] == 'B':
                    line += 'B '
                else:
                    line += 'I '
            f.write(line+'\n')
            

def get_softMetrics(pred,labels,ign):

    write_results(pred,'pred.txt',ign)
    write_results(labels,'labels.txt',ign)
    out = subprocess.check_output(['./a.out']).decode('utf-8').split('\n')
    rows = []
    for item in  out:
        item = item.split(" ")
        if len(item) != 2:continue 
        rows.append({'Proportional':item[0],'Binary':item[1]})
    
    return (pd.DataFrame(rows,index=['precision','recall','F1']))

inc = get_softMetrics(pred,y_test,'INC')
exc = get_softMetrics(pred,y_test,'EXC')
display(inc)
display(exc)

Unnamed: 0,Proportional,Binary
precision,0.336954,0.380531
recall,0.534733,0.681034
F1,0.413406,0.48825


Unnamed: 0,Proportional,Binary
precision,0.263156,0.298113
recall,0.550939,0.716981
F1,0.356182,0.421126
