# Word Embedding for Sequence Processing

**The goal of this practical is to use pre-trained word embedding for adressing the sequence prediction tasks studied in week 2: PoS and chunking.**

In [1]:
import numpy as np
import gensim.downloader as api
from gensim.models import KeyedVectors

## 0) Loading PoS (or chunking) datasets (small or large)

In [73]:
def load(filename):
    listeDoc = list()
    with open(filename, "r") as f:
        doc = list()
        for ligne in f:
            #print "l : ",len(ligne)," ",ligne
            if len(ligne) < 2: # fin de doc
                listeDoc.append(doc)
                doc = list()
                continue
            mots = ligne.replace("\n","").split(" ")
            # mettre mots[2] à la place de mots[1] pour le chuncking
            doc.append((mots[0],mots[1]))
    return listeDoc

In [74]:
bSmall = True

if(bSmall==True):
    filename = "ressources/conll2000/chtrain.txt" 
    filenameT = "ressources/conll2000/chtest.txt" 

else:
    # Larger corpus .
    filename = "ressources/conll2000/train.txt" 
    filenameT = "ressources/conll2000/test.txt" 

alldocs = load(filename)
alldocsT = load(filenameT)

print(len(alldocs)," docs read")
print(len(alldocsT)," docs (T) read")

823  docs read
77  docs (T) read


# 1) Word embedding for classifying each word

### Pre-trained word2vec

In [76]:
import gensim.downloader as api
bload = True
fname = "word2vec-google-news-300"
sdir = "ressources/" # Change

if(bload==True):
    wv_pre_trained = KeyedVectors.load(sdir+fname+".dat")
else:    
    wv_pre_trained = api.load(fname)
    wv_pre_trained.save(sdir+fname+".dat")

### Some token on the dataset are missing, we will encode them with a random vector
This is sub-optimal, but we need to do something

In [77]:
def randomvec():
    default = np.random.randn(300)
    default = default  / np.linalg.norm(default)
    return default

In [78]:
np.random.seed(seed=10) # seed the randomness

dictadd = dict()
cpt=0
for d in alldocs:
    cpt+=1
    print(" ****** Document ******",cpt)
    for (x,pos) in d:
        if (not (x in wv_pre_trained) and not (x in dictadd)):
            print(x," not in WE, adding it with random vector")
            dictadd[x] = randomvec()
            
for d in alldocsT:
    cpt+=1
    print(" ****** TEST Document ******",cpt)
    for (x,pos) in d:
        if (not (x in wv_pre_trained) and not (x in dictadd)):
            print(x," not in WE, adding it with random vector")
            dictadd[x] = randomvec()
#             wv_pre_trained.add_vector(x,randomvec())

 ****** Document ****** 1
's  not in WE, adding it with random vector
a  not in WE, adding it with random vector
to  not in WE, adding it with random vector
747  not in WE, adding it with random vector
.  not in WE, adding it with random vector
 ****** Document ****** 2
200  not in WE, adding it with random vector
so-called  not in WE, adding it with random vector
 ****** Document ****** 3
,  not in WE, adding it with random vector
and  not in WE, adding it with random vector
 ****** Document ****** 4
793  not in WE, adding it with random vector
of  not in WE, adding it with random vector
 ****** Document ****** 5
 ****** Document ****** 6
 ****** Document ****** 7
59  not in WE, adding it with random vector
 ****** Document ****** 8
 ****** Document ****** 9
SHEARSON  not in WE, adding it with random vector
 ****** Document ****** 10
42  not in WE, adding it with random vector
Balcor  not in WE, adding it with random vector
 ****** Document ****** 11
 ****** Document ****** 12
 ******

 ****** Document ****** 498
financial-services  not in WE, adding it with random vector
 ****** Document ****** 499
 ****** Document ****** 500
 ****** Document ****** 501
department-store  not in WE, adding it with random vector
 ****** Document ****** 502
Sigoloff  not in WE, adding it with random vector
 ****** Document ****** 503
80  not in WE, adding it with random vector
 ****** Document ****** 504
 ****** Document ****** 505
77  not in WE, adding it with random vector
 ****** Document ****** 506
 ****** Document ****** 507
 ****** Document ****** 508
62  not in WE, adding it with random vector
2.5  not in WE, adding it with random vector
 ****** Document ****** 509
 ****** Document ****** 510
 ****** Document ****** 511
 ****** Document ****** 512
 ****** Document ****** 513
1929  not in WE, adding it with random vector
 ****** Document ****** 514
 ****** Document ****** 515
 ****** Document ****** 516
409  not in WE, adding it with random vector
 ****** Document ****** 517
 ***

### Add the (key-value) 'random' word embeddings for missing inputs

In [79]:
## YOUR CODE HERE
wv_pre_trained.add_vectors(list(dictadd.keys()),list(dictadd.values())) 

### Store the train and test datasets: a word embedding for each token in the sequences

In [80]:
wvectors  = []
for doc in alldocs:
    for token, tag in doc:
        t_vec = wv_pre_trained[token]
        wvectors.append(t_vec)
    
wvectorsT  = []
for doc in alldocsT:
    for token, tag in doc:
        try:
            t_vec = wv_pre_trained[token]
            wvectorsT.append(t_vec)
        except:
            print(token)
            pass
    
len(wvectors), len(wvectorsT)

(19172, 1896)

### Check the size of your train/test datasets

### Collecting train/test labels

In [81]:
# Labels train/test

buf2 = [[pos for m,pos in d ] for d in alldocs]
cles = []
[cles.extend(b) for b in buf2]
cles = np.unique(np.array(cles))
cles2ind = dict(zip(cles,range(len(cles))))
nCles = len(cles)
print(nCles," keys in the dictionary")

labels  = np.array([cles2ind[pos] for d in alldocs for (m,pos) in d ])
#np.array([cles2ind[pos] for (m,pos) in d for d in alldocs])
labelsT  = np.array([cles2ind.setdefault(pos,len(cles)) for d in alldocsT for (m,pos) in d ])

print(len(cles2ind)," keys in the dictionary")

42  keys in the dictionary
43  keys in the dictionary


In [82]:
print(labels.shape)
print(labelsT.shape)

(19172,)
(1896,)


### Train a Logistic Regression Model! 
**An compare performances to the baseline and sequence models (HMM/CRF) or practical 2a**

In [83]:
import warnings

# Ignore DeprecationWarning
warnings.simplefilter("ignore")

In [84]:
## YOUR CODE HERE
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Create a stratified cross-validation object
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the logistic regression model
model = LogisticRegression()

# Define the metrics for scoring
scoring = {
    'accuracy': make_scorer(accuracy_score),
}

# Perform cross-validation with different metrics for scoring
cv_results = cross_validate(model, np.array(wvectors), labels, cv=cv, scoring=scoring)

# Print test results
print('Accuracy:', cv_results['test_accuracy'])
# print('Precision:', cv_results['test_precision'])
# print('Recall:', cv_results['test_recall'])
# print('F1 score:', cv_results['test_f1'])
# print('ROC AUC score:', cv_results['test_roc_auc'])

Accuracy: [0.9243807  0.92125163 0.92018779 0.92749087 0.92357851]


In [85]:
model.fit(np.array(wvectors),labels )

In [86]:
model.score(np.array(wvectorsT), labelsT)

0.9129746835443038

### HMM

In [31]:
# allx: list of observation sequences 
# allq: list os state sequences 
# N: nb states
# K: nb observations

def learnHMM(allx, allq, N, K, initTo1=True):
    """
    Computes the parameters of the hmm model to learn
    """
    if initTo1:
        eps = 1e-3 # You can play with this regularization parameter 
        A = np.ones((N,N))*eps
        B = np.ones((N,K))*eps
        Pi = np.ones(N)*eps
    else:
        A = np.zeros((N,N))
        B = np.zeros((N,K))
        Pi = np.zeros(N)
    # Iterate over all sequences and their respective observations
    for x,q in zip(allx,allq):
        # Compute Pi vector based on first words proba
        Pi[int(q[0])] += 1
        for i in range(len(q)-1):
            A[int(q[i]),int(q[i+1])] += 1
            B[int(q[i]),int(x[i])] += 1
        B[int(q[-1]),int(x[-1])] += 1 # last transition
    A = A/np.maximum(A.sum(1).reshape(N,1),1) # normalisation
    B = B/np.maximum(B.sum(1).reshape(N,1),1) # normalisation
    Pi = Pi/Pi.sum()
    return Pi , A, B

def viterbi(x,Pi,A,B):
    """
    Finds the state sequence that maximizes the likelihood of the observed sequence
    """
    T = len(x)
    N = len(Pi)
    logA = np.log(A)
    logB = np.log(B)
    logdelta = np.zeros((N,T))
    psi = np.zeros((N,T), dtype=int)
    S = np.zeros(T)
    logdelta[:,0] = np.log(Pi) + logB[:,int(x[0])]
    #forward
    for t in range(1,T):
        logdelta[:,t] = (logdelta[:,t-1].reshape(N,1) + logA).max(0) + logB[:,int(x[t])]
        psi[:,t] = (logdelta[:,t-1].reshape(N,1) + logA).argmax(0)
    # backward
    logp = logdelta[:,-1].max()
    S[T-1] = logdelta[:,-1].argmax()
    for i in range(2,T+1):
        S[int(T-i)] = psi[int(S[int(T-i+1)]),int(T-i+1)]
    return S, logp #, delta, psi

In [32]:
# alldocs etant issu du chargement des données
# la mise en forme des données est fournie ici
# afin de produire des analyses qualitative, vous devez malgré tout comprendre le fonctionnement des dictionnaires

buf = [[m for m,pos in d ] for d in alldocs]
mots = []
[mots.extend(b) for b in buf]
mots = np.unique(np.array(mots))
nMots = len(mots)+1 # mot inconnu

mots2ind = dict(zip(mots,range(len(mots))))
mots2ind["UUUUUUUU"] = len(mots)

buf2 = [[pos for m,pos in d ] for d in alldocs]
cles = []
[cles.extend(b) for b in buf2]
cles = np.unique(np.array(cles))
cles2ind = dict(zip(cles,range(len(cles))))

nCles = len(cles)

print(nMots,nCles," in the dictionary")

# mise en forme des données
allx  = [[mots2ind[m] for m,pos in d] for d in alldocs]
allxT = [[mots2ind.setdefault(m,len(mots)) for m,pos in d] for d in alldocsT]

allq  = [[cles2ind[pos] for m,pos in d] for d in alldocs]
allqT = [[cles2ind.setdefault(pos,len(cles)) for m,pos in d] for d in alldocsT]

4570 17  in the dictionary


In [33]:
pi , A, B = learnHMM(allx, allq, nCles, nMots, initTo1=True)
seq, log_probs = [],[]
for d in allx:
    s,p = viterbi(d,pi, A, B)
    seq.append(s)
    log_probs.append(p)

In [34]:
# HMM decoding and performances evaluation
# Evaluate test performances
cpt=0
pred = 0
y_hat,y = [],[]
for doc,state in zip(allxT,allqT) :
    for p_pred, p_real in zip(viterbi(doc,pi,A,B)[0],state):
        if p_pred == p_real :
            y_hat.append(p_pred)
            y.append(p_real)
            cpt+=1
            
accuracy_score(y, y_hat)

1.0

## CRF

In [87]:
# !pip install python-crfsuite
import nltk
from nltk.tag.crf import CRFTagger
tagger = CRFTagger()
tagger.train(alldocs, './out/crf.model') # training
tagger.evaluate(alldocsT)

0.9071729957805907

In [88]:
# perceptron
from nltk.tag.perceptron    import PerceptronTagger
tagger = PerceptronTagger(load=False)
tagger.train(alldocs)
tagger.evaluate(alldocsT)

0.9161392405063291

# 2) Using word embedding with CRF

## We will define the following features functions for CRF

In [89]:
def features_wv(sentence, index):
    """
        This function encodes the pre-trained word vectors for each word in the sentence 
        using a pre-trained word embedding model. 
        The function takes in a sentence and an index, and returns a dictionary of 300 features,
        where each feature corresponds to a dimension in the word vector.
    """
    v = wv_pre_trained.get_vector(sentence[index])
    d = {'f'+str(i):v[i] for i in range(300)}
    return d

def features_structural(sentence, index):
    """
    This function encodes various structural features of each word in the sentence, 
    such as its position in the sentence, whether it is capitalized, whether it 
    contains a hyphen, etc. 
    The function takes in a sentence and an index, and returns a dictionary of 16 features.

    """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
     ## We will define the following features functions for CRF## We will define the following features functions for CRF   'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
def features_wv_plus_structural(sentence, index):
    """
    This function combines the features from the previous two functions. 
    It encodes both the pre-trained word vectors and the structural features 
    for each word in the sentence. 
    The function takes in a sentence and an index, and returns a dictionary 
    of 316 features (300 from word vectors + 16 from structural features).
    """
    v = wv_pre_trained.get_vector(sentence[index]) 
    d = {'f'+str(i):v[i] for i in range(300)}

    return {**d, **features_structural(sentence, index)}

In [90]:
# features_wv(alldocs[0], 0)

sentence = ' '.join(str(elem) for elem, tag in alldocs[0])
print(sentence.split()[5])
features_wv_plus_structural(sentence.split(), 5)

unit


{'f0': -0.10595703,
 'f1': 0.00029945374,
 'f2': 0.041503906,
 'f3': -0.014404297,
 'f4': -0.16015625,
 'f5': 0.103515625,
 'f6': 0.007385254,
 'f7': -0.06201172,
 'f8': 0.18945312,
 'f9': -0.106933594,
 'f10': 0.1484375,
 'f11': -0.14941406,
 'f12': 0.00042915344,
 'f13': -0.27539062,
 'f14': -0.17578125,
 'f15': 0.016845703,
 'f16': -0.083496094,
 'f17': -0.3125,
 'f18': -0.109375,
 'f19': -0.076660156,
 'f20': -0.021362305,
 'f21': -0.20703125,
 'f22': -0.12597656,
 'f23': -0.07910156,
 'f24': 0.0077209473,
 'f25': -0.1015625,
 'f26': -0.20898438,
 'f27': 0.087402344,
 'f28': -0.27539062,
 'f29': 0.015136719,
 'f30': 0.068847656,
 'f31': -0.13574219,
 'f32': -0.14941406,
 'f33': 0.030151367,
 'f34': 0.056152344,
 'f35': 0.072265625,
 'f36': 0.02331543,
 'f37': -0.08886719,
 'f38': -0.018310547,
 'f39': 0.103515625,
 'f40': 0.011657715,
 'f41': -0.19042969,
 'f42': -0.033447266,
 'f43': 0.19824219,
 'f44': -0.09326172,
 'f45': -0.21875,
 'f46': 0.060546875,
 'f47': 0.040283203,
 'f48

## [Question]: explain what the 3 feature functions encode and what their differences are

The three feature functions encode different types of information about each word in a sentence:

1. **features_wv(sentence, index):** This function encodes the pre-trained word vectors for each word in the sentence using a pre-trained word embedding model. The function takes in a sentence and an index, and returns a dictionary of 300 features, where each feature corresponds to a dimension in the word vector.

2. **features_structural(sentence, index):** This function encodes various structural features of each word in the sentence, such as its position in the sentence, whether it is capitalized, whether it contains a hyphen, etc. The function takes in a sentence and an index, and returns a dictionary of 16 features.

3. **features_wv_plus_structural(sentence, index):** This function combines the features from the previous two functions. It encodes both the pre-trained word vectors and the structural features for each word in the sentence. The function takes in a sentence and an index, and returns a dictionary of 316 features (300 from word vectors + 16 from structural features).

**The main difference** between the three feature functions is the type of information they encode. The first function encodes semantic information about each word, the second encodes syntactic and structural information, and the third combines both semantic and structural information. These different types of information can be useful for different types of natural language processing tasks. For example, the semantic information encoded by the first function might be useful for tasks like word similarity or document classification, while the structural information encoded by the second function might be useful for tasks like part-of-speech tagging or named entity recognition. The third function combines both types of information, which might be useful for tasks that require both semantic and structural information, such as semantic role labeling or coreference resolution.

### You can now train a CRF with the 3 features and analyse the results

**feature_func:** 

The function that extracts features for each token of a sentence. This function should take
            2 parameters: tokens and index which extract features at index position from tokens list. See the build in
            _get_features function for more detail.

In [91]:
# Feature func 1

import nltk
from nltk.tag.crf import CRFTagger
tagger = CRFTagger(feature_func=features_wv)
tagger.train(alldocs, './out/crf_f1.model') # training
tagger.evaluate(alldocsT)

0.9140295358649789

In [92]:
# Feature func 2

import nltk
from nltk.tag.crf import CRFTagger
tagger = CRFTagger(feature_func=features_structural)
tagger.train(alldocs, './out/crf_f2.model') # training
tagger.evaluate(alldocsT)

0.929324894514768

In [None]:
# Feature func 3

import nltk
from nltk.tag.crf import CRFTagger
tagger = CRFTagger(feature_func=features_wv_plus_structural)
tagger.train(alldocs, './out/crf_f3.model') # training
tagger.evaluate(alldocsT)