In [1]:
# %pip install flair numpy sklearn gensim

In [2]:
import flair.datasets
corpus = flair.datasets.UD_ENGLISH()

2022-07-02 13:48:10,955 Reading data from C:\Users\sudip\.flair\datasets\ud_english
2022-07-02 13:48:10,956 Train: C:\Users\sudip\.flair\datasets\ud_english\en_ewt-ud-train.conllu
2022-07-02 13:48:10,956 Dev: C:\Users\sudip\.flair\datasets\ud_english\en_ewt-ud-dev.conllu
2022-07-02 13:48:10,956 Test: C:\Users\sudip\.flair\datasets\ud_english\en_ewt-ud-test.conllu


In [3]:
print(corpus)

Corpus: 12543 train + 2001 dev + 2077 test sentences


In [4]:
print(corpus.train[5132].text)


What followed the next day is considered by the governor and Dunn to be the pivotal moment of the energy crisis .


In [5]:
print(corpus.train[5132].to_tagged_string('pos'))


Sentence: "What followed the next day is considered by the governor and Dunn to be the pivotal moment of the energy crisis ." → ["What"/WP, "followed"/VBD, "the"/DT, "next"/JJ, "day"/NN, "is"/VBZ, "considered"/VBN, "by"/IN, "the"/DT, "governor"/NN, "and"/CC, "Dunn"/NNP, "to"/TO, "be"/VB, "the"/DT, "pivotal"/JJ, "moment"/NN, "of"/IN, "the"/DT, "energy"/NN, "crisis"/NN, "."/.]


In [6]:
upos_dictionary = corpus.make_label_dictionary(label_type='upos')
print(upos_dictionary)

2022-07-02 13:48:22,143 Computing label dictionary. Progress:


12543it [00:00, 24837.96it/s]

2022-07-02 13:48:22,652 Dictionary created for label 'upos' with 18 values: NOUN (seen 34761 times), PUNCT (seen 23620 times), VERB (seen 22946 times), PRON (seen 18589 times), ADP (seen 17730 times), DET (seen 16314 times), ADJ (seen 13167 times), AUX (seen 12440 times), PROPN (seen 12345 times), ADV (seen 9462 times), CCONJ (seen 6690 times), PART (seen 5745 times), SCONJ (seen 4554 times), NUM (seen 4119 times), X (seen 704 times), SYM (seen 698 times), INTJ (seen 694 times)
Dictionary with 18 tags: <unk>, NOUN, PUNCT, VERB, PRON, ADP, DET, ADJ, AUX, PROPN, ADV, CCONJ, PART, SCONJ, NUM, X, SYM, INTJ





In [7]:
for token in corpus.train[5132].tokens:
  print(token.tag, token.labels[1].value)


what PRON
follow VERB
the DET
next ADJ
day NOUN
be AUX
consider VERB
by ADP
the DET
governor NOUN
and CCONJ
Dunn PROPN
to PART
be AUX
the DET
pivotal ADJ
moment NOUN
of ADP
the DET
energy NOUN
crisis NOUN
. PUNCT


In [8]:
words = set()
for _corpus in [corpus.train, corpus.dev, corpus.test]:
  for sentence in _corpus:
    for token in sentence.tokens:
      words.add(token.tag)
len(words)

17022

In [9]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec


class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    gensim's own gensim.sklearn_api.W2VTransformer doesn't support out of vocabulary words,
    hence we create our own
    """

    def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        X_embeddings = np.array([self._get_embedding(words) for words in X])
        return X_embeddings

    def _get_embedding(self, words):
        valid_words = [word for word in words if word in self.model_.wv.vocab]
        if valid_words:
            embedding = np.zeros(
                (len(valid_words), self.size), dtype=np.float32)
            for idx, word in enumerate(valid_words):
                embedding[idx] = self.model_.wv[word]

            return np.mean(embedding, axis=0)
        else:
            return np.zeros(self.size)


In [10]:
gensim_word2vec_tr = GensimWord2VecVectorizer(
    size=10, min_count=3, sg=1, alpha=0.025, iter=10
)

In [11]:
X_train = []
y_train = []

for sentence in corpus.train:
  for token in sentence.tokens:
    X_train.append(token.tag)
    y_train.append(token.labels[1].value)

In [12]:
gensim_word2vec_tr.fit(X_train)

GensimWord2VecVectorizer(iter=10, min_count=3, sg=1, size=10)

In [22]:
gensim_word2vec_tr.model_.save('word2vec_model.bin')

In [13]:
X_train[5123], gensim_word2vec_tr.transform([X_train[5123]])[0], y_train[5123]

('reestablish',
 array([-0.6197692 ,  0.05817069,  0.01561817,  0.23214982,  0.12287433,
        -0.23274094,  1.009097  ,  0.19509827,  0.48299617, -0.20273222],
       dtype=float32),
 'VERB')

In [14]:
X_train_embeddings = gensim_word2vec_tr.transform(X_train)
X_train_embeddings

array([[-0.26778048,  0.06921566,  0.4347147 , ...,  0.69781482,
         0.16398181,  0.02340554],
       [ 0.01692152, -2.08052397,  0.94598621, ..., -1.69790184,
         0.52791226,  0.26770422],
       [-0.14903422,  0.06417719,  0.3466683 , ...,  0.24111137,
         0.28475243, -0.12943198],
       ...,
       [-0.5206911 , -0.21647924,  0.26263666, ...,  0.18457679,
         0.43908226, -0.17785431],
       [ 0.57867116, -0.57820368, -1.23070264, ..., -0.81327367,
        -0.67024243, -0.34505662],
       [-0.0124165 , -0.43035585, -0.9793402 , ...,  0.05271059,
        -1.1423316 , -0.2548157 ]])

In [15]:
X_test = []
y_test = []

for sentence in corpus.test:
  for token in sentence.tokens:
    X_test.append(token.tag)
    y_test.append(token.labels[1].value)
    
X_test_embeddings = gensim_word2vec_tr.transform(X_test)


In [19]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(alpha=.01, max_iter=1000)
nn.fit(X_train_embeddings, y_train)


MLPClassifier(alpha=0.01, max_iter=1000)

In [21]:
from sklearn.metrics import classification_report

y_test_pred = nn.predict(X_test_embeddings)
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

         ADJ       0.66      0.31      0.42      1786
         ADP       0.82      0.85      0.84      2029
         ADV       0.65      0.54      0.59      1126
         AUX       0.90      0.88      0.89      1507
       CCONJ       0.99      0.98      0.99       738
         DET       0.95      0.94      0.95      1897
        INTJ       0.91      0.43      0.59       120
        NOUN       0.59      0.84      0.69      4132
         NUM       0.94      0.93      0.93       541
        PART       0.74      1.00      0.85       649
        PRON       0.94      0.89      0.91      2158
       PROPN       0.89      0.89      0.89      1983
       PUNCT       0.99      1.00      0.99      3098
       SCONJ       0.58      0.47      0.52       445
         SYM       0.94      0.71      0.81       106
        VERB       0.71      0.57      0.63      2641
           X       0.75      0.47      0.58       138

    accuracy              

In [23]:
from joblib import dump, load
dump(nn, 'nn_model.joblib')

['nn_model.joblib']