# **Loading pre-trained embeddings, creating features for each dataset, and saving the features and tags for further processing in the NER task.**

1. Importing Libraries and data



In [None]:
# Mount Google Drive in the Colab runtime; every data, embedding and pickle
# path used later in this notebook lives under /content/drive/MyDrive/ner.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1. numpy is imported as np for numerical operations.

2. pickle is imported as pkl for serializing and deserializing Python objects.

3. KeyedVectors from gensim.models.keyedvectors is imported to load and work with pre-trained word embeddings.

4. gc module provides an interface to the garbage collector for controlling how the garbage collector interacts with an application.

5. Pandas module is used for data preprocessing in a dataframe

In [None]:
#CreateFeaturesWithWordEmbedding

import numpy as np
import pickle as pkl
from gensim.models.keyedvectors import KeyedVectors
import gc
import pandas as pd

Load the training file (`train.txt`, space-separated, no header row) into a dataframe with the columns Token, POS, Phrase and NER

In [None]:
# Read the space-separated training file; it has no header row, so the four
# column names are supplied explicitly.
data = pd.read_csv('/content/drive/MyDrive/ner/train.txt', encoding= 'unicode_escape',sep=" ",header=None, names=["Token", "POS","Phrase","NER"])
# dropna returns a new frame rather than modifying in place, so the result
# must be assigned back for the incomplete rows to actually be removed.
data = data.dropna(axis=0)
data.head()

Unnamed: 0,Token,POS,Phrase,NER
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O




---



**Below are all the POS tags in the dataset. Since we are only concerned with nouns for NER task, we remove all non-noun rows**

NNP - Proper noun, singular

VBZ - Verb, third person singular present

JJ - Adjective

NN - Noun, singular or mass

TO - to

VB - Verb, base form

. - Sentence-ending punctuation

CD - Cardinal number

DT - Determiner

VBD - Verb, past tense

IN - Preposition or subordinating conjunction

PRP - Personal pronoun

NNS - Noun, plural

VBP - Verb, non-3rd person singular present

MD - Modal (could, will, would, etc.)

VBN - Verb, past participle

POS - Possessive ending (parent's, children's)

JJR - Adjective, comparative

O - Pronoun

RB - Adverb

, - Comma

FW - Foreign word

CC - Coordinating conjunction

WDT - Wh-determiner (which, that)

( - Left parenthesis

) - Right parenthesis

: - Colon

PRP$ - Possessive pronoun (my, your, our)

RBR - Adverb, comparative

VBG - Verb, gerund or present participle

EX - Existential there

WP - Wh-pronoun (what, who)

WRB - Wh-adverb (how, where)

-X- - Unknown

$ - Dollar sign

RP - Particle

NNPS - Proper noun, plural

SYM - Symbol

RBS - Adverb, superlative

UH - Interjection

PDT - Predeterminer (all, both)

'' - Closing quotation mark

LS - List item marker

JJS - Adjective, superlative

WP$ - Possessive wh-pronoun (whose)

NN|SYM - Noun, singular or mass or symbol





In [None]:
data['POS'].unique()

NNP: Proper noun, singular

NN: Noun, singular or mass

NNPS: Proper noun, plural

NNS: Noun, plural

NN|SYM: Noun (singular or mass) or symbol

In [None]:
# Keep only the rows whose POS tag is one of the noun tags listed above.
wanted_POS_Tags = ['NNP', 'NN', 'NNPS', 'NNS', 'NN|SYM']
# .copy() makes the filtered frame independent of the original, so the
# in-place column assignments done by later cells (one-hot NER tags, token
# vectors) do not raise SettingWithCopyWarning.
data = data[data['POS'].isin(wanted_POS_Tags)].copy()
data.head()

Unnamed: 0,Token,POS,Phrase,NER
0,EU,NNP,B-NP,B-ORG
3,call,NN,I-NP,O
7,lamb,NN,I-NP,O
9,Peter,NNP,B-NP,B-PER
10,Blackburn,NNP,I-NP,I-PER


Create One hot encoding tags for the columns

In [None]:
def create_OHE(data,col,num_classes=10):
  """Replace data[col] with one-hot lists and return the label -> index map.

  Indices are assigned in order of first appearance of each distinct value
  in data[col]. The column is overwritten in place.

  Args:
    data: DataFrame holding the column to encode (modified in place).
    col: name of the column to encode.
    num_classes: length of every one-hot list. The last position
      (num_classes - 1) is the fallback slot for values missing from the
      mapping. Defaults to 10, the width previously hard-coded here.

  Returns:
    dict mapping each distinct value of data[col] to its one-hot index.

  Raises:
    IndexError: if the column has more distinct values than num_classes.
  """
  un=data[col].unique()
  labels_={label:index for index,label in enumerate(un)}
  if len(labels_)>num_classes:
    # Fail before touching the frame, with a message that names the cause,
    # instead of a bare "list assignment index out of range" mid-loop.
    raise IndexError(
        "column %r has %d distinct values but num_classes is %d"
        % (col,len(labels_),num_classes))
  fallback=num_classes-1
  tags=[]
  for value in data[col]:
    tag=[0]*num_classes
    tag[labels_.get(value,fallback)]=1
    tags.append(tag)
  data[col]=tags
  return labels_

In [None]:
# One-hot encode the NER column of the training frame in place and keep the
# label -> index mapping; the preprocess class below reuses this mapping.
ner_labels = create_OHE(data,'NER')

In [None]:
ner_labels

{'B-ORG': 0,
 'O': 1,
 'B-PER': 2,
 'I-PER': 3,
 'B-LOC': 4,
 'I-ORG': 5,
 'B-MISC': 6,
 'I-MISC': 7,
 'I-LOC': 8}

In [12]:
data.head()

Unnamed: 0,Token,POS,Phrase,NER
0,EU,NNP,B-NP,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,call,NN,I-NP,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
7,lamb,NN,I-NP,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
9,Peter,NNP,B-NP,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
10,Blackburn,NNP,I-NP,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


Load pretrained word2vec model

In [13]:
def load_word_embeddings(embeddings_file_path):
    """Load word vectors stored in the binary word2vec format.

    Args:
        embeddings_file_path: path of the binary word2vec file to read.

    Returns:
        Whatever KeyedVectors.load_word2vec_format returns for that file.
    """
    return KeyedVectors.load_word2vec_format(
        embeddings_file_path,
        binary=True,
    )

In [14]:
# Load the pre-trained word2vec vectors from Drive once; trained_model is
# reused by every vectorisation step further down.
trained_model = load_word_embeddings("/content/drive/MyDrive/ner/GoogleNews-vectors-negative300.bin")

Each word vector in the model has 300 dimensions (a 1-D array of length 300)

In [15]:
# Length of each word vector; also the length of the all-zero vector that
# word2vec_token substitutes for tokens missing from the model.
WORD_DIMENSIONS = 300

Define a function to vectorize tokens using word2vec

In [16]:
def word2vec_token(model,data,WORD_DIMENSIONS):
  """Replace every entry of data['Token'] with its word vector, in place.

  Args:
    model: mapping-like object indexed by token string that raises KeyError
      for unknown tokens (here the loaded word2vec vectors).
    data: DataFrame with a 'Token' column (modified in place).
    WORD_DIMENSIONS: length of the zero vector used for unknown tokens.

  Returns:
    None; the 'Token' column of data is overwritten.
  """
  vectors=[]
  for token in data['Token']:
    try:
      vector=model[str(token)]
    except KeyError:
      # Out-of-vocabulary token: use a float32 zero vector so the fallback has
      # the same dtype as the pre-trained vectors (the previous integer zeros
      # produced a column of mixed int64/float32 arrays). A fresh array is
      # built per row so rows never share storage.
      vector=np.zeros(WORD_DIMENSIONS,dtype=np.float32)
    vectors.append(vector)
  data['Token']=vectors

In [17]:
# Overwrite the Token column of the training frame with word vectors
# (in place; the function returns nothing).
word2vec_token(trained_model,data,WORD_DIMENSIONS)

In [18]:
data['NER']

0         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
7         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
9         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
10        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
                       ...              
204556    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
204558    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
204560    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
204562    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
204564    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: NER, Length: 68882, dtype: object

In [19]:
data['Token']

0         [0.037353516, -0.203125, 0.21289062, 0.2441406...
3         [-0.11816406, 0.08154297, 0.15039062, 0.031005...
7         [-0.080078125, 0.15625, -0.35351562, 0.2216796...
9         [0.3046875, 0.12402344, -0.26953125, -0.101562...
10        [-0.088378906, -0.05444336, -0.025146484, 0.21...
                                ...                        
204556    [-0.13085938, 0.024902344, -0.15039062, 0.0247...
204558    [-0.059326172, -0.007873535, -0.18261719, 0.06...
204560    [-0.37890625, -0.014526367, 0.13378906, -0.092...
204562    [-0.28515625, -0.04272461, 0.09375, 0.12597656...
204564    [0.23535156, 0.13378906, -0.043945312, 0.32421...
Name: Token, Length: 68882, dtype: object

Split the preprocessed data into the **input column** (the word vector stored in `Token`; the POS and Phrase tags are not used as features) and the **output column** (the one-hot NER tag)

In [20]:
# Feature column(s) and target column(s), kept as lists so that indexing
# with them yields DataFrames rather than Series.
input_cols = ['Token']
output_cols = ['NER']
in_data, out_data = data[input_cols], data[output_cols]

In [21]:
in_data

Unnamed: 0,Token
0,"[0.037353516, -0.203125, 0.21289062, 0.2441406..."
3,"[-0.11816406, 0.08154297, 0.15039062, 0.031005..."
7,"[-0.080078125, 0.15625, -0.35351562, 0.2216796..."
9,"[0.3046875, 0.12402344, -0.26953125, -0.101562..."
10,"[-0.088378906, -0.05444336, -0.025146484, 0.21..."
...,...
204556,"[-0.13085938, 0.024902344, -0.15039062, 0.0247..."
204558,"[-0.059326172, -0.007873535, -0.18261719, 0.06..."
204560,"[-0.37890625, -0.014526367, 0.13378906, -0.092..."
204562,"[-0.28515625, -0.04272461, 0.09375, 0.12597656..."


In [22]:
out_data

Unnamed: 0,NER
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
7,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
9,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
10,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
...,...
204556,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
204558,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
204560,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
204562,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


Save pkl files for train data as train_x.pkl for in data and train_y.pkl for out data

In [23]:
# Write features and targets to Drive. The `with` blocks close (and flush)
# each file as soon as its dump finishes; the bare open() calls used before
# left both handles open.
with open("/content/drive/MyDrive/ner/train_x.pkl", 'wb') as train_x_file:
    pkl.dump(in_data, train_x_file)
with open("/content/drive/MyDrive/ner/train_y.pkl", 'wb') as train_y_file:
    pkl.dump(out_data, train_y_file)
gc.collect()

3539

Create a class with the above functions for convenient preprocessing of the test and validation data

In [24]:
class preprocess:
  """Apply the training-set preprocessing to another split and pickle it.

  Bundles the notebook's steps (read file, keep noun rows, one-hot encode
  NER, embed tokens, dump to pickle) so they can be run on the test and
  validation files with the label mapping obtained from the training data.
  """

  # Label -> one-hot index mapping produced by create_OHE on the training data.
  ner_labels=ner_labels

  # Pre-trained word vectors loaded earlier in the notebook.
  model=trained_model

  # Length of each word vector and of the zero vector for unknown tokens.
  word_dim=300

  # Length of each one-hot NER list; the last slot is the fallback for labels
  # that are absent from the mapping.
  num_classes=10

  def file_preprocessing(self,file_name):
    """Read a space-separated split file and keep only its noun rows.

    Args:
      file_name: path of the file to read (no header row).

    Returns:
      A new DataFrame with columns Token, POS, Phrase, NER, restricted to
      rows without missing values whose POS is one of the noun tags.
    """
    data = pd.read_csv(file_name, encoding= 'unicode_escape',sep=" ",header=None, names=["Token", "POS","Phrase","NER"])
    # dropna is not in-place; without the assignment no rows were removed.
    data = data.dropna(axis=0)
    wanted_POS_Tags = ['NNP', 'NN', 'NNPS', 'NNS', 'NN|SYM']
    # .copy() so the column assignments in create_OHE / word2vec_token act on
    # an independent frame instead of a slice (avoids SettingWithCopyWarning).
    return data[data['POS'].isin(wanted_POS_Tags)].copy()

  def create_OHE(self,data,col,labels_):
    """Overwrite data[col] in place with one-hot lists based on labels_.

    Args:
      data: DataFrame holding the column to encode (modified in place).
      col: name of the column to encode.
      labels_: dict mapping a label to its one-hot index; labels missing
        from it are sent to the last position.
    """
    fallback=self.num_classes-1
    tags=[]
    for label in data[col]:
      tag=[0]*self.num_classes
      tag[labels_.get(label,fallback)]=1
      tags.append(tag)
    data[col]=tags

  def word2vec_token(self,model,data,WORD_DIMENSIONS):
    """Overwrite data['Token'] in place with the word vector of each token.

    Args:
      model: mapping-like object indexed by token string that raises
        KeyError for unknown tokens.
      data: DataFrame with a 'Token' column (modified in place).
      WORD_DIMENSIONS: length of the zero vector used for unknown tokens.
    """
    vectors=[]
    for token in data['Token']:
      try:
        vector=model[str(token)]
      except KeyError:
        # float32 zeros keep the fallback's dtype aligned with the pre-trained
        # vectors instead of mixing in integer arrays.
        vector=np.zeros(WORD_DIMENSIONS,dtype=np.float32)
      vectors.append(vector)
    data['Token']=vectors

  def save_pkl(self,file_name,x,y):
    """Preprocess one split file and pickle its features and targets.

    Args:
      file_name: path of the raw split file.
      x: destination path for the pickled input frame (Token column).
      y: destination path for the pickled output frame (NER column).
    """
    data=self.file_preprocessing(file_name)
    self.create_OHE(data,'NER',self.ner_labels)
    self.word2vec_token(self.model,data,self.word_dim)
    input_cols=['Token']
    output_cols=['NER']
    in_data=data[input_cols]
    out_data=data[output_cols]
    # Context managers close each file right after its dump; the previous
    # bare open() calls never closed the handles.
    with open(x, 'wb') as x_file:
      pkl.dump(in_data, x_file)
    with open(y, 'wb') as y_file:
      pkl.dump(out_data, y_file)
    gc.collect()

In [25]:
# Locations on Drive of the validation and test split files.
validation_file = '/content/drive/MyDrive/ner/valid.txt'
test_file = '/content/drive/MyDrive/ner/test.txt'

In [26]:
# Run the full preprocessing on the test and validation files and write the
# resulting feature (x) and target (y) pickles next to the training ones.
obj = preprocess()
obj.save_pkl(test_file,"/content/drive/MyDrive/ner/test_x.pkl","/content/drive/MyDrive/ner/test_y.pkl")
obj.save_pkl(validation_file,"/content/drive/MyDrive/ner/val_x.pkl","/content/drive/MyDrive/ner/val_y.pkl")

Create class WordEmbeddings to allow vectorization of new input to the model

In [27]:
class WordEmbeddings:
  """Vectorise a new sentence with the pre-trained word vectors.

  The sentence is split on whitespace and every token is looked up in the
  model; after construction `tags` holds one vector per token, in order.
  """

  # Pre-trained word vectors loaded earlier in the notebook.
  trained_model = trained_model
  # Length of the zero vector used for tokens missing from the model.
  WORD_DIM=300
  # The attribute was originally misspelled; the old name is kept as an alias
  # so any code reading WordEmbeddings.wORD_DIM keeps working.
  wORD_DIM=WORD_DIM

  def word2vec_token(self,model,data,WORD_DIMENSIONS):
      """Return the vector of one token, or zeros if the model lacks it.

      Args:
        model: mapping-like object indexed by token string that raises
          KeyError for unknown tokens.
        data: the token to look up (converted with str()).
        WORD_DIMENSIONS: length of the zero vector returned for unknown
          tokens.
      """
      try:
        tag=model[str(data)]
      except KeyError:
        # float32 zeros so the fallback matches the dtype of the pre-trained
        # vectors rather than being an integer array.
        tag=np.zeros(WORD_DIMENSIONS,dtype=np.float32)
      return tag

  def __init__(self,sentence):
    """Embed every whitespace-separated token of `sentence` into self.tags."""
    self.tags=[]
    for token in sentence.strip().split():
      # Bound-method call: `self` is supplied automatically. Passing it again
      # (as before) shifted every argument and raised TypeError. The constant
      # is read as WORD_DIM, which previously did not exist under that name.
      tag=self.word2vec_token(self.trained_model,token,self.WORD_DIM)
      self.tags.append(tag)