In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv("../NER_Dataset/ner-gmb/train.txt", sep = " ", header = None, names = ['token', 'postag', 'word', 'nertag'], skip_blank_lines=False)
df_list = np.split(df, df[df.isnull().all(1)].index) 
#add sentence number to each dataframe and then combine. Also for each dataframe remove the first row

def remove_first_row(df):
    '''removes the first row after the heading from the dataframe - used for removing NaN row'''
    df.drop([0], inplace = True)
    return df
    
def add_sentence_number(df, i):
    '''adds a column to df in which all rows have value i'''
    df['sentence'] = i
    return df
    

new_dflist = []
for i, df in enumerate(df_list[1:]):
    df.reset_index(drop=True, inplace = True)
    df = remove_first_row(df)
    df = add_sentence_number(df, i)
    new_dflist.append(df)
len(new_dflist)

37207

In [6]:
def combine_dataframes(df_list):

    return pd.concat(df_list)


df = combine_dataframes(new_dflist)
df.reset_index(drop=True, inplace = True)

In [7]:
df

Unnamed: 0,token,postag,word,nertag,sentence
0,the,DT,The,O,0
1,anatolia,NNP,Anatolia,B-org,0
2,news,NN,news,O,0
3,agency,NN,agency,O,0
4,say,VBD,said,O,0
...,...,...,...,...,...
813586,still,RB,still,O,37205
813587,launch,VB,launch,O,37205
813588,occasional,JJ,occasional,O,37205
813589,attack,NNS,attacks,O,37205


In [20]:
#making the vocabulary
words = list(df.token.unique())
print(words[:10])
print(len(words), "total words")

tags = list(df.nertag.unique())
print(tags)
print(len(tags), "total tags")

['the', 'anatolia', 'news', 'agency', 'say', 'saturday', 'that', 'officer', '-', 'two']
24137 total words
['O', 'B-org', 'B-tim', 'B-geo', 'B-gpe', 'B-per', 'I-per', 'I-org', nan, 'I-geo', 'I-tim', 'B-eve', 'I-eve', 'I-gpe', 'B-nat', 'I-nat', 'B-art', 'I-art']
18 total tags


In [None]:
# mapping words and tags to indices
wordnums = {word : i for i, w in enumerate(words)}
tagnums = {tags : i for }

# BiLSTM Model

In [5]:
class BiLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size, v, num_tags):
        super(BiLSTM, self).__init__()

        self.wordembed = nn.Embedding(v, embedding_size)
        self.bilstm = nn.LSTM(embedding_size,hidden_size, biderectional = True)
        self.linear = nn.Linear(hidden_size, num_tags)
        
    def forward(self, x):
        out = self.wordembed(x) # x is of size(seq_len, batch, input_size)
        out, (h,c) = self.bilstm(out) # after this step out has dimension(seq_len, batch, hidden_size)
        out = out.view(-1, out.shape[2])
        out = F.log_softmax(self.linear(out), dim = 1)
        
        return out
        
        