In [1]:
!pip install gensim



In [2]:
import numpy as np
import gensim.downloader
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping
from tensorflow.keras import layers
import warnings
# Suppress every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report
# Seed the random number generators through Keras so repeated runs start from the same state.
keras.utils.set_random_seed(42)

In [4]:
import os
from google.colab import userdata

# Copy the Kaggle credentials stored as Colab secrets into environment variables,
# so that no key or username is hardcoded in the notebook.
for env_name, secret_name in (("KAGGLE_KEY", 'key'), ("KAGGLE_USERNAME", 'username')):
    os.environ[env_name] = userdata.get(secret_name)

In [5]:
# Download the entity-annotated-corpus archive with the Kaggle CLI into the working directory.
# NOTE(review): presumably authenticates through the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables exported earlier — confirm against the Kaggle CLI docs.
!kaggle datasets download -d abhinavwalia95/entity-annotated-corpus

Dataset URL: https://www.kaggle.com/datasets/abhinavwalia95/entity-annotated-corpus
License(s): DbCL-1.0
Downloading entity-annotated-corpus.zip to /content
 19% 5.00M/26.4M [00:00<00:00, 37.6MB/s]
100% 26.4M/26.4M [00:00<00:00, 120MB/s] 


In [6]:
!mkdir data

In [7]:
!unzip /content/entity-annotated-corpus.zip -d /content/data

Archive:  /content/entity-annotated-corpus.zip
  inflating: /content/data/ner.csv   
  inflating: /content/data/ner_dataset.csv  


In [8]:
# Load the token-level corpus and keep only the sentence id, the token and its tag.
# Rows without a sentence id are dropped, missing tokens become the placeholder "xxxxx",
# tokens are lower-cased, and the two-character prefix of every non-'O' tag is stripped.
df = (
    pd.read_csv("/content/data/ner.csv", encoding='unicode_escape', on_bad_lines='skip')
    .loc[:, ["sentence_idx", "word", "tag"]]
    .dropna(subset=['sentence_idx'])
    .reset_index(drop=True)
    .assign(
        sentence_idx=lambda frame: frame['sentence_idx'].map(int),
        word=lambda frame: frame['word'].fillna("xxxxx").map(str.lower),
        tag=lambda frame: frame['tag'].map(lambda label: label if label == 'O' else label[2:]),
    )
)
print(df.shape)
df.head()

(1050794, 3)


Unnamed: 0,sentence_idx,word,tag
0,1,thousands,O
1,1,of,O
2,1,demonstrators,O
3,1,have,O
4,1,marched,O


In [9]:
# Check column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050794 entries, 0 to 1050793
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   sentence_idx  1050794 non-null  int64 
 1   word          1050794 non-null  object
 2   tag           1050794 non-null  object
dtypes: int64(1), object(2)
memory usage: 24.1+ MB


In [10]:
# Corpus-level counts: sentences, distinct (lower-cased) tokens and distinct tags.
word_vocab = set(df["word"].values)
tag_vocab = set(df["tag"].values)
print(f"Number of sentances      = {df['sentence_idx'].nunique()}")
print(f"Number of unique words:  = {len(word_vocab)}")
print(f"Number of unique tags :  = {len(tag_vocab)}")
print(f"unique tags           :  = {list(tag_vocab)}")

Number of sentances      = 35177
Number of unique words:  = 27420
Number of unique tags :  = 9
unique tags           :  = ['O', 'per', 'geo', 'art', 'gpe', 'org', 'tim', 'eve', 'nat']


In [11]:
# Histogram of sentence lengths (number of tokens per sentence).
# Only the per-sentence length array is plotted, so it is passed on its own: the
# token-level frame has one row per token, and its length does not match this array.
sentence_lengths = df['sentence_idx'].value_counts().values
fig = px.histogram(x=sentence_lengths, nbins=50)
fig.update_layout(
    title="Sentences",
    xaxis_title="Sentences with number of words",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
fig.show()

In [12]:
# Collapse the token-level frame into one row per sentence: the token list, the tag list,
# and space-joined string versions of both for easy inspection.
by_sentence = df.groupby('sentence_idx')
tags = by_sentence['tag'].apply(list)
words = by_sentence['word'].apply(list).to_frame()
words['tag'] = tags
words['sentence'] = words['word'].map(' '.join)
words['tag_combine'] = words['tag'].map(' '.join)
words.head()

Unnamed: 0_level_0,word,tag,sentence,tag_combine
sentence_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"[thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, geo, O, O, O, O, O, geo, O,...",thousands of demonstrators have marched throug...,O O O O O O geo O O O O O geo O O O O O gpe O ...
2,"[families, of, soldiers, killed, in, the, conf...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",families of soldiers killed in the conflict jo...,O O O O O O O O O O O O O O O O O O per O O O ...
3,"[they, marched, from, the, houses, of, parliam...","[O, O, O, O, O, O, O, O, O, O, O, geo, geo, O,...",they marched from the houses of parliament to ...,O O O O O O O O O O O geo geo O O O O O O O O ...
4,"[police, put, the, number, of, marchers, at, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","police put the number of marchers at 10,000 wh...",O O O O O O O O O O O O O O O O O O O O O O O ...
5,"[the, protest, comes, on, the, eve, of, the, a...","[O, O, O, O, O, O, O, O, O, O, O, geo, O, O, o...",the protest comes on the eve of the annual con...,O O O O O O O O O O O geo O O org org O O O gp...


In [13]:
# List the names of the pretrained models offered by gensim's downloader.
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [14]:
# Load the pretrained 'glove-wiki-gigaword-100' embeddings through gensim's downloader;
# this lookup table is what later turns every token into a vector.
words_vectors = gensim.downloader.load('glove-wiki-gigaword-100')



In [15]:
def encode(tags,model):
  """Encode each tag sequence with a fitted encoder.

  Args:
    tags: iterable of tag sequences (one sequence per sentence).
    model: object exposing ``transform(sequence)``; it is called once per sequence.

  Returns:
    A list with one ``model.transform(...)`` result per input sequence, in input order.
  """
  return [model.transform(sentence_tags) for sentence_tags in tags]
def word2vec(sens,model,oov_vector=None):
  """Look up one embedding vector per token for every sentence.

  Args:
    sens: iterable of sentences, each an iterable of tokens.
    model: embedding table exposing ``has_index_for(token)`` and ``model[key]``.
    oov_vector: vector used for tokens the table does not contain. When left as
      ``None`` the historical behaviour is kept: the fallback is ``model[0]``, i.e.
      the vector stored at index 0 of the table (a real word's embedding, not a
      dedicated "unknown" vector). Pass an explicit vector to choose a different
      fallback. Avoid an all-zero vector if the result feeds ``Masking(mask_value=0)``,
      as the models in this notebook do, because such a timestep would be treated
      as padding.

  Returns:
    A list with one list of vectors per sentence, in input order.
  """
  fallback=oov_vector
  allVecs=[]
  for sen in sens:
    vecs=[]
    for token in sen:
      if model.has_index_for(token):
        vecs.append(model[token])
      else:
        # Resolve the default fallback lazily, and only once, so an input without
        # out-of-vocabulary tokens never touches model[0].
        if fallback is None:
          fallback=model[0]
        vecs.append(fallback)
    allVecs.append(vecs)
  return allVecs

In [16]:
# Dimensions shared by the padding step and the model's input/output layers.
nlabels = df["tag"].nunique()                            # number of distinct tag classes
vector_size = words_vectors.vector_size                  # width of one embedding vector
max_sentence = df.groupby('sentence_idx').size().max()   # token count of the longest sentence

In [17]:
# Per-sentence token lists and tag lists, taken from the grouped `words` frame.
sentences=words['word']
tags=words['tag']
# Fit the tag -> integer id mapping on every tag occurrence in the corpus.
# NOTE(review): LabelEncoder is expected to number classes in sorted order, which would
# give 'O' the id 0 — the same value later used to pad tag sequences. Confirm via le.classes_.
le=LabelEncoder()
le.fit(df["tag"].values)

In [18]:
# Integer-encode each sentence's tags and look up one embedding per token.
tagsEn = encode(tags=tags, model=le)
vecs = word2vec(sens=sentences, model=words_vectors)

In [19]:
# Bring every sentence to the length of the longest one by padding (or cutting) at the end.
# Embeddings are padded with 0.0 vectors and tag ids with 0.
pad_opts = dict(maxlen=max_sentence, padding='post', truncating='post')
padded_vecs = tf.keras.utils.pad_sequences(vecs, dtype='float32', value=0.0, **pad_opts)
padded_tags = tf.keras.utils.pad_sequences(tagsEn, value=0, **pad_opts)

In [20]:
# Sanity check: (sentences, timesteps, embedding width) and (sentences, timesteps).
for padded in (padded_vecs, padded_tags):
    print(padded.shape)

(35177, 140, 100)
(35177, 140)


In [21]:
# Hold out 33% of the sentences for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    padded_vecs,
    padded_tags,
    test_size=0.33,
    random_state=42,
)

In [22]:
# Shapes of the train/test inputs and targets.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(23568, 140, 100)
(23568, 140)
(11609, 140, 100)
(11609, 140)


In [26]:
# Baseline tagger: two stacked unidirectional LSTMs over the pre-computed word vectors,
# followed by a per-timestep softmax over the tag classes.
model = keras.Sequential()
model.add(layers.Input(shape=(max_sentence, vector_size)))
# Masking with mask_value=0 targets the all-zero timesteps produced by post-padding.
model.add(layers.Masking(mask_value=0))
model.add(layers.SpatialDropout1D(0.1))
model.add(layers.LSTM(128, return_sequences=True, recurrent_dropout=0.2))
model.add(layers.LSTM(units=128, return_sequences=True, recurrent_dropout=0.2))
model.add(layers.TimeDistributed(layers.Dense(nlabels, activation='softmax')))
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_3 (Masking)         (None, 140, 100)          0         
                                                                 
 spatial_dropout1d_3 (Spati  (None, 140, 100)          0         
 alDropout1D)                                                    
                                                                 
 lstm (LSTM)                 (None, 140, 128)          117248    
                                                                 
 lstm_1 (LSTM)               (None, 140, 128)          131584    
                                                                 
 time_distributed (TimeDist  (None, 140, 9)            1161      
 ributed)                                                        
                                                                 
Total params: 249993 (976.54 KB)
Trainable params: 24999

In [27]:
import gc

class GarbageCollectorCallback(tf.keras.callbacks.Callback):
    """Keras callback that runs a garbage-collection pass at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        gc.collect()

# Release whatever is collectable before training starts.
gc.collect()

# Stop once validation accuracy has not improved for 5 epochs and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=0,
)
callbacks = [early_stopping, GarbageCollectorCallback()]

# Targets are integer tag ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for at most 20 epochs, holding out 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [28]:
# Loss and accuracy of the LSTM tagger on the held-out test split
# (np.array() simply wraps y_test as an ndarray before it is passed in).
model.evaluate(x_test, np.array(y_test))



[0.12700144946575165, 0.9598154425621033]

In [32]:
# Bidirectional variant of the tagger: same layout as the baseline, but each LSTM is wrapped
# in Bidirectional so every timestep sees both left and right context.
bimodel = keras.Sequential()
bimodel.add(layers.Input(shape=(max_sentence, vector_size)))
# Masking with mask_value=0 targets the all-zero timesteps produced by post-padding.
bimodel.add(layers.Masking(mask_value=0))
bimodel.add(layers.SpatialDropout1D(0.1))
bimodel.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_dropout=0.2)))
bimodel.add(layers.Bidirectional(layers.LSTM(units=128, return_sequences=True, recurrent_dropout=0.2)))
bimodel.add(layers.TimeDistributed(layers.Dense(nlabels, activation='softmax')))
bimodel.summary()



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_5 (Masking)         (None, 140, 100)          0         
                                                                 
 spatial_dropout1d_5 (Spati  (None, 140, 100)          0         
 alDropout1D)                                                    
                                                                 
 bidirectional_2 (Bidirecti  (None, 140, 256)          234496    
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 140, 256)          394240    
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 140, 9)            2313      
 stributed)                                           

In [None]:
import gc

class GarbageCollectorCallback(tf.keras.callbacks.Callback):
    """Keras callback that runs a garbage-collection pass at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        gc.collect()

# Release whatever is collectable before training starts.
gc.collect()

# A fresh early-stopping callback for this run: stop after 5 epochs without
# validation-accuracy improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=0,
)
callbacks = [early_stopping, GarbageCollectorCallback()]

# Targets are integer tag ids, hence the sparse categorical cross-entropy loss.
bimodel.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for at most 20 epochs, holding out 20% of the training data for validation.
bihistory = bimodel.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

In [None]:
# Evaluate the bidirectional tagger on the held-out test split, using the same inputs as the baseline
# (np.array() simply wraps y_test as an ndarray before it is passed in).
bimodel.evaluate(x_test, np.array(y_test))