### Import the necessary packages and load the data

In [9]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from hmmlearn import hmm
import sklearn
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [8]:
!pip install hmmlearn

Collecting hmmlearn
  Downloading hmmlearn-0.3.0-cp39-cp39-win_amd64.whl (123 kB)
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.3.0


In [10]:
data = pd.read_csv('ner_dataset.csv', encoding = "latin1")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


### Data Preprocessing and Data Cleaning

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  47959 non-null    object
 1   Word        1048575 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [12]:
data.nunique()

Sentence #    47959
Word          35178
POS              42
Tag              17
dtype: int64

In [13]:
data.rename(columns={'Sentence #':'Sentence'}, inplace=True)

In [14]:
data = data.fillna(method = 'ffill')

In [15]:
data.isnull().sum()

Sentence    0
Word        0
POS         0
Tag         0
dtype: int64

In [16]:
print(data.head(10))
print(data.tail(10))

      Sentence           Word  POS    Tag
0  Sentence: 1      Thousands  NNS      O
1  Sentence: 1             of   IN      O
2  Sentence: 1  demonstrators  NNS      O
3  Sentence: 1           have  VBP      O
4  Sentence: 1        marched  VBN      O
5  Sentence: 1        through   IN      O
6  Sentence: 1         London  NNP  B-geo
7  Sentence: 1             to   TO      O
8  Sentence: 1        protest   VB      O
9  Sentence: 1            the   DT      O
                Sentence       Word  POS    Tag
1048565  Sentence: 47958     impact   NN      O
1048566  Sentence: 47958          .    .      O
1048567  Sentence: 47959     Indian   JJ  B-gpe
1048568  Sentence: 47959     forces  NNS      O
1048569  Sentence: 47959       said  VBD      O
1048570  Sentence: 47959       they  PRP      O
1048571  Sentence: 47959  responded  VBD      O
1048572  Sentence: 47959         to   TO      O
1048573  Sentence: 47959        the   DT      O
1048574  Sentence: 47959     attack   NN      O


In [17]:
' '.join(data[data['Sentence'] == 'Sentence: 1']['Word'].values)

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [18]:
' '.join(data[data['Sentence'] == 'Sentence: 2']['Word'].values)

'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "'

In [19]:
' '.join(data[data['Sentence'] == 'Sentence: 47959']['Word'].values)

'Indian forces said they responded to the attack'

In [20]:
data['Tag'].value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [21]:
tags = list(set(data['Tag'].values))
words = list(set(data['Word'].values))
print('Number of unique words in the dataset:', len(words))
print('Number of unique tags in the dataset:', len(tags))

Number of unique words in the dataset: 35178
Number of unique tags in the dataset: 17


### Data Preparation - Split the data into train and test set

We use **GroupShuffleSplit** to spilt the data into train and test set as using **train_test_split** will make some parts of the sentence go to train set and others to test set

In [22]:
data.columns

Index(['Sentence', 'Word', 'POS', 'Tag'], dtype='object')

In [23]:
y = data['Tag']
x = data.drop(columns='Tag')

In [24]:
split_gs = GroupShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
# n_splitsint, default=5 : Number of re-shuffling & splitting iterations.

In [25]:
x_train, x_test = next(split_gs.split(x, y, groups=x['Sentence']))

In [26]:
x.shape

(1048575, 3)

In [27]:
x_train.shape, x_test.shape

((734523,), (314052,))

In [28]:
train_data = data.iloc[x_train]
test_data = data.iloc[x_test]

In [29]:
train_data.shape, test_data.shape

((734523, 4), (314052, 4))

In [30]:
train_data.head()

Unnamed: 0,Sentence,Word,POS,Tag
24,Sentence: 2,Families,NNS,O
25,Sentence: 2,of,IN,O
26,Sentence: 2,soldiers,NNS,O
27,Sentence: 2,killed,VBN,O
28,Sentence: 2,in,IN,O


In [33]:
data_update = train_data.sample(frac=0.1, replace=False, random_state=42).reset_index(drop=True)

In [34]:
data_update.shape

(73452, 4)

In [35]:
data_update.head()

Unnamed: 0,Sentence,Word,POS,Tag
0,Sentence: 23582,the,DT,O
1,Sentence: 12377,of,IN,O
2,Sentence: 26652,.,.,O
3,Sentence: 9035,the,DT,O
4,Sentence: 16245,killed,VBN,O


In [36]:
data_update.Word = "UNKNOWN"

In [37]:
train_data['Word'].unique()

array(['Families', 'of', 'soldiers', ..., 'Soe', '3700', 'Bermel'],
      dtype=object)

In [38]:
train_data.update(data_update)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.update(data_update)


In [39]:
train_data['Word'].unique()

array(['UNKNOWN', 'the', 'illness', ..., 'Soe', '3700', 'Bermel'],
      dtype=object)

In [40]:
display(train_data.head().reset_index(drop=True))
display(train_data.tail().reset_index(drop=True))

Unnamed: 0,Sentence,Word,POS,Tag
0,Sentence: 31683,UNKNOWN,NNP,O
1,Sentence: 8392,UNKNOWN,DT,O
2,Sentence: 28070,UNKNOWN,NNS,O
3,Sentence: 40293,UNKNOWN,DT,O
4,Sentence: 44552,UNKNOWN,IN,O


Unnamed: 0,Sentence,Word,POS,Tag
0,Sentence: 47959,they,PRP,O
1,Sentence: 47959,responded,VBD,O
2,Sentence: 47959,to,TO,O
3,Sentence: 47959,the,DT,O
4,Sentence: 47959,attack,NN,O


In [42]:
print(words[:10])

['censored', 'Dragmulla', 'foster', 'analysts', 'tilt', 'Dick', 'Sirnak', 'Tanith', 'Dhabi', 'dominance']


In [43]:
word2id = {w:i for i,w in enumerate(words)}
tag2id = {t:i for i,t in enumerate(tags)}
id2tag = {i:t for i,t in enumerate(tags)}

In [44]:
print(word2id)



In [45]:
print(tag2id)

{'B-geo': 0, 'O': 1, 'I-geo': 2, 'I-tim': 3, 'I-nat': 4, 'I-per': 5, 'B-gpe': 6, 'I-gpe': 7, 'B-org': 8, 'I-eve': 9, 'B-eve': 10, 'I-art': 11, 'B-tim': 12, 'B-nat': 13, 'B-per': 14, 'I-org': 15, 'B-art': 16}


In [46]:
print(id2tag)

{0: 'B-geo', 1: 'O', 2: 'I-geo', 3: 'I-tim', 4: 'I-nat', 5: 'I-per', 6: 'B-gpe', 7: 'I-gpe', 8: 'B-org', 9: 'I-eve', 10: 'B-eve', 11: 'I-art', 12: 'B-tim', 13: 'B-nat', 14: 'B-per', 15: 'I-org', 16: 'B-art'}


### Hidden Markov Models

In [47]:
count_tags = dict(train_data['Tag'].value_counts())
print(count_tags)

{'O': 621981, 'B-geo': 26568, 'B-tim': 14380, 'B-org': 14062, 'I-per': 11998, 'B-per': 11835, 'I-org': 11799, 'B-gpe': 10912, 'I-geo': 5246, 'I-tim': 4692, 'B-art': 255, 'B-eve': 192, 'I-art': 174, 'I-eve': 149, 'B-nat': 130, 'I-gpe': 123, 'I-nat': 27}


In [48]:
train_data.head()

Unnamed: 0,Sentence,Word,POS,Tag
24,Sentence: 31683,UNKNOWN,NNP,O
25,Sentence: 8392,UNKNOWN,DT,O
26,Sentence: 28070,UNKNOWN,NNS,O
27,Sentence: 40293,UNKNOWN,DT,O
28,Sentence: 44552,UNKNOWN,IN,O


In [49]:
grouped_data = train_data.groupby('Tag')
for group_name, group_data in grouped_data:
    print(f"Group: {group_name}")
    display(group_data)

Group: B-art


Unnamed: 0,Sentence,Word,POS,Tag
13288,Sentence: 10857,UNKNOWN,IN,B-art
14210,Sentence: 21780,UNKNOWN,NNP,B-art
15551,Sentence: 46145,UNKNOWN,NNP,B-art
20690,Sentence: 8926,UNKNOWN,NN,B-art
25429,Sentence: 26264,UNKNOWN,NNP,B-art
...,...,...,...,...
1027190,Sentence: 46974,Arc,NNP,B-art
1033280,Sentence: 47247,Google,NNP,B-art
1041052,Sentence: 47608,Zim,NNP,B-art
1041448,Sentence: 47627,al-Jazeera,NNP,B-art


Group: B-eve


Unnamed: 0,Sentence,Word,POS,Tag
10337,Sentence: 26807,UNKNOWN,NNP,B-eve
21751,Sentence: 9008,UNKNOWN,NNP,B-eve
26762,Sentence: 1962,UNKNOWN,NNP,B-eve
35414,Sentence: 32870,UNKNOWN,NNP,B-eve
36708,Sentence: 8482,UNKNOWN,NNS,B-eve
...,...,...,...,...
1027168,Sentence: 46973,Armistice,NNP,B-eve
1027179,Sentence: 46974,World,NNP,B-eve
1041907,Sentence: 47649,II,NNP,B-eve
1041940,Sentence: 47651,World,NNP,B-eve


Group: B-geo


Unnamed: 0,Sentence,Word,POS,Tag
30,Sentence: 18569,UNKNOWN,NNP,B-geo
38,Sentence: 8410,UNKNOWN,NN,B-geo
103,Sentence: 40919,UNKNOWN,NNP,B-geo
125,Sentence: 42300,UNKNOWN,NNP,B-geo
130,Sentence: 46773,UNKNOWN,NNP,B-geo
...,...,...,...,...
1048181,Sentence: 47941,southern,JJ,B-geo
1048221,Sentence: 47943,Burma,NNP,B-geo
1048456,Sentence: 47953,Bermel,NNP,B-geo
1048459,Sentence: 47953,Paktika,NNP,B-geo


Group: B-gpe


Unnamed: 0,Sentence,Word,POS,Tag
265,Sentence: 10972,UNKNOWN,JJ,B-gpe
598,Sentence: 1048,UNKNOWN,JJ,B-gpe
604,Sentence: 35255,UNKNOWN,JJ,B-gpe
893,Sentence: 42188,UNKNOWN,JJ,B-gpe
1003,Sentence: 45570,UNKNOWN,NNP,B-gpe
...,...,...,...,...
1048465,Sentence: 47953,Afghan,JJ,B-gpe
1048501,Sentence: 47955,Indian,JJ,B-gpe
1048508,Sentence: 47955,Pakistani,JJ,B-gpe
1048521,Sentence: 47956,Indian,JJ,B-gpe


Group: B-nat


Unnamed: 0,Sentence,Word,POS,Tag
4075,Sentence: 40041,UNKNOWN,NNP,B-nat
14665,Sentence: 11690,UNKNOWN,NNP,B-nat
16263,Sentence: 39034,UNKNOWN,NNP,B-nat
21853,Sentence: 32344,UNKNOWN,NNP,B-nat
30376,Sentence: 45126,UNKNOWN,NNP,B-nat
...,...,...,...,...
986989,Sentence: 45125,Marburg,NNP,B-nat
987023,Sentence: 45126,Marburg,NNP,B-nat
997163,Sentence: 45597,2007,NNP,B-nat
1005549,Sentence: 45978,AIDS,NNP,B-nat


Group: B-org


Unnamed: 0,Sentence,Word,POS,Tag
45,Sentence: 38559,UNKNOWN,NNP,B-org
49,Sentence: 41902,UNKNOWN,NNP,B-org
88,Sentence: 1168,UNKNOWN,NNP,B-org
92,Sentence: 17780,UNKNOWN,JJ,B-org
129,Sentence: 27387,UNKNOWN,NNP,B-org
...,...,...,...,...
1048289,Sentence: 47945,Thura,NNP,B-org
1048338,Sentence: 47947,General,NNP,B-org
1048376,Sentence: 47950,Joint,NNP,B-org
1048450,Sentence: 47953,Taleban,NNP,B-org


Group: B-per


Unnamed: 0,Sentence,Word,POS,Tag
93,Sentence: 42643,UNKNOWN,NNP,B-per
108,Sentence: 4312,UNKNOWN,NNP,B-per
115,Sentence: 34476,UNKNOWN,NNP,B-per
222,Sentence: 30474,UNKNOWN,NNP,B-per
254,Sentence: 43718,UNKNOWN,NNP,B-per
...,...,...,...,...
1048156,Sentence: 47940,David,NNP,B-per
1048191,Sentence: 47941,Richards,NNP,B-per
1048209,Sentence: 47942,Richards,NNP,B-per
1048229,Sentence: 47943,Minister,NNP,B-per


Group: B-tim


Unnamed: 0,Sentence,Word,POS,Tag
43,Sentence: 22331,UNKNOWN,NNP,B-tim
112,Sentence: 24312,UNKNOWN,NNP,B-tim
256,Sentence: 17485,UNKNOWN,CD,B-tim
337,Sentence: 45719,UNKNOWN,NNP,B-tim
460,Sentence: 11893,UNKNOWN,NNP,B-tim
...,...,...,...,...
1048226,Sentence: 47943,today,NN,B-tim
1048333,Sentence: 47947,October,NNP,B-tim
1048371,Sentence: 47949,Afghanistan,NNP,B-tim
1048381,Sentence: 47950,Monday,NNP,B-tim


Group: I-art


Unnamed: 0,Sentence,Word,POS,Tag
2669,Sentence: 28696,UNKNOWN,NNP,I-art
12964,Sentence: 11286,UNKNOWN,IN,I-art
18643,Sentence: 33786,UNKNOWN,NNP,I-art
22885,Sentence: 40538,UNKNOWN,NNP,I-art
25277,Sentence: 30419,UNKNOWN,NNP,I-art
...,...,...,...,...
1022277,Sentence: 46750,Day,NNP,I-art
1027191,Sentence: 46974,de,NNP,I-art
1027192,Sentence: 46974,Triomphe,NNP,I-art
1041053,Sentence: 47608,Asia,NNP,I-art


Group: I-eve


Unnamed: 0,Sentence,Word,POS,Tag
3150,Sentence: 39078,UNKNOWN,NNP,I-eve
10168,Sentence: 21776,UNKNOWN,NNP,I-eve
13227,Sentence: 11567,UNKNOWN,NNP,I-eve
29782,Sentence: 46749,UNKNOWN,NNP,I-eve
30344,Sentence: 5510,UNKNOWN,NNP,I-eve
...,...,...,...,...
1027169,Sentence: 46973,Day,NNP,I-eve
1027180,Sentence: 46974,War,NNP,I-eve
1027181,Sentence: 46974,I,NNP,I-eve
1041941,Sentence: 47651,War,NNP,I-eve


Group: I-geo


Unnamed: 0,Sentence,Word,POS,Tag
97,Sentence: 1362,UNKNOWN,NNPS,I-geo
905,Sentence: 11334,UNKNOWN,NNP,I-geo
1028,Sentence: 41070,UNKNOWN,NNP,I-geo
1210,Sentence: 43841,UNKNOWN,NNP,I-geo
1312,Sentence: 26515,UNKNOWN,NNP,I-geo
...,...,...,...,...
1047162,Sentence: 47887,Rouge,NNP,I-geo
1047365,Sentence: 47898,India,NNP,I-geo
1047948,Sentence: 47926,Afghanistan,NNP,I-geo
1048073,Sentence: 47934,Afghanistan,NNP,I-geo


Group: I-gpe


Unnamed: 0,Sentence,Word,POS,Tag
12277,Sentence: 3204,UNKNOWN,NNP,I-gpe
19856,Sentence: 20428,UNKNOWN,NNP,I-gpe
41358,Sentence: 3675,UNKNOWN,JJ,I-gpe
58292,Sentence: 23363,UNKNOWN,JJ,I-gpe
61769,Sentence: 43956,UNKNOWN,JJ,I-gpe
...,...,...,...,...
1015200,Sentence: 46423,Serbs,NNS,I-gpe
1015213,Sentence: 46423,Serb,JJ,I-gpe
1038033,Sentence: 47463,Croat,JJ,I-gpe
1038535,Sentence: 47487,Serb,JJ,I-gpe


Group: I-nat


Unnamed: 0,Sentence,Word,POS,Tag
47926,Sentence: 34647,UNKNOWN,NNP,I-nat
107418,Sentence: 4883,Katrina,NNP,I-nat
110107,Sentence: 5011,Katrina,NNP,I-nat
110169,Sentence: 5014,Katrina,NNP,I-nat
131770,Sentence: 6017,Katrina,NNP,I-nat
193933,Sentence: 8882,flu,NN,I-nat
226400,Sentence: 10362,disease,NN,I-nat
226407,Sentence: 10363,Two,CD,I-nat
226408,Sentence: 10363,diabetes,NNS,I-nat
250023,Sentence: 11438,Rita,NNP,I-nat


Group: I-org


Unnamed: 0,Sentence,Word,POS,Tag
70,Sentence: 33896,UNKNOWN,NNP,I-org
74,Sentence: 6926,UNKNOWN,",",I-org
100,Sentence: 32160,UNKNOWN,NNP,I-org
127,Sentence: 17085,UNKNOWN,NNP,I-org
238,Sentence: 28411,UNKNOWN,NNP,I-org
...,...,...,...,...
1048340,Sentence: 47947,Win,NNP,I-org
1048377,Sentence: 47950,Coordination,NNP,I-org
1048378,Sentence: 47950,and,CC,I-org
1048379,Sentence: 47950,Monitoring,NNP,I-org


Group: I-per


Unnamed: 0,Sentence,Word,POS,Tag
106,Sentence: 2393,UNKNOWN,NNP,I-per
116,Sentence: 39633,UNKNOWN,NNP,I-per
124,Sentence: 15431,UNKNOWN,NNP,I-per
246,Sentence: 22829,UNKNOWN,NNP,I-per
343,Sentence: 30836,UNKNOWN,NNP,I-per
...,...,...,...,...
1048135,Sentence: 47939,Karzai,NNP,I-per
1048157,Sentence: 47940,Richards,NNP,I-per
1048230,Sentence: 47943,Khin,NNP,I-per
1048231,Sentence: 47943,Nyunt,NNP,I-per


Group: I-tim


Unnamed: 0,Sentence,Word,POS,Tag
75,Sentence: 3864,UNKNOWN,CD,I-tim
522,Sentence: 27757,UNKNOWN,CD,I-tim
607,Sentence: 7878,UNKNOWN,NN,I-tim
769,Sentence: 27182,UNKNOWN,CC,I-tim
944,Sentence: 39400,UNKNOWN,CD,I-tim
...,...,...,...,...
1046766,Sentence: 47869,1992,CD,I-tim
1046767,Sentence: 47869,and,CC,I-tim
1046768,Sentence: 47869,2000,CD,I-tim
1047132,Sentence: 47886,18,CD,I-tim


Group: O


Unnamed: 0,Sentence,Word,POS,Tag
24,Sentence: 31683,UNKNOWN,NNP,O
25,Sentence: 8392,UNKNOWN,DT,O
26,Sentence: 28070,UNKNOWN,NNS,O
27,Sentence: 40293,UNKNOWN,DT,O
28,Sentence: 44552,UNKNOWN,IN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [54]:
count_tags_to_words = train_data.groupby('Tag').apply(lambda x: x.groupby('Word')['Tag'].count().to_dict()).to_dict()
display(count_tags_to_words)

{'B-art': {'300': 2,
  '@dalailama': 1,
  'A380s': 1,
  'AK-47': 1,
  'AM': 1,
  'Abu': 2,
  'Ai': 2,
  'Akash': 1,
  'Al': 1,
  'Anglo-French': 1,
  'Ansari': 1,
  'Arabic': 2,
  'Arc': 1,
  'Ariana': 1,
  'Atlantis': 1,
  'Axum': 1,
  'Azeri': 1,
  'BPA': 1,
  'Balad': 1,
  'Basilica': 1,
  'Beijing': 2,
  'Best': 1,
  'Bisphenol': 1,
  'Blades': 1,
  'Blueberry': 1,
  'Boeing': 1,
  'British': 1,
  'Buddhism': 1,
  'Cajun': 1,
  'Camp': 3,
  'Canal': 4,
  'Capitol': 1,
  'China': 1,
  'Chinook': 1,
  'Chung': 1,
  'Creole': 1,
  'Culture': 1,
  'Cy': 1,
  'Da': 1,
  'Date': 1,
  'Dejarte': 1,
  'Democrat': 1,
  'Destiny': 1,
  'Dextre': 2,
  'Didadgah': 1,
  'Dignity': 1,
  'Doha': 1,
  'Economics': 1,
  'El~Commercio': 1,
  'Emergency': 1,
  'Emmy': 1,
  'Endeavor': 1,
  'Endeavour': 1,
  'English': 10,
  'Facebook': 6,
  'Ferrari': 1,
  'For': 1,
  'Forbes': 3,
  'Fort': 1,
  'Four': 1,
  'Frankenstadion': 1,
  'French': 2,
  'GDP': 4,
  'Georgia': 1,
  'German': 1,
  'Google': 2,

In [55]:
count_init_tags = dict(train_data.groupby('Sentence').first().Tag.value_counts())
display(count_init_tags)

{'O': 26999,
 'B-geo': 1434,
 'B-per': 1059,
 'B-org': 982,
 'B-gpe': 803,
 'B-tim': 553,
 'I-per': 410,
 'I-org': 391,
 'I-geo': 188,
 'I-tim': 167,
 'B-art': 9,
 'B-eve': 8,
 'I-art': 8,
 'B-nat': 6,
 'I-eve': 5,
 'I-gpe': 4,
 'I-nat': 1}

In [56]:
count_tags_to_next_tags = np.zeros((len(tags), len(tags)),dtype=int)
print(count_tags_to_next_tags.shape)

(17, 17)


In [57]:
train_data['Sentence'][:5]

24    Sentence: 31683
25     Sentence: 8392
26    Sentence: 28070
27    Sentence: 40293
28    Sentence: 44552
Name: Sentence, dtype: object

In [58]:
sentences = list(train_data.Sentence)
print(sentences[:5])

['Sentence: 31683', 'Sentence: 8392', 'Sentence: 28070', 'Sentence: 40293', 'Sentence: 44552']


In [59]:
pos = list(train_data.Tag)
print(pos[:10])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O']


In [60]:
for i in range(len(sentences)):
    if (i>0) and (sentences[i]==sentences[i-1]):
        prevtagid = tag2id[pos[i - 1]]
        nexttagid = tag2id[pos[i]]
        count_tags_to_next_tags[prevtagid][nexttagid] += 1

In [61]:
print(count_tags_to_next_tags[0])

[    0 19648  4349     0     0     0   102     0    36     0     1     0
   507     0    74     0     0]


In [62]:
mystartprob = np.zeros((len(tags),))
mytransmat = np.zeros((len(tags), len(tags)))
myemissionprob = np.zeros((len(tags), len(words)))
num_sentences = sum(count_init_tags.values())
sum_tags_to_next_tags = np.sum(count_tags_to_next_tags, axis=1)
for tag, tagid in tag2id.items():
    floatCountTag = float(count_tags.get(tag, 0))
    mystartprob[tagid] = count_init_tags.get(tag, 0) / num_sentences
    for word, wordid in word2id.items():
        myemissionprob[tagid][wordid]= count_tags_to_words.get(tag, {}).get(word, 0) / floatCountTag
    for tag2, tagid2 in tag2id.items():
        mytransmat[tagid][tagid2]= count_tags_to_next_tags[tagid][tagid2] / sum_tags_to_next_tags[tagid]

In [63]:
print(mystartprob.shape)
print(mytransmat.shape)
print(myemissionprob.shape)

(17,)
(17, 17)
(17, 29219)


In [64]:
model = hmm.MultinomialHMM(n_components=len(tags), algorithm='viterbi', random_state=42)
model.startprob_ = mystartprob
model.transmat_ = mytransmat
model.emissionprob_ = myemissionprob

MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


In [65]:
test_data.loc[~test_data['Word'].isin(words), 'Word'] = 'UNKNOWN'   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.loc[~test_data['Word'].isin(words), 'Word'] = 'UNKNOWN'


In [66]:
word_test = list(test_data.Word)
samples = []
for i, val in enumerate(word_test):
    samples.append([word2id[val]])

In [67]:
lengths = []
count = 0
sentences = list(test_data.Sentence)
for i in range(len(sentences)) :
    if (i > 0) and (sentences[i] == sentences[i - 1]):
        count += 1
    elif i > 0:
        lengths.append(count)
        count = 1
    else:
        count = 1

In [68]:
len(lengths)

14387

In [None]:
ner_predict = model.predict(samples, lengths)