In [1]:
import numpy as np
from hmm_utils import HMM
from params import *

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import random

#some other libraries
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from typing import List

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score

[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


## Perprocessing

In [2]:
data = pd.read_csv("data/ner.csv", encoding = "latin1")
data = data.fillna(method="ffill")
data = data.rename(columns={'Sentence #': 'sentence'})
data.head(5)

  data = data.fillna(method="ffill")


Unnamed: 0,sentence,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
def pre_processing(text_column):
    # lowercase all text in the column
    text_column = text_column.str.lower()

    # replacing numbers with NUM token
    text_column = text_column.str.replace(r'\d+', 'NUM')

    # removing stopwords
    stop_words = set(stopwords.words('english'))
    text_column = text_column.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    return text_column

data_pre_precessed = pre_processing(data.Word)
#creating new dataframe with preprocessed word as a column
data_processed = data
data_processed['Word'] = data_pre_precessed

#removing the rows where word is empty
data_processed = data_processed[(data_processed['Word'] != '') | (data_processed['Word'].isna())]

In [4]:
tags = list(set(data.POS.values))  # Unique POS tags in the dataset
words = list(set(data.Word.values))  # Unique words in the dataset
len(tags), len(words)

words1 = list(set(data_processed.Word.values))  # Unique words in the dataset
len(words1)

31682

In [5]:
# Most common words
N_w = 300
common_words = data_processed['Word'] .value_counts().sort_values(ascending=False)[:N_w].index
data_reduced = data_processed[data_processed['Word'].isin(common_words)]

tags = list(set(data_reduced.POS.values))  # Unique POS tags in the dataset
words = list(set(data_reduced.Word.values))  # Unique words in the dataset
len(tags), len(words)


(29, 300)

In [6]:
words = list(set(data_reduced.Word.values))
# Convert words and tags into numbers
word2id = {w: i for i, w in enumerate(words)}
tag2id = {t: i for i, t in enumerate(tags)}
id2tag = {i: t for i, t in enumerate(tags)}
len(tags), len(words)

(29, 300)

## Create HMM manually

In [7]:
count_tags = dict(data_reduced.POS.value_counts())  # Total number of POS tags in the dataset
# Now let's create the tags to words count
count_tags_to_words = data_reduced.groupby(['POS']).apply(
    lambda grp: grp.groupby('Word')['POS'].count().to_dict()).to_dict()
# We shall also collect the counts for the first tags in the sentence
count_init_tags = dict(data_reduced.groupby('sentence').first().POS.value_counts())

# Create a mapping that stores the frequency of transitions in tags to it's next tags
count_tags_to_next_tags = np.zeros((len(tags), len(tags)), dtype=int)
sentences = list(data_reduced.sentence)
pos = list(data_reduced.POS)
for i in tqdm(range(len(sentences)), position=0, leave=True):
    if (i > 0) and (sentences[i] == sentences[i - 1]):
        prevtagid = tag2id[pos[i - 1]]
        nexttagid = tag2id[pos[i]]
        count_tags_to_next_tags[prevtagid][nexttagid] += 1

100%|██████████████████████████████| 329086/329086 [00:00<00:00, 2382540.51it/s]


In [8]:
startprob = np.zeros((len(tags),))
transmat = np.zeros((len(tags), len(tags)))
emissionprob = np.zeros((len(tags), len(words)))
num_sentences = sum(count_init_tags.values())
sum_tags_to_next_tags = np.sum(count_tags_to_next_tags, axis=1)
for tag, tagid in tqdm(tag2id.items(), position=0, leave=True):
    floatCountTag = float(count_tags.get(tag, 0))
    startprob[tagid] = count_init_tags.get(tag, 0) / num_sentences
    for word, wordid in word2id.items():
        emissionprob[tagid][wordid] = count_tags_to_words.get(tag, {}).get(word, 0) / floatCountTag
    for tag2, tagid2 in tag2id.items():
        transmat[tagid][tagid2] = count_tags_to_next_tags[tagid][tagid2] / sum_tags_to_next_tags[tagid]

100%|████████████████████████████████████████| 29/29 [00:00<00:00, 12752.65it/s]


# Build HMM

In [9]:
cutoff = 0.001
startprob = startprob +  cutoff
startprob = startprob/ np.sum(startprob)
##
transmat =  transmat + cutoff
transmat = transmat / np.sum(transmat, axis=1)
np.sum(transmat, axis=1)
##
emissionprob =  emissionprob + cutoff
emissionprob = emissionprob / np.sum(emissionprob, axis=1).reshape(-1,1)
np.sum(emissionprob, axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [10]:
hmm_n = HMM(len(tags), len(words))
hmm_n.startprob_ = startprob
hmm_n.transmat_ = transmat
hmm_n.emissionprob_ = emissionprob

# Attack HMM - APS

In [11]:
from solvers.aps_gibbs_class import aps_gibbs
from attackers.decoding_attacker import dec_attacker

In [12]:
T  = 53
n_obs = 300
n_hidden = 29
w1 = 0.0
w2 = 100.0 
k_value = 1000000.0
cool = np.arange(500,501, 1)
seq = 22*np.ones(T).astype(int)
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]

###
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]
    
A =  X.astype(int)

_, y_pred = hmm_n.nu(A[0])
y_pred

y = np.zeros(len(data_reduced.POS[:T]))
for i in range(len(data_reduced.POS[:T])):
    y[i] = tag2id[data_reduced.POS[:T].iloc[i]]
###

A =  X.astype(int)

In [13]:
rho_probs = np.ones(n_obs)
att = dec_attacker(hmm_n.startprob_ , hmm_n.transmat_, hmm_n.emissionprob_, rho_probs,
         A.T, w1, w2, seq, k_value)

In [14]:
find_sol = aps_gibbs(att, cool, burnin=0.1, verbose=True)
sol, samples = find_sol.iterate(simulation_seconds=None)

Percentage completed: 0.0
Current state [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0
1


KeyboardInterrupt: 

In [None]:
attack_obs = att.attack_X(np.ones_like(sol), sol)
attack_obs = attack_obs.squeeze().astype(int)
V, seq = hmm_n.nu(attack_obs)
probs, opt_s = hmm_n.nu(seq)

In [None]:
opt_s

In [None]:
np.argmax(sol, axis=1)

In [None]:
A[0] == np.argmax(sol, axis=1)

In [48]:
#word2id

# Attack HMM - RS

In [24]:
from solvers.nn_RS.nn_RS import nn_RS
from attackers.decoding_attacker import dec_attacker

In [25]:
T  = 53
n_obs = 300
n_hidden = 29
w1 = 0.0
w2 = 100.0 
k_value = 1000000.0
seq = 22*np.ones(T).astype(int)
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]

###
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]
    
A =  X.astype(int)

_, y_pred = hmm_n.nu(A[0])
y_pred

y = np.zeros(len(data_reduced.POS[:T]))
for i in range(len(data_reduced.POS[:T])):
    y[i] = tag2id[data_reduced.POS[:T].iloc[i]]
###

A =  X.astype(int)

In [26]:
rho_probs = np.ones(n_obs)
att = dec_attacker(hmm_n.startprob_ , hmm_n.transmat_, hmm_n.emissionprob_, rho_probs,
         A.T, w1, w2, seq, k_value)

In [27]:
find_sol = nn_RS(att, "SA", RS_iters=10000, mcts_iters=10, sa_iters=10, eps=0.05, lr=0.005, verbose=True)
sol, samples = find_sol.iterate(simulation_seconds=None)

Percentage completed: 0.0
Best value: 
0.0
Percentage completed: 0.5
Best value: 
101.0
Percentage completed: 1.0
Best value: 
101.0
Percentage completed: 1.5
Best value: 
201.0
Percentage completed: 2.0
Best value: 
201.0
Percentage completed: 2.5
Best value: 
201.0
Percentage completed: 3.0
Best value: 
201.0
Percentage completed: 3.5
Best value: 
201.0
Percentage completed: 4.0
Best value: 
301.0
Percentage completed: 4.5
Best value: 
401.0
Percentage completed: 5.0
Best value: 
501.0
Percentage completed: 5.5
Best value: 
501.0
Percentage completed: 6.0
Best value: 
601.0
Percentage completed: 6.5
Best value: 
601.0
Percentage completed: 7.0
Best value: 
701.0
Percentage completed: 7.5
Best value: 
701.0
Percentage completed: 8.0
Best value: 
701.0
Percentage completed: 8.5
Best value: 
701.0
Percentage completed: 9.0
Best value: 
701.0
Percentage completed: 9.5
Best value: 
701.0
Percentage completed: 10.0
Best value: 
701.0
Percentage completed: 10.5
Best value: 
701.0
Percentage

In [28]:
attack_obs = att.attack_X(np.ones_like(sol), sol)
attack_obs = attack_obs.squeeze().astype(int)
V, seq = hmm_n.nu(attack_obs)
probs, opt_s = hmm_n.nu(seq)

In [29]:
opt_s

array([ 2,  0,  1,  0,  2,  0, 13,  0, 13,  0,  1,  0,  1, 13,  0,  0, 13,
        0,  0, 13,  0,  0, 13,  0,  1,  0,  0, 13,  0,  1,  1,  1,  0,  2,
        0, 13,  0,  2,  1,  1,  1,  1, 13,  0,  0,  0,  0,  0,  2,  1,  1,
        1, 13])

In [30]:
att.expected_utility(sol)

5301.0

In [33]:
len(opt_s)

53

In [32]:
(A[0] == np.argmax(sol, axis=1)).sum()

53

In [34]:
A[0]

array([ 80, 182,  82, 170, 146, 161, 281,  62, 187, 172, 268,  69, 279,
        47, 172, 172, 281, 172, 183, 281, 114,  69, 281, 231, 142,  56,
       116, 281,  56, 231, 142,  82, 170, 146, 161, 281, 131, 276, 121,
        22, 121, 121, 281,  68, 244, 232,  42, 149, 105, 265,  14, 142,
       281])

In [35]:
np.argmax(sol, axis=1)

array([ 80, 182,  82, 170, 146, 161, 281,  62, 187, 172, 268,  69, 279,
        47, 172, 172, 281, 172, 183, 281, 114,  69, 281, 231, 142,  56,
       116, 281,  56, 231, 142,  82, 170, 146, 161, 281, 131, 276, 121,
        22, 121, 121, 281,  68, 244, 232,  42, 149, 105, 265,  14, 142,
       281])