In [1]:
import numpy as np
from hmm_utils import HMM
from params import *

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import random

#some other libraries
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from typing import List

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/roi.naveiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Perprocessing

In [2]:
data = pd.read_csv("data/ner.csv", encoding = "latin1")
data = data.fillna(method="ffill")
data = data.rename(columns={'Sentence #': 'sentence'})
data.head(5)

  data = data.fillna(method="ffill")


Unnamed: 0,sentence,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
def pre_processing(text_column):
    # lowercase all text in the column
    text_column = text_column.str.lower()

    # replacing numbers with NUM token
    text_column = text_column.str.replace(r'\d+', 'NUM')

    # removing stopwords
    stop_words = set(stopwords.words('english'))
    text_column = text_column.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    return text_column

data_pre_precessed = pre_processing(data.Word)
#creating new dataframe with preprocessed word as a column
data_processed = data
data_processed['Word'] = data_pre_precessed

#removing the rows where word is empty
data_processed = data_processed[(data_processed['Word'] != '') | (data_processed['Word'].isna())]

In [4]:
tags = list(set(data.POS.values))  # Unique POS tags in the dataset
words = list(set(data.Word.values))  # Unique words in the dataset
len(tags), len(words)

words1 = list(set(data_processed.Word.values))  # Unique words in the dataset
len(words1)

31682

# Select most common words 

In [5]:
# Most common words
N_w = 300
common_words = data_processed['Word'] .value_counts().sort_values(ascending=False)[:N_w].index
data_reduced = data_processed[data_processed['Word'].isin(common_words)]

tags = list(set(data_reduced.POS.values))  # Unique POS tags in the dataset
words = list(set(data_reduced.Word.values))  # Unique words in the dataset
len(tags), len(words)


(29, 300)

In [61]:
words = list(set(data_reduced.Word.values))
# Convert words and tags into numbers
word2id = {w: i for i, w in enumerate(words)}
tag2id = {t: i for i, t in enumerate(tags)}
id2tag = {i: t for i, t in enumerate(tags)}
len(tags), len(words)
id2word = {}
for key in word2id:
    id2word[word2id[key]] = key


In [48]:
def seq2word(X):
    l = []
    for i in range(len(X)):
        l.append(id2word[X[i]])
    return l

## Create HMM manually

In [7]:
count_tags = dict(data_reduced.POS.value_counts())  # Total number of POS tags in the dataset
# Now let's create the tags to words count
count_tags_to_words = data_reduced.groupby(['POS']).apply(
    lambda grp: grp.groupby('Word')['POS'].count().to_dict()).to_dict()
# We shall also collect the counts for the first tags in the sentence
count_init_tags = dict(data_reduced.groupby('sentence').first().POS.value_counts())

# Create a mapping that stores the frequency of transitions in tags to it's next tags
count_tags_to_next_tags = np.zeros((len(tags), len(tags)), dtype=int)
sentences = list(data_reduced.sentence)
pos = list(data_reduced.POS)
for i in tqdm(range(len(sentences)), position=0, leave=True):
    if (i > 0) and (sentences[i] == sentences[i - 1]):
        prevtagid = tag2id[pos[i - 1]]
        nexttagid = tag2id[pos[i]]
        count_tags_to_next_tags[prevtagid][nexttagid] += 1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 329086/329086 [00:00<00:00, 1619516.90it/s]


In [8]:
startprob = np.zeros((len(tags),))
transmat = np.zeros((len(tags), len(tags)))
emissionprob = np.zeros((len(tags), len(words)))
num_sentences = sum(count_init_tags.values())
sum_tags_to_next_tags = np.sum(count_tags_to_next_tags, axis=1)
for tag, tagid in tqdm(tag2id.items(), position=0, leave=True):
    floatCountTag = float(count_tags.get(tag, 0))
    startprob[tagid] = count_init_tags.get(tag, 0) / num_sentences
    for word, wordid in word2id.items():
        emissionprob[tagid][wordid] = count_tags_to_words.get(tag, {}).get(word, 0) / floatCountTag
    for tag2, tagid2 in tag2id.items():
        transmat[tagid][tagid2] = count_tags_to_next_tags[tagid][tagid2] / sum_tags_to_next_tags[tagid]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 8311.80it/s]


# Build HMM

Include cutoff for probabilities equal to 0

In [9]:
cutoff = 0.001
startprob = startprob +  cutoff
startprob = startprob/ np.sum(startprob)
##
transmat =  transmat + cutoff
transmat = transmat / np.sum(transmat, axis=1)
##
emissionprob =  emissionprob + cutoff
emissionprob = emissionprob / np.sum(emissionprob, axis=1).reshape(-1,1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [10]:
hmm_n = HMM(len(tags), len(words))
hmm_n.startprob_ = startprob
hmm_n.transmat_ = transmat
hmm_n.emissionprob_ = emissionprob

# Attack HMM - APS

In [11]:
from solvers.aps_gibbs_class import aps_gibbs
from attackers.decoding_attacker import dec_attacker

In [12]:
T  = 53
n_obs = 300
n_hidden = 29
w1 = 0.0
w2 = 100.0 
k_value = 1000000.0
cool = np.arange(500,501, 1)
seq = 22*np.ones(T).astype(int)
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]

###
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]
    
A =  X.astype(int)

_, y_pred = hmm_n.nu(A[0])
y_pred

y = np.zeros(len(data_reduced.POS[:T]))
for i in range(len(data_reduced.POS[:T])):
    y[i] = tag2id[data_reduced.POS[:T].iloc[i]]
###

A =  X.astype(int)

In [13]:
rho_probs = np.ones(n_obs)
att = dec_attacker(hmm_n.startprob_ , hmm_n.transmat_, hmm_n.emissionprob_, rho_probs,
         A.T, w1, w2, seq, k_value)

In [14]:
find_sol = aps_gibbs(att, cool, burnin=0.1, verbose=True)
sol, samples = find_sol.iterate(simulation_seconds=None)

Percentage completed: 0.0
Current state [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0
1


KeyboardInterrupt: 

In [None]:
attack_obs = att.attack_X(np.ones_like(sol), sol)
attack_obs = attack_obs.squeeze().astype(int)
V, seq = hmm_n.nu(attack_obs)
probs, opt_s = hmm_n.nu(seq)

In [None]:
opt_s

In [None]:
np.argmax(sol, axis=1)

In [None]:
A[0] == np.argmax(sol, axis=1)

In [48]:
#word2id

# Attack HMM - RS

In [71]:
from solvers.nn_RS.nn_RS import nn_RS
from attackers.decoding_attacker import dec_attacker

In [72]:
T  = 53
n_obs = 300
n_hidden = 29
w1 = 3.0
w2 = 10.0 
k_value = 1000000.0
seq = 17*np.ones(T).astype(int)
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]

###
X = np.zeros( (1, len(data_reduced.Word.values[:T])) )
for i in range(len(data_reduced.Word.values[:T])):
    X[0, i] = word2id[data_reduced.Word.values[i]]
    
A =  X.astype(int)

_, y_pred = hmm_n.nu(A[0])
y_pred

y = np.zeros(len(data_reduced.POS[:T]))
for i in range(len(data_reduced.POS[:T])):
    y[i] = tag2id[data_reduced.POS[:T].iloc[i]]
###

A =  X.astype(int)

In [73]:
rho_probs = np.ones(n_obs)
att = dec_attacker(hmm_n.startprob_ , hmm_n.transmat_, hmm_n.emissionprob_, rho_probs,
         A.T, w1, w2, seq, k_value)

In [None]:
find_sol = nn_RS(att, "SA", RS_iters=10000, mcts_iters=10, sa_iters=10, eps=0.05, lr=0.005, verbose=True)
sol, samples = find_sol.iterate(simulation_seconds=None)

Percentage completed: 0.0
Best value: 
0.0


  return F.mse_loss(input, target, reduction=self.reduction)


Percentage completed: 0.5
Best value: 
194.0943434087174
Percentage completed: 1.0
Best value: 
197.3906491961464
Percentage completed: 1.5
Best value: 
197.3906491961464
Percentage completed: 2.0
Best value: 
197.3906491961464
Percentage completed: 2.5
Best value: 
200.93209820919256
Percentage completed: 3.0
Best value: 
200.93209820919256
Percentage completed: 3.5
Best value: 
200.93209820919256
Percentage completed: 4.0
Best value: 
200.93209820919256
Percentage completed: 4.5
Best value: 
200.93209820919256
Percentage completed: 5.0
Best value: 
203.0032315971511
Percentage completed: 5.5
Best value: 
203.0032315971511
Percentage completed: 6.0
Best value: 
203.0032315971511
Percentage completed: 6.5
Best value: 
203.0032315971511
Percentage completed: 7.0
Best value: 
205.25532986588678
Percentage completed: 7.5
Best value: 
205.25532986588678
Percentage completed: 8.0
Best value: 
205.25532986588678
Percentage completed: 8.5
Best value: 
210.3744325762876
Percentage completed: 9

In [31]:
attack_obs = att.attack_X(np.ones_like(sol), sol)
attack_obs = attack_obs.squeeze().astype(int)
V, seq = hmm_n.nu(attack_obs)
probs, opt_s = hmm_n.nu(seq)

In [32]:
opt_s

array([17, 17,  2, 17, 13, 12, 12,  2, 17, 17, 17, 17, 12, 12, 17,  2, 12,
       17, 12,  2, 17, 15,  2, 17, 12,  4,  2, 17,  4, 17,  4, 17, 13, 17,
       12, 17,  2, 12, 17, 12,  2,  2, 12,  4, 20, 17, 17,  4, 17,  4, 12,
       17, 17])

In [25]:
tag2id

{'CC': 0,
 'VBD': 1,
 'NN': 2,
 '``': 3,
 'NNP': 4,
 'RBR': 5,
 '$': 6,
 'LRB': 7,
 '.': 8,
 'NNPS': 9,
 'MD': 10,
 'JJR': 11,
 'JJ': 12,
 'VBN': 13,
 ',': 14,
 'VBP': 15,
 'RB': 16,
 'NNS': 17,
 'POS': 18,
 'IN': 19,
 'VB': 20,
 'RRB': 21,
 'CD': 22,
 'VBG': 23,
 ':': 24,
 'PRP': 25,
 'JJS': 26,
 'DT': 27,
 'VBZ': 28}

In [33]:
y

array([17.,  2.,  4., 12., 17.,  2.,  8., 17., 13.,  3.,  4.,  2., 22.,
        2.,  3.,  3.,  8.,  3.,  2.,  8., 17.,  2.,  8.,  4., 18.,  4.,
       12.,  8.,  2.,  4., 18.,  4., 12., 17.,  2.,  8.,  2.,  1., 14.,
       23., 14., 14.,  8.,  4.,  4.,  4., 12.,  2., 17.,  4.,  4., 18.,
        8.])

In [34]:
att.expected_utility(sol)

189.48903782265936

In [69]:
seq2word(np.argmax(sol, axis=1))

['deal',
 'court',
 'several',
 'afghanistan',
 'accused',
 'officials',
 'countries',
 'iranian',
 'washington',
 'turkey',
 'percent',
 'chavez',
 'one',
 'nations',
 'venezuela',
 'pakistani',
 'five',
 'fighting',
 'elections',
 'early',
 '(',
 'told',
 'nuclear',
 'town',
 'militants',
 'says',
 'russian',
 'fire',
 'confirmed',
 'percent',
 'left',
 'office',
 'found',
 'city',
 'media',
 'growth',
 'could',
 'officials',
 'growth',
 '10',
 'several',
 'prime',
 'attacks',
 'million',
 'make',
 'opposition',
 'force',
 "'",
 'town',
 'called',
 'reports',
 'india',
 'taleban']

In [68]:
seq2word(A[0])

['thousands',
 'war',
 'iraq',
 'british',
 'troops',
 'country',
 '.',
 'soldiers',
 'killed',
 '"',
 'bush',
 'number',
 'one',
 'terrorist',
 '"',
 '"',
 '.',
 '"',
 'parliament',
 '.',
 'police',
 'number',
 '.',
 'britain',
 "'s",
 'party',
 'southern',
 '.',
 'party',
 'britain',
 "'s",
 'iraq',
 'british',
 'troops',
 'country',
 '.',
 'march',
 'came',
 ',',
 'including',
 ',',
 ',',
 '.',
 'international',
 'energy',
 'agency',
 'second',
 'day',
 'talks',
 'wednesday',
 'iran',
 "'s",
 '.']

In [38]:
np.argmax(sol, axis=1)

array([243, 286, 113,  31, 200,  23, 146, 141,  83,   9,  47, 282, 222,
        39, 119,  86, 220, 250, 290, 160, 136,  35, 235, 261, 166, 102,
       132, 241, 143,  47, 228, 216, 120,  10,  56, 189, 100,  23, 189,
       112, 113, 124, 144, 234,  20, 205, 242,  48, 261, 105,  53, 155,
       224])

In [70]:
A[0] == np.argmax(sol, axis=1)

array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])

In [23]:
A[0]

array([116, 193,  98,  16, 171, 288, 108, 252,  13,  57, 215, 266, 222,
       258,  57,  57, 108,  57, 186, 108, 292, 266, 108,  87,  44, 213,
        51, 108, 213,  87,  44,  98,  16, 171, 288, 108,  84, 236, 253,
       153, 253, 253, 108,  34, 231,  69,  26, 172, 275,  55, 272,  44,
       108])

In [35]:
np.argmax(sol, axis=1)

array([ 80, 182,  82, 170, 146, 161, 281,  62, 187, 172, 268,  69, 279,
        47, 172, 172, 281, 172, 183, 281, 114,  69, 281, 231, 142,  56,
       116, 281,  56, 231, 142,  82, 170, 146, 161, 281, 131, 276, 121,
        22, 121, 121, 281,  68, 244, 232,  42, 149, 105, 265,  14, 142,
       281])