In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import make_pipeline

from scipy import sparse

import numpy as np
import math
import pickle
from operator import itemgetter

from game import Game, Minus_Game, Minus_Text_Game
from agent import Agent, Minus_Agent

## Load and prepare Dataset

In [2]:
# Binary task: posts from the atheism and christianity newsgroups only.
categories = ['alt.atheism', 'soc.religion.christian']

# Headers, footers and quoted replies are stripped from both splits.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [3]:
# Convert the raw text to TF-IDF vectors. The vectorizer is fitted on the
# training split only and then reused, unchanged, to transform the test split.
# NOTE(review): the pickled model loaded later presumably expects exactly this
# feature space — confirm it was trained with an identically fitted vectorizer.
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

## Load Model

In [4]:
# Load the pre-trained classifier from disk.
# SECURITY: unpickling can execute arbitrary code — only load model files that
# come from a trusted source.
filename = './models/newsgroup_model.sav'
with open(filename, 'rb') as model_file:  # the context manager closes the handle
    model = pickle.load(model_file)

In [5]:
# Predicted class index for every document in the test split.
pred = model.predict(vectors_test)

In [6]:
# Macro-averaged F1 of the predictions against the true test labels.
metrics.f1_score(newsgroups_test.target, pred, average='macro')

0.7948197892406187

In [7]:
# Display the predicted labels for the whole test split.
pred

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,

In [8]:
# Dense ndarray view of the sparse test matrix (for inspection only).
# toarray() returns an ndarray directly, without the intermediate np.matrix
# that todense() would create.
vectors_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Per-class probabilities for every test document (one row per document).
model.predict_proba(vectors_test)

array([[4.79110000e-01, 5.20890000e-01],
       [3.27028433e-01, 6.72971567e-01],
       [9.99986356e-01, 1.36444254e-05],
       ...,
       [2.81025943e-01, 7.18974057e-01],
       [4.90392825e-04, 9.99509607e-01],
       [1.15744913e-02, 9.88425509e-01]])

In [10]:
# Predicted class for the first test document only.
model.predict(vectors_test[0])

array([1], dtype=int64)

In [11]:
# Index of the most probable class for the first test document.
np.argmax(model.predict_proba(vectors_test[0]))

1

In [12]:
# Probability assigned to class 0 for the first test document
# (row 0 of the single-row result, column 0).
model.predict_proba(vectors_test[0])[0, 0]

0.4791099996266649

In [13]:
# Chain the already fitted vectorizer and the loaded classifier so that
# predictions can be requested directly on raw text.
model_pipe = make_pipeline(vectorizer, model)

## Explain a single prediction

In [14]:
# Sanity check: the pipeline applied to raw text should give the same
# probabilities as the classifier applied to the pre-computed vectors.
model_pipe.predict_proba(newsgroups_test.data)

array([[4.79110000e-01, 5.20890000e-01],
       [3.27028433e-01, 6.72971567e-01],
       [9.99986356e-01, 1.36444254e-05],
       ...,
       [2.81025943e-01, 7.18974057e-01],
       [4.90392825e-04, 9.99509607e-01],
       [1.15744913e-02, 9.88425509e-01]])

In [15]:
# Number of documents in the test split.
len(newsgroups_test.data)

717

In [16]:
# Peek at the raw text of test document 2.
newsgroups_test.data[2]

'Maddi: >>\n\n\n\nNo, no, no!  I\'ve already been named by "Killfile" Keith.\nMy nickname is Maddi "Never a Useful Post" Hausmann, and\ndon\'t you DARE forget it, "Half".\n\n\nYou really should quote Ivan Karamazov instead(on a.a), as he was\nthe atheist.\n\n-- \nMaddi Hausmann                       madhaus@netcom.com\nCentigram Communications Corp        San Jose California  408/428-3553'

In [17]:
# Index (into the test split) of the document to explain.
idx = 2

In [18]:
# The sample is the raw text, not its TF-IDF row: the pipeline vectorizes the
# text itself before classifying it.
sample = newsgroups_test.data[idx]

In [19]:
# Display the text that will be explained.
sample

'Maddi: >>\n\n\n\nNo, no, no!  I\'ve already been named by "Killfile" Keith.\nMy nickname is Maddi "Never a Useful Post" Hausmann, and\ndon\'t you DARE forget it, "Half".\n\n\nYou really should quote Ivan Karamazov instead(on a.a), as he was\nthe atheist.\n\n-- \nMaddi Hausmann                       madhaus@netcom.com\nCentigram Communications Corp        San Jose California  408/428-3553'

In [20]:
# Confirm the sample is a plain string (raw text rather than a vector).
type(sample)

str

In [21]:
# Ground-truth class index of the selected document. This name is also read
# as a global inside mask(), so it must be defined before mask() is called.
target_label = newsgroups_test.target[idx]

In [22]:
# Display the true label of the selected document.
target_label

0

In [23]:
# Predicted class for the selected document; the text is wrapped in a list
# because the pipeline is given a batch of documents.
model_pipe.predict([sample])

array([0], dtype=int64)

In [24]:
# Build the explanation game from the raw text, the pipeline's probability
# function and the label of the selected document.
# NOTE(review): argument semantics are inferred from the values passed here —
# confirm against game.Minus_Text_Game.
game = Minus_Text_Game(sample, model_pipe.predict_proba, target_label)

In [25]:
# Number of actions currently available in the game.
len(game.available_actions)

46

In [26]:
# List the available actions.
game.available_actions

[[' Maddi '],
 [' No '],
 [' no '],
 [' ve '],
 [' already '],
 [' been '],
 [' named '],
 [' by '],
 [' Killfile '],
 [' Keith '],
 [' My '],
 [' nickname '],
 [' is '],
 [' Never '],
 [' Useful '],
 [' Post '],
 [' Hausmann '],
 [' and '],
 [' don '],
 [' you '],
 [' DARE '],
 [' forget '],
 [' it '],
 [' Half '],
 [' You '],
 [' really '],
 [' should '],
 [' quote '],
 [' Ivan '],
 [' Karamazov '],
 [' instead '],
 [' on '],
 [' as '],
 [' he '],
 [' was '],
 [' the '],
 [' atheist '],
 [' madhaus '],
 [' netcom '],
 [' com '],
 [' Centigram '],
 [' Communications '],
 [' Corp '],
 [' San '],
 [' Jose '],
 [' California ']]

In [27]:
# Check the type of a single action object.
type(game.available_actions[0])

state.Text_State

In [28]:
# Create the search agent for this game.
# NOTE(review): c is presumably the exploration constant (sqrt(2) is the
# customary UCT value) — confirm in agent.Minus_Agent.
agent = Minus_Agent(game, c=math.sqrt(2))

In [None]:
# Run the agent. Long-running: the recorded log below spans roughly
# 20 minutes and reports progress every 1000 rounds.
# NOTE(review): the meaning of n_edges is not visible here — see agent.Minus_Agent.run.
agent.run(n_edges=1)

2021-09-27 14:55:46,754 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
2021-09-27 14:55:46,756 - agent - INFO - Round:	0
2021-09-27 14:55:46,757 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
2021-09-27 14:56:26,205 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
2021-09-27 14:56:26,205 - agent - INFO - Round:	1000
2021-09-27 14:56:26,206 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
2021-09-27 14:57:05,353 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
2021-09-27 14:57:05,353 - agent - INFO - Round:	2000
2021-09-27 14:57:05,354 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
2021-09-27 14:57:47,258 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
2021-09-27 14:57:47,259 - agent - INFO - Round:	3000
2021-09

2021-09-27 15:15:18,956 - agent - INFO - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


In [None]:
# Unpack the agent's best path into two rankings and two paths.
# NOTE(review): the 0/1 suffixes presumably refer to the two players — confirm.
ranks_0, ranks_1, path_0, path_1 = agent.get_best_path_as_list()

In [None]:
# First ranking; it is later passed to mask() as the explanation, which reads
# each entry as (word, score).
ranks_0

In [None]:
# Second ranking returned by get_best_path_as_list().
ranks_1

In [None]:
def mask(sample, predict, explanation, target=None):
    """Remove the highest-ranked words from ``sample`` until the prediction flips.

    Words are taken from ``explanation`` in descending score order and every
    occurrence of each word is deleted from the text (plain substring
    replacement). Removal stops as soon as the predicted class differs from
    ``target`` or the explanation is exhausted.

    Args:
        sample: The text to perturb.
        predict: Callable taking a list of texts and returning per-class
            scores; only the argmax of its result for a one-element list is used.
        explanation: Sequence of ``(word, score)`` entries. It is not modified.
        target: Class index the prediction has to move away from. Defaults to
            the notebook-level ``target_label`` for backward compatibility.

    Returns:
        ``(n_actions, masked)`` where ``masked`` is the perturbed text and
        ``n_actions`` is the number of words removed before the prediction
        changed, or ``-1`` if it never changed.
    """
    if target is None:
        target = target_label  # fall back to the notebook global

    # Rank a copy so the caller's list survives and the cell stays re-runnable.
    # sorted() is stable, so ties keep their original order — the same order
    # in which repeated max() calls would have picked them.
    ranked = sorted(explanation, key=itemgetter(1), reverse=True)

    masked = sample
    arg = np.argmax(predict([sample]))
    n_actions = 0
    for entry in ranked:
        if arg != target:
            break
        masked = masked.replace(entry[0], "")
        arg = np.argmax(predict([masked]))
        n_actions += 1

    if arg == target:
        return -1, masked
    return n_actions, masked

In [None]:
# Remove words from the sample in ranks_0 score order until the pipeline's
# predicted class changes.
n, masked = mask(sample, model_pipe.predict_proba, ranks_0)

In [None]:
# Number of words removed before the prediction changed (-1: it never changed).
n

In [None]:
# The sample text after masking.
masked

In [None]:
# Class probabilities the pipeline assigns to the masked text.
model_pipe.predict_proba([masked])

In [None]:
# Collect the information of the edges leaving the root node.
# NOTE(review): the result is used like a pandas DataFrame
# (.columns, .shape, .loc) — confirm.
a = agent.root.get_infor_of_edges()
# Relabel the columns with consecutive integers 0..n-1.
a.columns = range(a.shape[1])
# Boolean selector over the columns: True where the row labelled 'N' is positive.
b = a.loc['N',:]>0

In [None]:
# Show only the columns whose 'N' entry is positive.
a.loc[:,b.values]

In [None]:
# Summarise the search tree per player:
#   t0 / t1 - number of nodes (t1 counts every node whose player is not 0)
#   n0 / n1 - nodes whose N (player 0) or N1 (player 1) is at least 100
#   d0 / d1 - nodes at which the game is done
d0 = 0
d1 = 0  # previously started at 1, which over-counted player 1 by one
t0 = 0
t1 = 0
states = []     # not used in this cell; kept in case later cells read it
m_state = None  # not used in this cell; kept in case later cells read it
n0 = 0
n1 = 0
for node in agent.mct.tree:
    if node.player == 0:
        t0 += 1
        if node.N >= 100:
            n0 += 1
        if node.game_is_done:
            d0 += 1
    else:
        t1 += 1
        if node.player == 1:
            # NOTE(review): player 1 is thresholded on N1 while player 0 uses
            # N — confirm both attributes are intended.
            if node.N1 >= 100:
                n1 += 1
            if node.game_is_done:
                d1 += 1
        #print(node.parent_edge.get_winrate())

In [None]:
# One counter per line: node totals, finished-game counts, then threshold counts.
print(t0, t1, d0, d1, n0, n1, sep="\n")