In [2]:
import pandas as pd
import os
from collections import Counter, defaultdict
import json
import itertools
import numpy as np
# load config
with open('config.json', 'r') as f:
    config = json.load(f)
cwd = os.getcwd()
os.chdir(config['REPODIR'])
import Utils as U
from Corpus import Corpus
os.chdir(cwd)

import pickle as pkl

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate

np.random.seed(42)
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from tqdm.auto import tqdm, trange
from collections import Counter
import random
from torch import optim

from torch.nn.utils.rnn import pad_sequence

import pandas as pd

from torch.utils.tensorboard import SummaryWriter
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import seaborn as sns

# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer

# Attention plotting
import matplotlib.pyplot as plt

from AuthorClassifier_We import DocumentAttentionClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
suffix = "tiny"
# +
with open('embedding_data_final.pkl', 'rb') as f:
    embed = pkl.load(f)

embed_df = pd.DataFrame(embed)
# -


embed_df = pd.DataFrame(embed)
data = U.load_file(f'data_vF_{suffix}.pkl', 'pkl', config['DATADIR'])

import gensim.downloader as api
gensim_model = api.load("glove-wiki-gigaword-300") 
# -

embedding_weights = gensim_model.vectors
# +


embedding_dim = gensim_model.vector_size
num_embeddings = embedding_weights.shape[0]

embedding_layer = nn.Embedding(num_embeddings, embedding_dim)
embedding_layer.weight.data.copy_(torch.from_numpy(embedding_weights))
# -

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

# +
data_df = pd.DataFrame(data)
data_df.head()

data_df['joined_text'] = data_df['text'].apply(' '.join)
# -

text = data_df['joined_text'].to_numpy()
text_clean = [tokenizer.tokenize(t.lower()) for t in text]

word2index = {token: token_index for token_index, token in enumerate(gensim_model.index_to_key)}
index2word = {val:key for key, val in word2index.items()}

tokenized_sequences = [[word2index[word] for word in text if word in word2index ] for text in text_clean]

lengths = [len(seq) for seq in tokenized_sequences]
max_len = max(lengths)
min_len = min(lengths)



# +
test = tokenized_sequences[0]
# torch.IntTensor(tokenized_sequences)

data_df['sequences'] = tokenized_sequences

# +
# embed_df = embed_df.dropna(subset = ['author_id', 'sent_embedding'])

# +

n_classes = data_df.author_id.nunique()

from sklearn.preprocessing import OneHotEncoder
# label_encoder=OneHotEncoder(sparse_output=False)
label_encoder=OneHotEncoder()

# -

y= label_encoder.fit_transform(data_df['author_id'].to_numpy(dtype='int32').reshape(-1,1))
y = y.toarray()
X = data_df['sequences'].apply(np.array)

X = [(toop[0], toop[1]) for toop in list(zip(X, lengths))]

X[0]


def pad_with(vector, pad_width, iaxis, kwargs):
    pad_value = kwargs.get('padder', 0)
    vector[:pad_width[0]] = pad_value
    vector[-pad_width[1]:] = pad_value


# +
MAX_LENGTH = 128

X_trunc = []

for x in X:
    deficit = MAX_LENGTH - x[1]
    if deficit > 0:
        X_trunc.append(x[0])
    else:
        X_trunc.append(x[0][:MAX_LENGTH])

# -

X_trunc.sort(key=lambda x: len(x),reverse = True)

len(X_trunc[-1])

# +
torch.tensor(X_trunc[0]).shape

tensor_list = [torch.tensor(x).unsqueeze(1) for x in X_trunc]
# -

tensor_list[3].shape

X_t = pad_sequence([torch.tensor(x) for x in X_trunc], batch_first=True)


[xt.shape for xt in X_t]

# +
test_size = 0.2
val_size = 0.2
random_state =699

X_train, X_test, y_train, y_test = U.train_test_split(X_t, y, test_size=test_size,
                                                        random_state=random_state,
                                                        stratify=y)

# Split train set into train and validation sets
X_train, X_val, y_train, y_val = U.train_test_split(X_train, y_train, test_size=val_size/(1-test_size),
                                                    random_state=random_state,
                                                    stratify=y_train)
# -

In [4]:
model = DocumentAttentionClassifier()

model.load_state_dict(torch.load("path"))

TypeError: load_state_dict() missing 1 required positional argument: 'state_dict'

In [4]:
embed_df = embed_df.merge(data_df, how= 'left', left_on= 'passage_key', right_on = 'passage_key')
embed_df = embed_df.dropna(subset=['author_id', 'sent_embedding'])

In [17]:
def get_label_and_weights(text, kwargs):
    '''
    Classifies the text (requires tokenizing, etc.) and returns (1) the classification label, 
    (2) the tokenized words in the model's vocabulary, 
    and (3) the attention weights over the in-vocab tokens as a numpy array. Note that the
    attention weights will be a matrix, depending on how many heads were used in training.
    '''

    clean_text = tokenizer.tokenize(text)
    text_ids = [word_to_index.get(x,unk_token_id) for x in clean_text]
    tokens = [index_to_word[idx] for idx in text_ids]

    with torch.no_grad():
        model.eval()
        tensor_load = DataLoader([np.array(text_ids)], batch_size=1, shuffle=False, collate_fn=collate_func, **kwargs)
        tensor_ids = next(iter(tensor_load))
        print(tensor_ids)
        output, weights = model(tensor_ids[0])
        output, weights = output.cpu().numpy(), weights.cpu().numpy()

        output = np.array([1 if output >= 0.5 else 0], dtype = int)

    return output, tokens, weights


# ### Helper functions for visualization

def visualize_attention(words, attention_weights):
    '''
    Makes a heatmap figure that visualizes the attention weights for an item.
    Attention weights should be a numpy array that has the shape (num_words, num_heads)
    '''
    fig, ax = plt.subplots() 
    # Rescale image size based on the input length
    fig.set_size_inches((len(words), 4))    
    im = ax.imshow(attention_weights.T)

    head_labels = [ 'head-%d' % h for h in range(attention_weights.shape[1])]
    ax.set_xticks(np.arange(len(words))) # , labels=words)
    ax.set_yticks(np.arange(len(head_labels))) #, labels=head_labels)

    # Rotate the word labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    
    # Add the words and axis labels
    ax.set_yticklabels(labels=range(attention_weights.shape[1]), fontsize=16)
    ax.set_ylabel('Attention Head', fontsize=16)
    ax.set_xticklabels(labels=words, fontsize=16)

    # Add a color bar to show probability scaling
    cb = fig.colorbar(im, ax=ax, label='Probability', pad = 0.01)
    cb.ax.tick_params(labelsize=16)
    cb.set_label(label='Probability',size=16)
    fig.tight_layout()
    plt.show()

In [18]:
y_lda_topics_train
y_lda_topics_val

['Short stories',
 'Juvenile fiction',
 'United States',
 'Short stories',
 'Poetry',
 'Short stories',
 'Adventure stories',
 'Juvenile fiction',
 'Poetry',
 'Juvenile fiction',
 'Juvenile fiction',
 'United States',
 'Short stories',
 'Short stories',
 'History',
 'Juvenile fiction',
 'Juvenile fiction',
 'Fiction',
 'Poetry',
 'United States',
 'Poetry',
 '19th century',
 'Short stories',
 'United States',
 'Poetry',
 'Juvenile fiction',
 'Adventure stories',
 'Fiction',
 'Short stories',
 'Juvenile fiction',
 'Fiction',
 'Short stories',
 'Short stories',
 'Juvenile fiction',
 'Short stories',
 'Poetry',
 'Juvenile fiction',
 '19th century',
 'Poetry',
 'Poetry',
 'Translations into English',
 'Fiction',
 'Juvenile fiction',
 'Short stories',
 '19th century',
 'History',
 '19th century',
 'Juvenile fiction',
 'Juvenile fiction',
 'United States',
 'Juvenile fiction',
 'Poetry',
 'Short stories',
 'Translations into English',
 'United States',
 'Poetry',
 'Short stories',
 'Translat

In [1]:
X_train.todense()

NameError: name 'X_train' is not defined

In [20]:


np.mean(train['topic'] == y_lda_topics_train)

0.05738029610312443

In [21]:
labels_val

(array([0, 1, 2, ..., 0, 0, 5]),
 Index(['Fiction', 'Juvenile fiction', 'Description and travel', 'Drama',
        'Short stories', 'Poetry', 'World War', 'Biography',
        'Detective and mystery stories', '19th century',
        'Translations into English', 'History', 'Juvenile literature',
        'Science fiction', 'English', 'Great Britain', 'Children', 'France',
        'Conduct of life', 'Adventure stories', 'American', 'United States',
        'English wit and humor', 'England', 'Social life and customs',
        'Love stories', 'Historical fiction', 'Young women',
        'Personal narratives'],
       dtype='object'))

In [77]:
pred_val

array([0, 0, 0, ..., 0, 0, 0])

In [22]:

# Train a classifier on the LDA topics
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(y_val, labels_val)

# Evaluate performance on the validation set
pred_val = clf.predict(y_val)
acc_val = accuracy_score(labels_val, pred_val)
print('Accuracy on validation set:', acc_val)

# Evaluate performance on the test set
pred_test = clf.predict(y_test)
acc_test = accuracy_score(labels_test, pred_test)
print('Accuracy on test set:', acc_test)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.