In [1]:
import pandas as pd
import numpy as np
import torch
import json

import random
import sys
import os
sys.path.append('../../')
from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the training set: one JSON object per line (JSON Lines format).
# The text contains non-ASCII characters (e.g. typographic apostrophes), so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open('./train_data.jsonl', 'r', encoding='utf-8') as f:
    # Skip blank lines (such as a stray empty line at the end of the file),
    # which would otherwise make json.loads raise.
    train_data = [json.loads(line) for line in f if line.strip()]

# Preview the first few records only; the full list is large.
train_data[:5]

[{'token': 'Title: The New York Times Daily Crossword Puzzles: Thursday, Volume 1. Text: Monday’s Crosswords Do with EaseTuesday’s Crosswords Not a BreezeWednesday’s Crosswords Harder StillThursday’s Crosswords Take Real SkillFriday’s Crosswords — You’ve Come This Far…Saturday’s Crosswords — You’re a Star!For millions of people, the New York Times crossword puzzles are as essential to each day as the first cup of coffee in the morning. Now, for the first time ever, these premier puzzles are available in six clever installments. With each day of the week, the puzzles increase gradually in skill level; Monday’s the easiest, but Saturday’s sure to challenge! Push your mental muscles a little harder each day with America’s favorite sophisticated — and fun — pastime: the New York Times crossword puzzles!The legendary Eugene T. Maleska was crossword editor of The New York Times from 1977 to 1993.',
  'label': ['Nonfiction', 'Games']},
 {'token': 'Title: Creatures of the Night (Second Edition

In [4]:
# Load the label-name -> integer-id mapping that was saved with torch.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
with open('./value_dict.pt', mode='rb') as label_map_file:
    value_dict = torch.load(label_map_file)

# Display the mapping.
value_dict



{'Children’s Books': 0,
 'Poetry': 1,
 'Fiction': 2,
 'Nonfiction': 3,
 'Teen & Young Adult': 4,
 'Classics': 5,
 'Humor': 6,
 'Children’s Middle Grade Books': 7,
 'Step Into Reading': 8,
 'Fantasy': 9,
 'Spiritual Fiction': 10,
 'Literary Fiction': 11,
 'Gothic & Horror': 12,
 'Mystery & Suspense': 13,
 'Romance': 14,
 'Science Fiction': 15,
 'Women’s Fiction': 16,
 'Historical Fiction': 17,
 'Military Fiction': 18,
 'Western Fiction': 19,
 'Paranormal Fiction': 20,
 'Graphic Novels & Manga': 21,
 'Politics': 22,
 'Pets': 23,
 'Cooking': 24,
 'Parenting': 25,
 'Psychology': 26,
 'Sports': 27,
 'Travel': 28,
 'Games': 29,
 'History': 30,
 'Popular Science': 31,
 'Health & Fitness': 32,
 'Religion & Philosophy': 33,
 'Self-Improvement': 34,
 'Reference': 35,
 'Crafts, Home & Garden': 36,
 'Arts & Entertainment': 37,
 'Business': 38,
 'Biography & Memoir': 39,
 'Teen & Young Adult Mystery & Suspense': 40,
 'Teen & Young Adult Historical Fiction': 41,
 'Teen & Young Adult Nonfiction': 42,

In [7]:
# get_hierarchy_info is not defined in this notebook -- presumably it comes from
# the star import of `utils`; verify there. It yields four values for the
# taxonomy file.
hierarchy_info = get_hierarchy_info('bgc.taxonomy')
hiera, _label_dict, r_hiera, depths = hierarchy_info

# Show the first of the four: a mapping from a parent label to its child labels.
hiera

defaultdict(set,
            {'Root': {'Children’s Books',
              'Classics',
              'Fiction',
              'Humor',
              'Nonfiction',
              'Poetry',
              'Teen & Young Adult'},
             'Children’s Books': {'Children’s Middle Grade Books',
              'Step Into Reading'},
             'Fiction': {'Fantasy',
              'Gothic & Horror',
              'Graphic Novels & Manga',
              'Historical Fiction',
              'Literary Fiction',
              'Military Fiction',
              'Mystery & Suspense',
              'Paranormal Fiction',
              'Romance',
              'Science Fiction',
              'Spiritual Fiction',
              'Western Fiction',
              'Women’s Fiction'},
             'Nonfiction': {'Arts & Entertainment',
              'Biography & Memoir',
              'Business',
              'Cooking',
              'Crafts, Home & Garden',
              'Games',
              'Health & Fitne

In [8]:
# Number of records in train_data.
len(train_data)

58715

In [16]:
def hamming_distance_by_matrix(labels):
    """Compute all pairwise Hamming distances between the rows of ``labels``.

    For 0/1-valued rows, entry ``[i, j]`` of the result counts the positions
    where row ``i`` and row ``j`` differ.

    Args:
        labels: 2-D tensor with one label vector per row.

    Returns:
        Square tensor with one row and one column per input row.
    """
    complement = 1 - labels
    # Positions set in row i but not in row j.
    set_only_in_row = torch.matmul(labels, complement.T)
    # Positions set in row j but not in row i.
    set_only_in_col = torch.matmul(complement, labels.T)
    return set_only_in_row + set_only_in_col

In [14]:
# Draw a reproducible random batch of 80 examples from train_data.
random.seed(3)
batch_data = random.sample(train_data, 80)

# Translate every example's label names into their integer ids.
batch_labels = [
    [value_dict[name] for name in example['label']]
    for example in batch_data
]

# Build the multi-hot label matrix: one row per example, one column per entry
# of value_dict, with a 1 at every label id the example carries.
one_hot_labels = torch.zeros(len(batch_labels), len(value_dict))
for row, label_ids in enumerate(batch_labels):
    one_hot_labels[row, torch.tensor(label_ids, dtype=torch.long)] = 1
one_hot_labels.shape

torch.Size([80, 146])

In [29]:
from collections import Counter

# Pairwise Hamming distances between all sampled label vectors (square matrix).
hamming_dist = hamming_distance_by_matrix(one_hot_labels)

# Keep only the entries strictly above the diagonal so that every unordered
# pair (i, j) with i < j is counted exactly once. Selecting them with a mask --
# instead of zeroing the rest of the matrix and afterwards subtracting the
# N * (N + 1) / 2 artificial zeros from the tally -- keeps the count for
# distance 0 exact without any manual correction.
pair_mask = torch.triu(torch.ones_like(hamming_dist), diagonal=1) == 1
flat_upper_tri = hamming_dist[pair_mask]

# Map each distance to the number of pairs at that distance. A distance that
# never occurs (including 0) simply has no entry.
freq_list = dict(Counter(flat_upper_tri.tolist()))

# Most frequent distances first.
sorted(freq_list.items(), key=lambda x: x[1], reverse=True)

[(5.0, 636),
 (4.0, 578),
 (6.0, 461),
 (7.0, 398),
 (3.0, 320),
 (8.0, 273),
 (9.0, 114),
 (1.0, 100),
 (0.0, 92),
 (2.0, 91),
 (10.0, 61),
 (11.0, 31),
 (12.0, 5)]

In [31]:
# Scratch check: adding a Python int to a float64 tensor keeps the float64 dtype.
1 + torch.tensor(19.0, dtype=torch.float64)

tensor(20., dtype=torch.float64)

In [35]:
# Scratch check of view(): turn a 1-D tensor of three elements into a column
# and compare the two shapes.
a = torch.from_numpy(np.array([1, 2, 3]))
b = a.view(a.numel(), 1)
print(b.shape, a.shape)

torch.Size([3, 1]) torch.Size([3])


In [33]:
# Scratch check: view(1, -1) lays the elements of `a` out as a single row.
a.view(1, -1).shape

torch.Size([1, 3])

In [5]:
a = np.array([1, 2, 3])

# exp(1 / (max(a) - a_i)) for every element, computed on the whole array at once.
# NOTE(review): the gap is 0 for the maximal element, so the division yields
# inf (with a runtime warning) and the last entry of the result is inf --
# confirm this is intended.
inverse_gap = 1 / (a.max() - a)
torch.exp(torch.from_numpy(inverse_gap))

  This is separate from the ipykernel package so we can avoid doing imports until


tensor([1.6487, 2.7183,    inf], dtype=torch.float64)