In [1]:
from gensim.models.word2vec import Word2Vec

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import numpy as np
import pandas as pd

import torch
from torch.nn import *
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, default_collate
from torch.utils.tensorboard import SummaryWriter

import time
import os

In [2]:
class NewsDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()

        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [14]:
class NewsEmbeddingNetwork(Module):
    def __init__(self, input_dimension, hidden_dimension, latent_dimension, id_dimension):
        super().__init__()

        def init_parameters(layer, scale=1):
            torch.nn.init.xavier_normal_(layer.weight, gain=scale)
            torch.nn.init.zeros_(layer.bias)

        self.fc1 = Linear(in_features=input_dimension, out_features=hidden_dimension)
        init_parameters(self.fc1)
        self.fc2 = Linear(in_features=hidden_dimension, out_features=latent_dimension)
        init_parameters(self.fc2)
        self.fc3 = Linear(in_features=latent_dimension, out_features=hidden_dimension)
        init_parameters(self.fc3)
        self.fc4 = Linear(in_features=hidden_dimension, out_features=id_dimension)
        init_parameters(self.fc4)

        self.relu = ReLU()
        self.softmax = Softmax(dim=-1)

    def forward(self, input_vector):
        latent = self.relu(self.fc1(input_vector))
        latent = self.relu(self.fc2(latent))
        latent2 = self.softmax(self.fc3(latent))
        id = self.softmax(self.fc4(latent2))

        return latent, id

In [4]:
seed=24
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic=True

In [5]:
# load news data
column_names = ['News_id', 'Category', 'Subcategory', 'Title', 'Abstract', 'URL', 'Title_Entities', 'Abstract_Entities']
news = pd.read_csv('./data/train_news.tsv', sep='\t', names=column_names)
news2 = pd.read_csv('./data/test_news.tsv', sep='\t', names=column_names)
news = pd.concat([news, news2], axis=0)
print('data loaded')

data loaded


In [7]:
# train word to vector model with title
embedding_dimension = 100
corpus = [x.split() for x in news['Title']]
model_w2v = Word2Vec(corpus, vector_size=embedding_dimension, min_count=1, workers=15)
print('word2vec model trained')

word2vec model trained


In [8]:
# map each news to mean of title w2v
news_vector = np.empty((news.shape[0], embedding_dimension), dtype=np.float32)
pointer = 0
for i in corpus:
    vec_list = []
    for j in i:
        vec_list.append(model_w2v.wv[j])
    news_vector[pointer] = np.mean(vec_list, axis=0)
    pointer += 1

# news_vector = np.load('./data/news_embedding/news_vector.npy')
news_vector_tensor = torch.FloatTensor(news_vector)

print('news mapped with word2vec model (input)')

news mapped with word2vec model (input)


In [9]:
news_category = LabelEncoder().fit_transform(news['Category'].values)

news_category_tensor = torch.LongTensor(news_category)

print('news category (target) encoded')

news category (target) encoded


In [10]:
news_dataset = NewsDataset(news_vector_tensor, news_category_tensor)

batch_size=128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_loader = DataLoader(news_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=lambda x: tuple(x_.to(device) for x_ in default_collate(x)))

output_dimension = len(news['Category'].unique())

In [15]:
writer = SummaryWriter(log_dir=f'./runs/news_embedding_{time.strftime("%Y%m%d-%H%M%S")}')

learning_rate = 0.005

hidden_dimension = 256
latent_dimension = 256

network = NewsEmbeddingNetwork(input_dimension=embedding_dimension,
                               hidden_dimension=hidden_dimension,
                               latent_dimension=latent_dimension,
                               id_dimension=output_dimension).to(device)

optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
criterion = CrossEntropyLoss()

start_time = time.time()
for epoch in range(100):
    for i, (x, y) in enumerate(train_loader):
        latent, id = network.forward(x)
        loss = criterion(id, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    writer.add_scalar('Loss', loss, epoch)

    with torch.no_grad():
        _, id = network.forward(news_vector_tensor.to(device))
        id = id.cpu().numpy()
        predict = np.argmax(id, axis=1)
        label = news_category
        accuracy = accuracy_score(label, predict)
        f1 = f1_score(label, predict, average='weighted')
        auc = roc_auc_score(label, id, multi_class='ovr')


    writer.add_scalar('Accuracy', accuracy, epoch)
    writer.add_scalar('F1', f1, epoch)
    writer.add_scalar('AUC', auc, epoch)

print(f'news embedding train time: {time.time() - start_time} seconds')
writer.close()

Exception in thread Thread-167:
Traceback (most recent call last):
  File "c:\Users\ccis229c\Anaconda3\envs\ml\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Users\ccis229c\AppData\Roaming\Python\Python310\site-packages\tensorboard\summary\writer\event_file_writer.py", line 233, in run
    self._record_writer.write(data)
  File "C:\Users\ccis229c\AppData\Roaming\Python\Python310\site-packages\tensorboard\summary\writer\record_writer.py", line 40, in write
    self._writer.write(header + header_crc + data + footer_crc)
  File "C:\Users\ccis229c\AppData\Roaming\Python\Python310\site-packages\tensorboard\compat\tensorflow_stub\io\gfile.py", line 766, in write
    self.fs.append(self.filename, file_content, self.binary_mode)
  File "C:\Users\ccis229c\AppData\Roaming\Python\Python310\site-packages\tensorboard\compat\tensorflow_stub\io\gfile.py", line 160, in append
    self._write(filename, file_content, "ab" if binary_mode else "a")
  File "C:\Users\ccis229c\Ap

In [None]:
with torch.no_grad():
    _, id = network.forward(news_vector_tensor.to(device))
    id = id.cpu().numpy()
    predict = np.argmax(id, axis=1)
    print(np.unique(predict))

[13]


In [11]:
if not os.path.exists('./model/'):
    os.makedirs('./model/')
torch.save(network.state_dict(), 
           f'./model/news_embedding_{time.strftime("%Y%m%d-%H%M%S")}_{auc:.2f}.pth')

In [13]:
latent, _ = network.forward(news_vector_tensor.to(device))
latent = latent.cpu().detach().numpy()

if not os.path.exists('./data/news_embedding/'):
    os.makedirs('./data/news_embedding/')
np.save('./data/news_embedding/news_vector.npy', news_vector)
np.save('./data/news_embedding/news_embedding_vector.npy', latent)