In [1]:
from gensim.models.word2vec import Word2Vec

import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import numpy as np
import pandas as pd

import torch
from torch.nn import Module, Linear, ReLU, Softmax, CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import time
import os
from tqdm import trange

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ccis229c\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()

        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [3]:
class NewsEmbeddingNetwork(Module):
    def __init__(self, input_dimension, hidden_dimension, latent_dimension, output_dimension):
        super().__init__()

        def init_parameters(layer, scale=1):
            torch.nn.init.xavier_normal_(layer.weight, gain=scale)
            torch.nn.init.zeros_(layer.bias)

        self.fc1 = Linear(in_features=input_dimension, out_features=hidden_dimension)
        init_parameters(self.fc1)
        self.fc2 = Linear(in_features=hidden_dimension, out_features=hidden_dimension)
        init_parameters(self.fc2)
        self.fc3 = Linear(in_features=hidden_dimension, out_features=latent_dimension)
        init_parameters(self.fc3)
        self.fc4 = Linear(in_features=latent_dimension, out_features=output_dimension)
        init_parameters(self.fc4)

        self.relu = ReLU()
        self.softmax = Softmax(dim=-1)

    def forward(self, input_vector):
        latent = self.relu(self.fc1(input_vector))
        latent = self.relu(self.fc2(latent))
        latent = self.softmax(self.fc3(latent))
        output = self.softmax(self.fc4(latent))

        return latent, output

In [4]:
seed=24
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic=True

In [5]:
# load news data
column_names = ['News_id', 'Category', 'Subcategory', 'Title', 'Abstract', 'URL', 'Title_Entities', 'Abstract_Entities']
news = pd.read_csv('./data/train_news.tsv', sep='\t', names=column_names)
news2 = pd.read_csv('./data/test_news.tsv', sep='\t', names=column_names)
news = pd.concat([news, news2], ignore_index=True)
del news2

print('data loaded')

data loaded


In [6]:
number_of_epoch = 10
learning_rate = 0.0025
batch_size = 128
embedding_dimension = 100
hidden_dimension = 128
latent_dimension = 256
output_dimension = len(news['Category'].unique())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
stop_words = set(stopwords.words('english'))

news_tokens = []
news_titles = news['Title'].values
for i in range(len(news_titles)):
    word_tokens_i = word_tokenize(news_titles[i])
    news_tokens.append([w for w in word_tokens_i if not w in stop_words])
news_tokens[0]

['The',
 'Brands',
 'Queen',
 'Elizabeth',
 ',',
 'Prince',
 'Charles',
 ',',
 'Prince',
 'Philip',
 'Swear',
 'By']

In [8]:
lemmatizer = WordNetLemmatizer()

news_tokens_lemmatized = []
for i in range(len(news_tokens)):
    word_tokens_i = []
    for j in news_tokens[i]:
        x = lemmatizer.lemmatize(j, pos='a')
        x = lemmatizer.lemmatize(x)
        word_tokens_i.append(x)
    news_tokens_lemmatized.append(word_tokens_i)

news_tokens_lemmatized[0]

['The',
 'Brands',
 'Queen',
 'Elizabeth',
 ',',
 'Prince',
 'Charles',
 ',',
 'Prince',
 'Philip',
 'Swear',
 'By']

In [9]:
# train word to vector model with title
model_w2v = Word2Vec(news_tokens_lemmatized, vector_size=embedding_dimension, min_count=1, workers=15)
print('word2vec model trained')

word2vec model trained


In [10]:
# map each news to mean of title w2v
train_x = np.empty((news.shape[0], embedding_dimension), dtype=np.float32)
pointer = 0
for i in news_tokens_lemmatized:
    vec_list = []
    for j in i:
        vec_list.append(model_w2v.wv[j])
    train_x[pointer] = np.mean(vec_list, axis=0)
    pointer += 1

# news_vector = np.load('./data/news_embedding/news_vector.npy')
train_x_tensor = torch.FloatTensor(train_x)

print('news mapped with word2vec model (input)')

news mapped with word2vec model (input)


In [11]:
train_y_ = pd.get_dummies(news['Category']).astype(float)

columns = news['Category'].value_counts().index
train_y_ = train_y_[columns]
train_y = train_y_.values

_, counts = np.unique(train_y, axis=0, return_counts=True)
counts = counts[::-1]
class_weights = torch.Tensor(1/(counts / counts.sum())).to(device)

train_y_tensor = torch.LongTensor(train_y)

print('news category (target) encoded')

# %%
news_dataset = CustomDataset(train_x, train_y)

train_loader = DataLoader(news_dataset, batch_size=batch_size, shuffle=True)

news category (target) encoded


In [12]:
class_weights

tensor([3.2379e+00, 3.3181e+00, 1.6937e+01, 2.0819e+01, 2.1653e+01, 2.2035e+01,
        2.3029e+01, 2.4631e+01, 3.1446e+01, 3.2708e+01, 7.2252e+01, 7.9647e+01,
        1.1118e+02, 1.1632e+02, 1.3884e+03, 4.3388e+04, 8.6775e+04, 1.7355e+05],
       device='cuda:0')

In [13]:
class MCELoss_weight(Module):
    def __init__(self, weight):
        super().__init__()
        self.weight = weight

    def forward(self, target, predict):
        predict = torch.clamp(predict, min=1e-6, max=1-1e-6)
        mce = torch.mean(-self.weight * target * torch.log(predict))
        return mce

In [14]:
if not os.path.exists('./runs/'):
    os.makedirs('./runs/')

if not os.path.exists('./model/'):
    os.makedirs('./model/')

writer = SummaryWriter(log_dir=f'./runs/news_embedding_{time.strftime("%Y%m%d-%H%M%S")}')

network = NewsEmbeddingNetwork(input_dimension=embedding_dimension,
                               hidden_dimension=hidden_dimension,
                               latent_dimension=latent_dimension,
                               output_dimension=output_dimension).to(device)

optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
criterion = MCELoss_weight(weight=class_weights).to(device)

start_time = time.time()
for epoch in trange(number_of_epoch):
    network.to(device)
    predicts = np.empty((len(train_x),output_dimension))
    labels = np.empty((len(train_x), output_dimension))
    pointer = 0
    for i, (x, y) in enumerate(train_loader):
        x = x.to(device)
        y = y.to(device)
        latent, predict = network.forward(x)
        loss = criterion(predict, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            predict = predict.cpu().numpy()
            y = y.cpu().numpy()
            mb_size = len(predict)
            predicts[pointer:pointer+mb_size] = predict
            labels[pointer:pointer+mb_size] = y
            pointer += mb_size

        del x, y, predict

    writer.add_scalar('Loss', loss, epoch)

    auc = roc_auc_score(labels, predicts, multi_class='ovr')

    writer.add_scalar('AUC', auc, epoch)

    predicts = np.argmax(predicts, axis=1)
    writer.add_scalar('Predict', len(np.unique(predicts)), epoch)

print(f'news embedding train time: {time.time() - start_time} seconds')
writer.close()

100%|██████████| 10/10 [00:46<00:00,  4.69s/it]

news embedding train time: 46.863362073898315 seconds





In [15]:
with torch.no_grad():
    network.to(device)
    predicts = np.empty((len(train_x),output_dimension))
    latents = np.empty((len(train_x), latent_dimension))
    pointer = 0
    for i, (x, y) in enumerate(train_loader):
        x = x.to(device)
        y = y.to(device)
        latent, predict = network.forward(x)

        predict = predict.cpu().numpy()
        latent = latent.cpu().numpy()
        mb_size = len(predict)
        predicts[pointer:pointer+mb_size] = predict
        latents[pointer:pointer+mb_size] = latent
        pointer += mb_size

        del x, y, predict, latent

predicts = np.argmax(predicts, axis=1)
print(np.unique(predicts))

[0]


In [None]:
if not os.path.exists('./model/'):
    os.makedirs('./model/')
torch.save(network.state_dict(), 
           f'./model/news_embedding_{time.strftime("%Y%m%d-%H%M%S")}_{auc:.2f}.pth')

In [16]:
if not os.path.exists('./data/news_embedding/'):
    os.makedirs('./data/news_embedding/')
np.save('./data/news_embedding/news_vector.npy', train_x)
np.save('./data/news_embedding/news_embedding_vector.npy', latents)