Read and clean data

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import word_tokenize
import seaborn as sns
from nltk.corpus import stopwords
from collections import  Counter
import torch
from torch import nn

# Data pre-processing from part 1
df = pd.read_csv('nytimes_news.csv') # Load the dataset from a CSV
df = df.dropna(subset =['title', 'maintext']) # remove entries where the title or maintext are missing.
df = df.drop(columns=['Unnamed: 0','date_download','date_modify','date_publish','authors','filename','image_url','localpath','title_rss','source_domain','url','site','title_page'])
df = df.drop_duplicates(subset=['maintext'])

Tokenization, Train/Validation Split, Vocabulary list

In [3]:

# Tokenize article text
df['raw_tokens'] = df['maintext'].apply(lambda x: word_tokenize(x.lower()))

chars2remove = set(['.','!','/', '?', '\'', ','])

df['tokens'] = df['raw_tokens'].apply(lambda x: [w for w in x if w not in chars2remove])

# Tokenize title text
df['raw_title_tokens'] = df['title'].apply(lambda x: word_tokenize(x.lower()))
df['title_tokens'] = df['raw_title_tokens'].apply(lambda x: [w for w in x if w not in chars2remove])

#  Split train and validation sets
import random
random.seed(42)

df_full = df.copy()
assert df_full.shape[0] == 10819

indices = list(range(df_full.shape[0]))

random.shuffle(indices)

train_indices = indices[:8700]
test_indices = indices[8500:]

# df_train will be the in-sample training dataset
df_train = df_full.iloc[train_indices,:].copy()

# df_test will be the out-of-sample validation dataset
df_test = df_full.iloc[test_indices,:].copy()

# Create Vocab
def create_vocab_list(tokens_column):
    vocab = dict()
    for word_tokens in tokens_column:
        for token in word_tokens:
            if token not in vocab:
                vocab[token] = 1
            else:
                vocab[token] += 1
    return vocab

vocab_all = dict(sorted(create_vocab_list(df['tokens']).items(), key=lambda item: item[1], reverse=True))

df_train.shape: (8700, 8)
df_test.shape: (2319, 8)


Define Model

In [4]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Define layers
        #self.encoder = nn.TransformerEncoder(input_size, output_size)
        #self.decoder = nn.TransformerDecoder(input_size, output_size)
        # Multiple layers of rnn at once
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        # Fully Connected Layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):

        batch_size = x.size(0)
    # initialize hidden state
        hidden = self.init_hidden(batch_size)

        out, hidden = self.rnn(x, hidden)

        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):

        # This method generates the first hidden state of zeros which we'll use in the forward pass
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

AttributeError: module 'torch.nn' has no attribute 'Model'