In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from textblob import TextBlob
from datetime import datetime

import spacy
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error
from sklearn.utils import class_weight

#from torchsampler import ImbalancedDatasetSampler

#from imblearn.over_sampling import SMOTE

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

from collections import Counter
import re
import string
from random import randint

In [2]:
# Training hyperparameters read by the DataLoader and training-loop cells below.
EPOCHS = 100  # number of passes over the training loader in train_model
BATCH_SIZE = 700  # batch size used for both the train and validation DataLoader
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer in train_model

In [3]:
# Read in CSV
# Both splits are pipe-delimited; later cells read the `title`, `content`
# and `label_log_10` columns from them.
df = pd.read_csv("../data/train_80.csv", sep="|")
df_test = pd.read_csv("../data/test_80.csv", sep="|")

## Remove stop words (from content and title)

In [4]:
# Remove stop words

stop_words = set(stopwords.words('english'))


def strip_punct_and_stop_words(text_column, stop_words=stop_words):
    """Strip punctuation, lowercase, and drop stop words from a text column.

    Parameters
    ----------
    text_column : pandas.Series of str
        Raw text, one document per row.
    stop_words : set of str
        Lowercase tokens to remove after whitespace splitting.

    Returns
    -------
    pandas.Series
        Cleaned text with the remaining tokens joined by single spaces.
    """
    # regex=True is passed explicitly: the default of Series.str.replace changed
    # to literal matching in pandas 2.0, which would leave punctuation in place.
    # The raw string avoids the invalid "\w" escape in a normal string literal.
    cleaned = text_column.str.replace(r"[^\w\s]", "", regex=True).str.lower()
    return cleaned.apply(
        lambda text: ' '.join(item for item in text.split() if item not in stop_words)
    )


# Same cleaning for both text columns of both splits.
for frame in (df, df_test):
    for column in ('content', 'title'):
        frame[column] = strip_punct_and_stop_words(frame[column])


## Data Augmentation

In [5]:
# label 0 -> gen 5 for each
# label 1 -> skip
# label 2 -> gen 2
# label 3 -> gen 15

def get_syn(word):
    """Return one randomly chosen WordNet synonym for ``word``.

    Candidates are the head names of every synset WordNet returns for
    ``word`` (the part of ``synset.name()`` before the first '.'), excluding
    names equal to ``word`` itself. Duplicates are kept, so a name shared by
    several synsets is proportionally more likely to be picked.

    Returns
    -------
    str
        A candidate name, or '' when ``word`` does not start with a word
        character or no candidate exists.
    """
    # This check depends only on `word`, so it is done once up front instead
    # of once per synset. Raw string: "\w" is not a valid str escape.
    if not re.match(r'\w', word):
        return ''

    replacements = []
    for syn in wn.synsets(word):
        syn_word = syn.name().split('.')[0]
        if syn_word != word:
            replacements.append(syn_word)

    if replacements:
        return replacements[randint(0, len(replacements) - 1)]
    return ''

def random_insertion(sentence, alpha=0.5):
    """Return a copy of ``sentence`` with synonyms inserted at random spots.

    ``int(len(sentence) * alpha)`` attempts are made. Each attempt picks a
    random word of the *original* sentence, asks ``get_syn`` for a synonym
    and, if one is found, inserts it at a random position of the copy.
    The input list is left untouched.
    """
    augmented = sentence.copy()
    attempts = int(len(sentence) * alpha)
    for _ in range(attempts):
        source_word = sentence[randint(0, len(sentence) - 1)]
        synonym = get_syn(source_word)
        if not synonym:
            continue
        position = randint(0, len(augmented))
        augmented.insert(position, synonym)
    return augmented

def random_replacement(sentence, alpha=0.5):
    """Return a copy of ``sentence`` with some words swapped for synonyms.

    ``int(len(sentence) * alpha)`` attempts are made. Each attempt picks a
    random index, looks up a synonym for the *original* word at that index
    via ``get_syn`` and, if one is found, writes it into the copy at the same
    index. The input list is left untouched.
    """
    augmented = sentence.copy()
    attempts = int(len(sentence) * alpha)
    for _ in range(attempts):
        position = randint(0, len(sentence) - 1)
        synonym = get_syn(sentence[position])
        if not synonym:
            continue
        augmented[position] = synonym
    return augmented

def augment(dataframe, copies_per_label=None):
    """Oversample selected labels with synonym-replaced copies of their rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``title``, ``content`` and ``label_log_10``.
        It is not modified.
    copies_per_label : dict, optional
        Maps a label value to the number of synthetic copies generated per
        row with that label. Defaults to ``{0: 5, 2: 2, 3: 15}``; labels that
        are absent from the mapping get no copies.

    Returns
    -------
    pandas.DataFrame
        The three columns above for every original row followed by the
        synthetic rows, with a fresh 0..n-1 index.
    """
    if copies_per_label is None:
        copies_per_label = {0: 5, 2: 2, 3: 15}

    columns = ["title", "content", "label_log_10"]
    df = dataframe[columns]

    # Collect synthetic rows in a list and build one frame at the end;
    # appending to a DataFrame inside the loop is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    new_rows = []
    for _, row in df.iterrows():
        label = row["label_log_10"]
        # The dict lookup compares by value. The previous `label is 0` test
        # relied on object identity and silently skipped non-cached ints/floats.
        for _ in range(copies_per_label.get(label, 0)):
            content_words = row["content"].split(" ")
            title_words = row["title"].split(" ")
            new_content = " ".join(random_replacement(content_words))
            new_title = " ".join(random_replacement(title_words))
            new_rows.append([new_title, new_content, label])

    if not new_rows:
        return df.reset_index(drop=True)
    synthetic = pd.DataFrame(new_rows, columns=columns)
    return pd.concat([df, synthetic], ignore_index=True)

# Replace each split with its augmented version (original rows plus the
# synonym-replaced copies produced by augment).
# NOTE(review): the test split is augmented as well, so evaluation runs on
# synthetic samples too — confirm this is intended.
df = augment(df)
df_test = augment(df_test)

In [6]:
# Model input is "<title> <content>"; only that text and the label are kept.
df = df.assign(full_text=df['title'] + ' ' + df["content"])[["full_text", "label_log_10"]]

df_test = df_test.assign(
    full_text=df_test['title'] + ' ' + df_test["content"]
)[["full_text", "label_log_10"]]

In [7]:
# Plain aliases, not copies: `train` / `test` refer to the same DataFrame
# objects as `df` / `df_test`.
train = df
test = df_test

In [8]:
def balance_classes(dataframe, random_state=None):
    """Downsample labels 0-3 to a common size and shuffle the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``label_log_10`` column. It is not modified.
    random_state : int, optional
        Seed for reproducible sampling. ``None`` (default) uses NumPy's
        global random state, as before.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows for each of the labels 0, 1, 2 and 3, in random order,
        where ``n`` is the size of the smallest of those four classes.

    The per-class counts are printed before sampling.
    """
    labels = dataframe["label_log_10"]
    class_frames = {label: dataframe[labels == label] for label in (0, 1, 2, 3)}

    # Counts come from the same masks used for sampling, so rows with an
    # unexpected label can no longer inflate the count of class 3.
    count_dict = {str(label): len(frame) for label, frame in class_frames.items()}
    print(count_dict)

    # Sample down to the smallest class. Using the size of class 3
    # unconditionally raised ValueError whenever another class was smaller.
    n_per_class = min(count_dict.values())

    rng = None if random_state is None else np.random.RandomState(random_state)
    sampled = [frame.sample(n=n_per_class, random_state=rng)
               for frame in class_frames.values()]
    return pd.concat(sampled).sample(frac=1, random_state=rng)

# Resample both splits through balance_classes (prints the per-label counts).
# NOTE(review): the test split is rebalanced as well, so the metrics below are
# measured on a class-balanced subset rather than on the original label
# distribution — confirm this is intended.
train = balance_classes(train)
test = balance_classes(test)

{'0': 8652, '1': 15104, '2': 17316, '3': 7344}
{'0': 2118, '1': 3809, '2': 4275, '3': 1712}


## Word Tokenizer

In [9]:
#tokenization
# NOTE(review): the 'en' shortcut name appears to resolve only on spaCy 2.x;
# newer releases expect a full package name such as 'en_core_web_sm' — confirm
# against the installed version.
tok = spacy.load('en')

# Compiled once here instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_CTRL_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into lowercase tokens using the spaCy tokenizer.

    Runs of non-ASCII characters are replaced by one space; every
    punctuation character, digit, carriage return, tab and newline is
    replaced by a space; the lowercased result is then passed to
    ``tok.tokenizer``.

    Returns
    -------
    list of str
        The ``.text`` of each token the tokenizer yields.
    """
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_CTRL_RE.sub(" ", ascii_only.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [10]:
# Count how often each token occurs in the training set.
counts = Counter()
for text in train['full_text']:
    counts.update(tokenize(text))

In [11]:
# Vocabulary: index 0 is the padding token "", index 1 is "UNK", followed by
# every training token in the order it was first counted.
words = ["", "UNK"] + list(counts)
vocab2index = {word: position for position, word in enumerate(words)}

In [12]:
def encode_sentence(text, vocab2index, N=500):
    """Turn ``text`` into a fixed-length array of vocabulary indices.

    The text is tokenized with ``tokenize``; tokens missing from
    ``vocab2index`` map to the "UNK" index. The id sequence is truncated to
    ``N`` entries and right-padded with zeros.

    Returns
    -------
    (numpy.ndarray, int)
        The length-``N`` integer array and the number of real (non-padding)
        entries in it.
    """
    token_ids = [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    kept = token_ids[:N]
    padded = np.zeros(N, dtype=int)
    padded[:len(kept)] = kept
    return padded, len(kept)

## GloVe Embeddings

In [13]:
def load_glove_vectors(glove_file="../data/glove.6B.100d.txt"):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every line is split on whitespace: the first field is the word, the
    remaining fields are parsed as floats into a NumPy array. If a word
    occurs on several lines, the last one wins.
    """
    def parse(line):
        fields = line.split()
        return fields[0], np.array([float(value) for value in fields[1:]])

    with open(glove_file, encoding="utf8") as handle:
        return dict(parse(line) for line in handle)

In [14]:
def get_emb_matrix(pretrained, word_counts, emb_size = 100):
    """Create an embedding matrix for a vocabulary from pretrained vectors.

    Parameters
    ----------
    pretrained : mapping of str -> array-like
        Pretrained vectors of length ``emb_size`` keyed by word.
    word_counts : iterable of str
        Vocabulary words (e.g. a Counter); iteration order fixes the row order.
    emb_size : int
        Embedding dimensionality.

    Returns
    -------
    W : numpy.ndarray, float32, shape (len(word_counts) + 2, emb_size)
        Row 0 is the all-zero padding vector, row 1 a random vector for
        unknown words, rows 2.. follow ``word_counts``. Words missing from
        ``pretrained`` get a uniform random vector in [-0.25, 0.25).
    vocab : numpy.ndarray of str
        ``["", "UNK", *word_counts]``.
    vocab_to_idx : dict
        Word -> row index in ``W``.
    """
    vocab = ["", "UNK"]
    vocab_to_idx = {"UNK": 1, "": 0}
    # Row 0 (padding) keeps its zero initialisation.
    W = np.zeros((len(word_counts) + 2, emb_size), dtype="float32")
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # vector for unknown words
    for i, word in enumerate(word_counts, start=2):
        # Look the word up in the `pretrained` argument. The previous code read
        # the global `word_vecs` here and ignored the parameter entirely.
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

In [15]:
# Load the GloVe vectors and build the embedding matrix for the training vocabulary.
# Note: this rebinds `vocab2index` (first built in the "creating vocab" cell)
# to the mapping returned by get_emb_matrix.
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [16]:
# Each cell holds a 2-element object array: [padded id array, true length].
# dtype=object is required because the two elements have different shapes;
# without it NumPy >= 1.24 raises ValueError for the ragged input (older
# versions only emitted a deprecation warning).
train['encoded'] = train['full_text'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))
test['encoded'] = test['full_text'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))

In [17]:
# Features are the encoded [ids, length] pairs, targets are the labels.
# The test split doubles as the validation set.
X_train = train['encoded'].tolist()
y_train = train['label_log_10'].tolist()
X_valid = test['encoded'].tolist()
y_valid = test['label_log_10'].tolist()

In [18]:
# Use the first CUDA device when torch reports one as available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [19]:
class NewsDataset(Dataset):
    """Dataset over encoded texts and their labels.

    ``X[i]`` is indexable as ``[padded_id_array, length]`` (as produced by the
    encoding cell) and ``Y[i]`` is the label of sample ``i``.
    """

    def __init__(self, X, Y):
        self.X = X
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(ids as int32 tensor, label, length)`` for sample ``idx``.

        The tensor is left on the CPU: the training and validation loops
        already move each collated batch to the target device, so a
        per-sample transfer here was redundant and prevents the dataset from
        being used with DataLoader worker processes.
        """
        token_ids, length = self.X[idx][0], self.X[idx][1]
        return torch.from_numpy(token_ids.astype(np.int32)), self.y[idx], length

In [20]:
# Wrap the encoded samples and labels; the validation dataset is built from the test split.
train_ds = NewsDataset(X_train, y_train)
valid_ds = NewsDataset(X_valid, y_valid)

In [34]:
def train_model(model):
    """Train ``model`` for ``EPOCHS`` epochs with Adam and cross-entropy loss.

    Reads the notebook-level ``train_dl``, ``val_dl``, ``y_valid``, ``device``,
    ``EPOCHS`` and ``LEARNING_RATE``. Only parameters with
    ``requires_grad=True`` are optimized. After every epoch the model is
    evaluated with ``validation_metrics`` and the optimizer and model state
    dicts are written to ``../data`` (overwriting the previous epoch's files).
    Every 5th epoch the losses, validation accuracy/RMSE and a classification
    report are printed.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=LEARNING_RATE)
    for epoch in range(EPOCHS):
        model.train()
        running_loss, seen = 0.0, 0
        for x, y, l in train_dl:
            inputs = x.long().to(device)
            targets = y.long().to(device)
            logits = model(inputs, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(logits, targets)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size so the epoch average is per sample.
            batch_len = targets.shape[0]
            running_loss += loss.item() * batch_len
            seen += batch_len
        val_loss, val_acc, val_rmse, y_pred_list = validation_metrics(model, val_dl)
        torch.save(optimizer.state_dict(), "../data/optimizer_dropout_admap.pt")
        torch.save(model.state_dict(), "../data/bi-lstm_model.pt")
        if epoch % 5 != 0:
            continue
        print("EPOCH ", epoch, ": train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (running_loss/seen, val_loss, val_acc, val_rmse))
        print(classification_report(y_valid, y_pred_list))
def validation_metrics (model, valid_dl):
    """Evaluate ``model`` on ``valid_dl``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(x, l)``; must return one score per class.
    valid_dl : iterable of ``(x, y, l)`` batches

    Returns
    -------
    (loss, accuracy, rmse, y_pred_list)
        Sample-weighted mean cross-entropy loss, accuracy (a 0-dim tensor),
        the sample-weighted mean of the per-batch RMSE between predicted and
        true labels, and the predicted class of every sample in loader order.
    """
    y_pred_list = list()
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # No gradients are needed for evaluation; without no_grad() every forward
    # pass builds an autograd graph and wastes memory and time.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long().to(device)
            y = y.long()
            y_hat = model(x, l).cpu()
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]  # index of the highest score = predicted class
            y_pred_list.extend(pred.tolist())
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
            sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total, y_pred_list

In [36]:
# `words` holds "", "UNK" and every counted training token, so the two special
# entries are included in the size.
vocab_size = len(words)
# Only the training loader shuffles. The validation loader keeps dataset order,
# which train_model relies on when it compares predictions against `y_valid`.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE)

In [37]:
class LSTM_glove_vecs(nn.Module):
    """Two-layer bidirectional LSTM classifier over frozen pretrained embeddings.

    ``glove_weights`` is a NumPy array of shape (vocab_size, embedding_dim)
    copied into the embedding layer, which is then excluded from training.
    The classifier head maps a ``hidden_dim`` vector to 4 class scores.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        with torch.no_grad():
            self.embeddings.weight.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad_(False)  # freeze embeddings
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        self.linear = nn.Linear(hidden_dim, 4)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return class scores of shape (batch, 4); ``l`` (lengths) is not used."""
        embedded = self.dropout(self.embeddings(x))
        _, (final_hidden, _) = self.lstm(embedded)
        # Only the last entry of the stacked final hidden states feeds the classifier.
        return self.linear(final_hidden[-1])

In [38]:
# embedding_dim=100 matches the default emb_size used to build
# `pretrained_weights`; hidden_dim is also 100.
model = LSTM_glove_vecs(vocab_size, 100, 100, pretrained_weights)
# Alternative model kept for reference; LSTM_variable_len is not defined in the
# cells shown here.
#model = LSTM_variable_len(vocab_size, 50, 50)

## BiLSTM (title + content, num_layers=2, hidden_dim=100), GloVe embeddings (emb_size=100), stop words removed (train + test), N=600, batch_size=800, random synonym replacement on title + content (alpha=0.5), 80/20 split. Note: the cells above currently set N=500 (`encode_sentence` default) and BATCH_SIZE=700.

In [39]:
# Move the model to `device` and run the full training loop; metrics are printed every 5th epoch.
train_model(model.to(device))

EPOCH  0 : train loss 1.260, val loss 1.185, val accuracy 0.453, and val rmse 1.269
              precision    recall  f1-score   support

           0       0.60      0.58      0.59      1712
           1       0.53      0.33      0.41      1712
           2       0.31      0.11      0.16      1712
           3       0.39      0.79      0.52      1712

    accuracy                           0.45      6848
   macro avg       0.45      0.45      0.42      6848
weighted avg       0.45      0.45      0.42      6848

EPOCH  5 : train loss 1.037, val loss 1.132, val accuracy 0.500, and val rmse 1.068
              precision    recall  f1-score   support

           0       0.70      0.48      0.57      1712
           1       0.45      0.64      0.53      1712
           2       0.42      0.12      0.19      1712
           3       0.47      0.75      0.58      1712

    accuracy                           0.50      6848
   macro avg       0.51      0.50      0.47      6848
weighted avg     

EPOCH  80 : train loss 0.172, val loss 2.109, val accuracy 0.589, and val rmse 0.837
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      1712
           1       0.71      0.81      0.75      1712
           2       0.40      0.69      0.50      1712
           3       0.62      0.21      0.32      1712

    accuracy                           0.59      6848
   macro avg       0.64      0.59      0.58      6848
weighted avg       0.64      0.59      0.58      6848

EPOCH  85 : train loss 0.159, val loss 2.024, val accuracy 0.611, and val rmse 0.809
              precision    recall  f1-score   support

           0       0.83      0.73      0.78      1712
           1       0.69      0.85      0.76      1712
           2       0.41      0.62      0.50      1712
           3       0.62      0.24      0.35      1712

    accuracy                           0.61      6848
   macro avg       0.64      0.61      0.60      6848
weighted avg   

## Save off Files

In [28]:
# Write the vocabulary array, the word -> index mapping and the embedding
# matrix to .npy files in the working directory. `vocab2index` is a dict, so
# np.save stores it as a pickled object.
artifacts = (
    ('vocab_80.npy', vocab),
    ('vocab2index_80.npy', vocab2index),
    ('pretrained_weights_80.npy', pretrained_weights),
)
for path, payload in artifacts:
    with open(path, 'wb') as f:
        np.save(f, payload, allow_pickle=True)
    