In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from textblob import TextBlob
from datetime import datetime

import spacy
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error

from collections import Counter
import re
import string

In [None]:
# Training hyperparameters shared by the cells below.
# EPOCHS: number of passes over train_dl made by train_model.
EPOCHS = 200
# BATCH_SIZE: batch size used for both the training and validation DataLoader.
BATCH_SIZE = 256
# LEARNING_RATE: learning rate handed to the Adam optimizer in train_model.
LEARNING_RATE = 0.001

In [None]:
# Read in CSV
# The file is pipe-delimited. Later cells read the columns "1".."6" and "content" from it.
df = pd.read_csv("../data/Organic_extended_finalv3.csv", sep="|")

In [None]:
# Find max retweets
# Row-wise maximum over the six retweet-count columns, computed in one
# vectorized pass instead of a Python loop over iterrows(). Unlike max() over
# a set, DataFrame.max skips missing values, so the result does not depend on
# the position of a NaN within the row.
RETWEET_COLUMNS = ["1", "2", "3", "4", "5", "6"]
df["max_retweets"] = df[RETWEET_COLUMNS].max(axis=1)

In [None]:
# Find mean/median and size
# Summary statistics of the raw target before it is bucketed into classes.
print("calculating mean/median")
retweets = df["max_retweets"]
mean = retweets.mean()
median = retweets.median()
std = retweets.std()
print("mean: ", mean)
print("median: ", median)
print("Number of entries: ", len(df))
# The minimum used to be evaluated mid-cell and silently discarded; a bare
# expression is only displayed when it is the last line of a cell.
print("min: ", retweets.min())
print("std: ", std)

In [None]:
# Remove punctuation from content
# The translation table is built once and applied to the whole column through
# the vectorized .str accessor rather than row by row. Missing content is
# replaced by an empty string first so the later .split() calls do not fail
# on NaN.
punct_table = str.maketrans('', '', string.punctuation)
df["content"] = df["content"].fillna("").astype(str).str.translate(punct_table)

In [None]:
# Model input is the article content only; the title is deliberately left out.
# (To include it, use: df["title"] + ' ' + df["content"].)
df["full_text"] = df["content"]
# .copy() makes the reduced frame independent of the original, so the columns
# added in later cells ('shares', 'content_length') are written to a real
# frame and do not trigger SettingWithCopyWarning.
df = df[["full_text", "max_retweets"]].copy()

In [None]:
# Bucket max_retweets into four ordinal classes stored in 'shares':
#   0 -> max_retweets <= one_quar
#   1 -> one_quar   < max_retweets <= two_quar
#   2 -> two_quar   < max_retweets <= three_quar
#   3 -> max_retweets > three_quar
# The cut points are fixed powers of ten rather than empirical quantiles
# (df.max_retweets.quantile([0.25, 0.5, 0.75]) would give the quantile variant).
one_quar = 10
two_quar = 100
three_quar = 1000

print("1: ", one_quar, "2: ", two_quar, "3: ", three_quar)

retweet_counts = df['max_retweets']
bucket_masks = [
    retweet_counts <= one_quar,
    (retweet_counts > one_quar) & (retweet_counts <= two_quar),
    (retweet_counts > two_quar) & (retweet_counts <= three_quar),
    retweet_counts > three_quar,
]
# Rows matching none of the masks (e.g. a missing count) stay NaN, and the
# column is float-valued, exactly as with the four masked assignments.
df['shares'] = np.select(bucket_masks, [0, 1, 2, 3], default=np.nan)

In [None]:
# Number of whitespace-separated words in each document.
# NOTE(review): every full_text value is assumed to be a string here — confirm no NaN remains.
df['content_length'] = [len(text.split()) for text in df['full_text']]

In [None]:
# Average document length in words (displayed as the cell output).
df['content_length'].mean()

In [None]:
# Split Train and Test dfs
# Roughly 80/20 random split. The generator is seeded so that Restart & Run All
# reproduces the same partition (and therefore the same vocabulary and metrics).
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
mask = split_rng.random(len(df)) < 0.8
# Copy the slices: later cells add an 'encoded' column to each of them, which
# would otherwise be a write to a view of df (SettingWithCopyWarning).
train = df[mask].copy()
test = df[~mask].copy()
print("train len:", len(train), "test len: ", len(test))

## Word Tokenizer

In [None]:
#tokenization
# Only the tokenizer component of the pipeline is used below.
try:
    tok = spacy.load('en')
except OSError:
    # spaCy 3 removed shortcut names such as 'en', and a missing model raises
    # the same error type. A blank English pipeline needs no download and
    # still provides the rule-based English tokenizer.
    tok = spacy.blank('en')

# Compiled once at definition time instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_CTRL_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split `text` into lower-cased spaCy tokens.

    Runs of non-ASCII characters are replaced by a space, the text is
    lower-cased, and every punctuation character, digit, carriage return, tab
    and newline is replaced by a space before the spaCy tokenizer is applied.

    Args:
        text: the raw document string.

    Returns:
        A list with the text of each token, in order.
    """
    text = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_CTRL_RE.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [None]:
# Token frequencies over the training documents only, so the vocabulary is
# built without looking at the test split.
counts = Counter()
for document in train['full_text']:
    counts.update(tokenize(document))

In [None]:
# Drop infrequent words: anything seen fewer than two times is removed.
print("num_words before:", len(counts.keys()))
# Collect the keys first; a Counter cannot be resized while it is iterated.
rare_words = [word for word, freq in counts.items() if freq < 2]
for rare in rare_words:
    del counts[rare]
print("num_words after:",len(counts.keys()))

In [None]:
# Build the vocabulary: index 0 is the padding token "", index 1 is "UNK",
# and the surviving words follow in Counter order.
words = ["", "UNK"] + list(counts)
vocab2index = {word: position for position, word in enumerate(words)}

In [None]:
def encode_sentence(text, vocab2index, N=450):
    """Turn `text` into a fixed-length vector of vocabulary ids.

    The text is tokenized with `tokenize`; each token is mapped through
    `vocab2index`, falling back to the id of "UNK" for unknown tokens. The id
    sequence is truncated to `N` entries or right-padded with zeros up to `N`.

    Args:
        text: the document string to encode.
        vocab2index: mapping from token to integer id; must contain "UNK".
        N: length of the returned id vector.

    Returns:
        (encoded, length): an integer array of shape (N,) and the number of
        leading positions that hold real token ids (at most N).
    """
    token_ids = np.array(
        [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    )
    length = min(N, len(token_ids))
    # Start from all zeros (the padding id) and overwrite the used prefix.
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = token_ids[:length]
    return encoded, length

## GloVe Embeddings

In [None]:
def load_glove_vectors(glove_file="../data/glove.6B.100d.txt"):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components.

    Args:
        glove_file: path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D numpy array of floats.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vectors = {}
    with open(glove_file, encoding="utf8") as f:
        for line in f:
            split = line.split()
            if not split:
                # A blank line (for instance a trailing one) has no word to
                # index and previously raised IndexError.
                continue
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

In [None]:
def get_emb_matrix(pretrained, word_counts, emb_size = 100):
    """Create an embedding matrix for the vocabulary from pretrained vectors.

    Row 0 is the all-zero padding vector and row 1 is a random vector for
    unknown words. Every word of `word_counts` then gets the next row: its
    pretrained vector when available, otherwise a random vector drawn
    uniformly from [-0.25, 0.25).

    Args:
        pretrained: mapping from word to a vector of length `emb_size`.
        word_counts: iterable of vocabulary words (e.g. a Counter); only the
            words and their iteration order are used.
        emb_size: dimensionality of the embedding vectors.

    Returns:
        (W, vocab, vocab_to_idx): the float32 matrix of shape
        (len(word_counts) + 2, emb_size), the array of words ordered by row
        index, and the word -> row index mapping.
    """
    vocab_size = len(word_counts) + 2
    # Register both special tokens so the mapping agrees with `vocab`.
    vocab_to_idx = {"": 0, "UNK": 1}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")  # row 0 stays zero: padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # vector for unknown words
    for i, word in enumerate(word_counts, start=2):
        # Look the word up in the `pretrained` argument. The previous version
        # read the global `word_vecs` and ignored this parameter entirely.
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

In [None]:
# Load the GloVe vectors from the default path, then build the embedding
# matrix for the training vocabulary held in `counts`.
word_vecs = load_glove_vectors()
# Note: this rebinds `vocab2index` to the mapping returned by get_emb_matrix.
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [None]:
# Encode every document as the (ids, length) tuple returned by encode_sentence.
# The tuple is stored as-is: wrapping it in np.array() builds a ragged array
# (a length-N id vector next to a scalar), which NumPy >= 1.24 rejects with a
# ValueError. NewsDataset only indexes [0] and [1], so a tuple is sufficient.
# assign() returns new frames instead of writing a column into slices of df.
train = train.assign(encoded=train['full_text'].apply(lambda x: encode_sentence(x, vocab2index)))
test = test.assign(encoded=test['full_text'].apply(lambda x: encode_sentence(x, vocab2index)))

In [None]:
# Class balance check: number of rows per 'shares' bucket over the full frame.
sns.countplot(x = 'shares', data=df)

In [None]:
# Plain Python lists of encoded inputs and class labels for each split.
X_train, y_train = list(train['encoded']), list(train['shares'])
# The held-out split is called `test` above and serves as the validation set below.
X_valid, y_valid = list(test['encoded']), list(test['shares'])

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
class NewsDataset(Dataset):
    """Dataset of encoded documents and their class labels.

    Args:
        X: sequence of (ids, length) pairs, where `ids` is an integer numpy
            array and `length` is the number of real (unpadded) tokens.
        Y: sequence or tensor of labels, aligned with `X`.
    """

    def __init__(self, X, Y):
        self.X = X
        self.y = Y

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return (ids as an int32 CPU tensor, label, length) for sample `idx`.

        The tensor is deliberately left on the CPU. The training and
        validation loops move each batch to the target device themselves, so
        a per-sample transfer here was redundant, tied the class to a global
        `device`, and does not work with CUDA in DataLoader worker processes.
        """
        ids, length = self.X[idx][0], self.X[idx][1]
        return torch.from_numpy(ids.astype(np.int32)), self.y[idx], length

In [None]:
# Wrap both splits; the labels are converted to int64 tensors up front.
train_labels = torch.tensor(y_train, dtype=torch.long)
valid_labels = torch.tensor(y_valid, dtype=torch.long)
train_ds = NewsDataset(X_train, train_labels)
valid_ds = NewsDataset(X_valid, valid_labels)

In [None]:
# Debug probe: shows the Python type of a single label in y_train.
type(y_train[0])

## Weighted Random Sampler

In [None]:
def get_class_distribution(obj):
    """Count how many labels of `obj` fall into each of the four classes.

    Labels equal to 0, 1 or 2 are counted under "0", "1" and "2"; every other
    value is counted under "3".

    Args:
        obj: iterable of numeric class labels.

    Returns:
        dict with the string keys "0", "1", "2", "3" mapped to their counts.
    """
    known_classes = (("0", 0.0), ("1", 1.0), ("2", 2.0))
    count_dict = {"0": 0, "1": 0, "2": 0, "3": 0}
    for label in obj:
        # First known class whose value equals the label, else the catch-all "3".
        bucket = next((key for key, value in known_classes if label == value), "3")
        count_dict[bucket] += 1
    return count_dict

In [None]:
# Targets of the training set, in dataset order.
# The order must NOT be shuffled: WeightedRandomSampler treats weights[i] as
# the weight of sample i, and these targets are used to look up each sample's
# class weight. The previous torch.randperm shuffle gave most samples the
# weight of some other sample's class, defeating the class balancing.
# Reading the labels from the dataset also avoids building every input tensor
# just to discard it.
target_list = torch.as_tensor(train_ds.y, dtype=torch.long).clone()

In [None]:
# Inverse-frequency class weights: one weight per class, ordered "0".."3".
class_count = list(get_class_distribution(y_train).values())
class_weights = torch.tensor(class_count, dtype=torch.float).reciprocal()

print(class_weights)

In [None]:
# One sampling weight per entry of target_list, looked up by class id.
class_weights_all = class_weights[target_list]

# Draw as many samples per epoch as there are weights, with replacement.
weighted_sampler = WeightedRandomSampler(class_weights_all, len(class_weights_all), True)

In [None]:
def train_model(model):
    """Train `model` on `train_dl` for EPOCHS epochs and report validation metrics.

    Uses Adam (learning rate LEARNING_RATE) over the trainable parameters and
    a cross-entropy loss weighted by `class_weights`. The validation set is
    evaluated after every epoch; a progress line is printed on epochs where
    i % 5 == 1, and a classification report for the last epoch is printed at
    the end.

    Args:
        model: module called as model(x, lengths) that returns class logits.
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
    y_pred_list = None  # stays None when EPOCHS == 0
    for i in range(EPOCHS):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long().to(device)
            y = y.long().to(device)
            optimizer.zero_grad()
            y_pred = model(x, l)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size for a per-sample average.
            sum_loss += loss.item() * y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse, y_pred_list = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))
    if y_pred_list is not None:
        # val_dl is iterated in dataset order, so the predictions line up
        # with y_valid element for element.
        print(classification_report(y_valid, y_pred_list))


def validation_metrics (model, valid_dl):
    """Evaluate `model` on `valid_dl`.

    Args:
        model: module called as model(x, lengths) that returns class logits.
        valid_dl: iterable of (x, y, lengths) batches.

    Returns:
        (loss, accuracy, rmse, y_pred_list): per-sample averages of the
        unweighted cross-entropy loss, accuracy and the batch-wise RMSE
        between predicted and true class ids (all Python floats), plus the
        predicted class id of every sample in iteration order.
    """
    y_pred_list = list()
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # No gradients are needed for evaluation; this saves memory and time.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long().to(device)
            y = y.long().cpu()
            y_hat = model(x, l).cpu()
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            # Collect the predictions. The list was previously returned empty,
            # which made classification_report fail on a length mismatch.
            y_pred_list.extend(pred.tolist())
            correct += (pred == y).sum().item()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
            sum_rmse += np.sqrt(mean_squared_error(y.numpy(), pred.numpy()))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total, y_pred_list

In [None]:
# Vocabulary size, including the two special tokens at the head of `words`.
vocab_size = len(words)
# Training batches are drawn through the weighted sampler.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=weighted_sampler)
# No sampler or shuffle is passed here, so validation batches follow dataset order.
val_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE)

In [None]:
class LSTM_glove_vecs(torch.nn.Module) :
    """Single-layer LSTM classifier on top of frozen pretrained embeddings.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        glove_weights: float32 numpy array of shape (vocab_size, embedding_dim)
            copied into the embedding table.
        num_classes: number of output logits (default 4, as before).
        dropout: dropout probability applied to the embeddings (default 0.2).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights, num_classes=4, dropout=0.2) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, l=None):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: integer tensor of token ids, shape (batch, seq_len), padded with 0.
            l: optional per-sample count of real tokens. When given, the
                sequences are packed so the LSTM stops at each sample's last
                real token; previously the lengths were ignored and the final
                hidden state was taken after running over all the padding.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        if l is not None:
            # pack_padded_sequence requires CPU int64 lengths of at least 1;
            # clamping lets empty documents be treated as one padding step.
            lengths = torch.as_tensor(l, dtype=torch.int64).cpu().clamp(min=1, max=x.size(1))
            x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (ht, _) = self.lstm(x)
        return self.linear(ht[-1])

In [None]:
# Positional arguments: vocab_size, embedding_dim=100, hidden_dim=100, and the pretrained embedding matrix.
model = LSTM_glove_vecs(vocab_size, 100, 100, pretrained_weights)

## Content only (title excluded), no punctuation [log10 share buckets]

In [None]:
# Move the model to the selected device and run the full training loop.
train_model(model.to(device))