# Quick & dirty prototype to evaluate what the data can potentially yield

In [82]:
# Imports
import pandas as pd
import nltk
import regex
import io
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Seed both random number generators used in this notebook. np.random.seed
# only affects NumPy; the weight initialisation (uniform_) and the shuffling
# DataLoader further down draw from PyTorch's generator instead.
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# Load the training data; the 'headline' and 'views' columns are used below.
train = pd.read_csv('train.csv')

In [3]:
# Work on a copy of the headline column rather than on `train` itself.
headlines = train['headline'].copy()

# Cleaning

In [4]:
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus (reports "already up-to-date" when present).
nltk.download('stopwords')

# English stopwords that headline_to_clean_words filters out of each headline.
en_stop_words = stopwords.words('english')
print(en_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def headline_to_clean_words(headline):
  """Tokenise a headline into lowercase words with English stopwords removed.

  Args:
    headline: Raw headline string.

  Returns:
    List of the headline's lowercase word tokens (runs of word characters),
    in their original order, without any token listed in ``en_stop_words``.
    Repeated words are kept.
  """
  words = regex.findall(r'\w+', headline.lower())
  return [word for word in words if word not in en_stop_words]

In [6]:
# One list of cleaned tokens per headline.
clean_headlines = headlines.apply(headline_to_clean_words)

In [7]:
# Display the cleaned token lists.
clean_headlines

0        [medvedev, signs, cooperation, treaties, sokhu...
1                    [local, self, governance, governance]
2        [secretary, general, council, europe, meets, g...
3                          [faction, revival, trust, ngos]
4              [osce, urges, russia, leave, gudauta, base]
                               ...                        
24406          [eu, georgia, conclude, free, trade, talks]
24407    [georgian, patriarch, visits, moscow, kiev, mi...
24408            [change, party, funding, rules, proposed]
24409                  [georgia, 1h, 2013, foreign, trade]
24410    [govt, gets, advice, handle, illegal, surveill...
Name: headline, Length: 24411, dtype: object

# Building the vocabulary and the word-context pairs

In [8]:
# Flatten all token lists into one Series and keep each distinct value once.
# NOTE(review): Series.explode turns an empty list into NaN, so if any headline
# cleaned down to zero tokens, NaN is presumably an entry of `vocab` — confirm
# before changing this, because vocab_len fixes the Word2Vec layer sizes.
vocab = clean_headlines.explode().unique()
vocab_len = len(vocab)

In [9]:
# Map each vocabulary entry to its position in `vocab`.
word_to_idx = {word:idx for (idx, word) in enumerate(vocab)}

In [10]:
def one_hot_encode(word_idx):
  """Return a (vocab_len, 1) float tensor that is 1 at row ``word_idx``, else 0."""
  column = torch.zeros(vocab_len, 1, dtype=torch.float)
  column[word_idx] = 1
  return column

In [11]:
def get_word_context_pairs(words, vocab_index=None):
  """Return every ordered (word, context) index pair within one headline.

  Each unordered pair of positions (i, j) with i < j contributes two tuples,
  (idx_i, idx_j) followed by (idx_j, idx_i), so the whole headline acts as the
  context window. A word that occurs twice is paired with itself.

  Args:
    words: Sequence of tokens from a single headline.
    vocab_index: Optional mapping from token to integer index. Defaults to
      the module-level ``word_to_idx``.

  Returns:
    List of (word_index, context_index) tuples; empty when the headline has
    fewer than two tokens.

  Raises:
    KeyError: If a token of a headline with two or more tokens is missing
      from the mapping.
  """
  if vocab_index is None:
    vocab_index = word_to_idx

  # Nothing to pair up; return before doing any vocabulary lookups.
  if len(words) < 2:
    return []

  # Resolve each token once instead of on every inner-loop iteration.
  indices = [vocab_index[word] for word in words]

  word_context_pairs = []
  for position, word in enumerate(indices):
    for context in indices[position + 1:]:
      word_context_pairs.append((word, context))
      word_context_pairs.append((context, word))

  return word_context_pairs

In [12]:
# Build the pairs per headline, flatten them into one Series of tuples, drop
# the NaN rows that explode() emits for headlines with no pairs, and renumber.
word_context_pairs = clean_headlines.apply(get_word_context_pairs).explode().dropna().reset_index(drop = True)

In [13]:
# Preview the first few (word, context) index pairs.
word_context_pairs.head()

0    (0, 1)
1    (1, 0)
2    (0, 2)
3    (2, 0)
4    (0, 3)
Name: headline, dtype: object

# Word2Vec

In [14]:
class Word2Vec(nn.Module):
  """Two-layer linear network mapping a one-hot word to context log-probabilities.

  ``fc1`` projects a vocab_len-sized input to EMBED_DIM; its output is returned
  as the embedding. ``fc2`` projects back to vocab_len, followed by log-softmax.
  """

  EMBED_DIM = 300

  def __init__(self):
    super().__init__()

    self.fc1 = nn.Linear(vocab_len, self.EMBED_DIM)
    self.fc2 = nn.Linear(self.EMBED_DIM, vocab_len)

    # Re-draw both weight matrices uniformly from [-0.5, 0.5]; biases keep
    # the nn.Linear default initialisation.
    init_bound = 0.5
    for layer in (self.fc1, self.fc2):
      nn.init.uniform_(layer.weight, -init_bound, init_bound)

  def forward(self, X):
    """Return ``(log_probs, emb)`` for the given one-hot input(s).

    ``X`` is reshaped to (-1, vocab_len), so ``log_probs`` has shape
    (rows, vocab_len) and ``emb`` has shape (rows, EMBED_DIM).
    """
    flat = X.view(-1, vocab_len)

    emb = self.fc1(flat)
    log_probs = F.log_softmax(self.fc2(emb), dim=1)
    return log_probs, emb

In [15]:
from tqdm.notebook import tqdm

class Trainer:
  """Trains a Word2Vec model on the module-level ``word_context_pairs``.

  Uses SGD with momentum 0.9 and negative log-likelihood loss over batches of
  (one-hot word, context index) pairs.
  """

  def __init__(self, model, learning_rate=0.05, epochs=100, batch_size=64):
    """Move ``model`` to ``device`` and set up optimiser, loss and data loader.

    Args:
      model: Module whose forward returns ``(log_probs, embedding)``.
      learning_rate: SGD learning rate.
      epochs: Number of passes over the data made by ``train``.
      batch_size: Number of (word, context) pairs per batch.
    """
    self.model = model
    self.model.to(device)

    self.learning_rate = learning_rate
    self.epochs = epochs

    self.optimizer = optim.SGD(self.model.parameters(), self.learning_rate, momentum=0.9)
    self.criterion = nn.NLLLoss()
    self.dataloader = DataLoader(
      word_context_pairs,
      batch_size=batch_size,
      collate_fn=self.collate_fn,
      shuffle=True,
    )

  def collate_fn(self, word_context_pairs):
    """Turn a batch of (word_idx, context_idx) tuples into tensors.

    Returns:
      ``(words, contexts)``: the stacked one-hot encodings of the words, and
      a 1-D long tensor holding the context indices.

    Malformed items now raise instead of being printed and ignored; the old
    bare ``except`` went on to stack whatever partial lists it had built.
    """
    words = torch.stack([one_hot_encode(word) for word, _ in word_context_pairs])
    contexts = torch.tensor(
      [context for _, context in word_context_pairs], dtype=torch.long
    )
    return words, contexts

  def train(self):
    """Run ``self.epochs`` passes of SGD over the data loader."""
    self.model.train()
    print("Training started")
    for _ in range(self.epochs):
      for words, contexts in tqdm(self.dataloader):
        self.optimizer.zero_grad()

        # The model already returns (batch, vocab_len) on its own device.
        # Squeezing it would drop the batch axis of a final batch of size 1
        # and make NLLLoss reject the (1,)-shaped target.
        log_probs, _ = self.model(words.to(device))

        loss = self.criterion(log_probs, contexts.to(device))

        loss.backward()
        self.optimizer.step()
    print("Training finished")

In [16]:
model = Word2Vec().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load('civilnews-train-word2vec-2022-08-05.pt', map_location=device))
#trainer = Trainer(model)

<All keys matched successfully>

In [17]:
#trainer.train()

In [18]:
#torch.save(trainer.model.state_dict(), 'civilnews_word2vec.pt')

# Predict views from averaged embeddings (failed)

In [19]:
def headlines_to_average_embeddings(words, axis=None):
  """Average the Word2Vec embeddings of one headline's words.

  Each word is one-hot encoded and passed through the module-level ``model``;
  the embedding it returns has shape (1, EMBED_DIM).

  NOTE(review): with the default ``axis=None`` the mean runs over every
  element of every embedding, so the whole headline collapses to a single
  scalar. That default is kept because the cells below build a one-column
  feature from it. Pass ``axis=0`` to average across words only and keep a
  (1, EMBED_DIM) vector.

  Args:
    words: Sequence of tokens, all present in ``word_to_idx``.
    axis: Forwarded to ``np.mean``.

  Returns:
    The mean as computed by ``np.mean``; ``nan`` for a headline without
    words, whatever ``axis`` is.
  """
  # np.mean of an empty list is nan plus a RuntimeWarning; return it quietly.
  if len(words) == 0:
    return np.nan

  embeds = []
  # Inference only: skip building an autograd graph for every word.
  with torch.no_grad():
    for word in words:
      one_hot = one_hot_encode(word_to_idx[word])
      _, word_embed = model(one_hot.to(device))
      embeds.append(word_embed.cpu().numpy())

  return np.mean(embeds, axis=axis)

In [20]:
# One averaged-embedding value per headline.
headlines_in_average_embeds = clean_headlines.apply(headlines_to_average_embeddings)

In [21]:
# Sanity check: one averaged value for every row of `train`.
len(train) == len(headlines_in_average_embeds)

True

In [22]:
# Put the averaged embeddings next to the view counts, column-wise. Taking
# .values discards the original Series index, so the new Series gets a fresh
# default index before being aligned with train.views.
aev = pd.concat([pd.Series(headlines_in_average_embeds.values), train.views], axis = 1)
#pd.DataFrame(, train.views)

In [23]:
# Name the two concatenated columns.
aev.columns = ['average_embeds', 'views']

In [24]:
# Keep only the rows whose 'views' value parses as a number, then store the
# column as float. A boolean mask with .loc selects by label; the previous
# .iloc call was handed index labels, which is only correct while the index
# is the default 0..n-1 range.
numeric_views = pd.to_numeric(aev['views'], errors = 'coerce')
aev = aev.loc[numeric_views.notna()].copy()
aev['views'] = aev['views'].astype(float)

In [77]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

In [78]:
# Gradient-boosted regressor with the library's default hyper-parameters.
xgbr = XGBRegressor()

In [79]:
# Single-column feature frame (the per-headline average) and the view counts
# as the regression target.
X, y = pd.DataFrame(aev['average_embeds']), aev['views']
# Centre on the column mean and scale by the column range (max - min).
X = (X - X.mean()) / (X.max() - X.min())

In [80]:
from sklearn.model_selection import cross_validate

In [81]:
# 10-fold cross-validation scored by explained variance.
cross_validate(xgbr, X, y, cv = 10, scoring = 'explained_variance')



{'fit_time': array([0.38888288, 0.38305426, 0.3848412 , 0.38515902, 0.37467551,
        0.38218546, 0.3794167 , 0.37620234, 0.3965323 , 0.62152886]),
 'score_time': array([0.00730467, 0.00822711, 0.00621939, 0.00681067, 0.00699878,
        0.00656247, 0.00702333, 0.00717139, 0.00688028, 0.00898051]),
 'test_score': array([-2.13370197e-03,  5.53929980e-04, -6.81335405e-04, -4.52953450e-03,
        -5.15985303e-03, -2.80369640e-01, -2.47259007e+00, -5.16063092e+02,
        -2.67436282e-02, -9.69880684e-05])}