In [9]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [10]:
df = pd.read_csv('nlp_bbc.csv', usecols = [1])

In [11]:
display(df)

Unnamed: 0,text
0,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...
3,yeading face newcastle in fa cup premiership s...
4,ocean s twelve raids box office ocean s twelve...
...,...
2220,cars pull down us retail figures us retail sal...
2221,kilroy unveils immigration policy ex-chatshow ...
2222,rem announce new glasgow concert us band rem h...
2223,how political squabbles snowball it s become c...


In [12]:
import spacy
import re
import torch
from torch.autograd import Variable
import torch.functional as F
import torch.nn.functional as F

ModuleNotFoundError: No module named 'spacy'

In [None]:
STOP_WORDS = set(stopwords.words('english'))

In [None]:
def cleaning(text):
    text = text.lower()
    text = re.sub("[^A-Za-z']+", ' ', text)
    if len(text) > 2: return ' '.join(word for word in text.split() if word not in stopwords)

In [None]:
df_clean = pd.DataFrame(df.text.apply(lambda x: cleaning(x)))

In [None]:
df_clean.head()

In [None]:
nlp_w = spacy.load('en', disable=['ner', 'parser'])

In [None]:
def lemmatizer(text):        
    sent = []
    doc = nlp_w(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)

In [23]:
df_clean["text_lemmatize"] =  df_clean.apply(lambda x: lemmatizer(x['text']), axis=1)

In [24]:
df_clean['text_lemmatize_clean'] = df_clean['text_lemmatize'].str.replace('-PRON-', '')
tokenized = [row.split() for row in df_clean['text_lemmatize_clean']]

0       tv future hand viewer home theatre system plas...
1       worldcom boss leave book alone former worldcom...
2       tigers wary farrell gamble leicester say rush ...
3       yeade face newcastle fa cup premiership side n...
4       ocean twelve raid box office ocean twelve crim...
                              ...                        
2220    car pull  retail figure  retail sale fall janu...
2221    kilroy unveil immigration policy ex chatshow h...
2222    rem announce new glasgow concert us band rem a...
2223    political squabble snowball become commonplace...
2224    souness delight euro progress boss graeme soun...
Name: text_lemmatize_clean, Length: 2225, dtype: object


In [25]:
vocabulary = []
for sentence in tokenized:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2_idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx_2word = {idx: w for (idx, w) in enumerate(vocabulary)}
vocabulary_size = len(vocabulary)

In [26]:
idx_2word

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)






In [27]:
word2_idx

23477


In [30]:
window_size = 2
idx_pairs = []
for sentence in tokenized:
    indices = [word2_idx[word] for word in sentence]
    for center_word_pos in range(len(indices)):
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) 

In [31]:
idx_pairs[:10]

array([[0, 1],
       [0, 2],
       [1, 0],
       [1, 2],
       [1, 3],
       [2, 0],
       [2, 1],
       [2, 3],
       [2, 4],
       [3, 1]])

In [32]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [None]:
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 1000
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())
        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)
        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.data.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f' {epo}: {loss_val/len(idx_pairs)}')