In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from tqdm import tqdm
import nltk
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer
import pandas as pd
from termcolor import colored
from collections import Counter
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchtext.data.utils import get_tokenizer

# Poetry database. Check the cleaning_data.ipynb notebook for more details about the data collection.
url = "https://raw.githubusercontent.com/remi-vidal/NLP-ensae/main/df_cleaned.csv"
df = pd.read_csv(url, index_col=0)

In [9]:
PAD_IDX = 0#UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<pad>']#['<unk>', '<pad>', '<sos>', '<eos>']

In [98]:
lonely_content = df[df.theme == 'lonely'].content

parsed_data = lonely_content.apply(lambda x: x.lower().split("\n"))

corpus = []
for row in parsed_data:
  corpus.extend(row)

In [37]:
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [103]:
# Remove punctuation
import string

corpus = [text.translate(str.maketrans('', '', string.punctuation)) for text in corpus]

In [104]:
import en_core_web_sm
tok = en_core_web_sm.load()#spacy.load("en_core_web_sm")#get_tokenizer('spacy', language='en')#TweetTokenizer()

# stoi string to index
# itos index to string

VOC = {'stoi': {}, 'counts': {}, 'nbwords': 0}

def parse_entry(entry):
  tokens = tok(entry.lower().strip())#tok.tokenize(entry.lower().strip())
  for t in tokens:
    t = str(t)
    if t not in VOC['stoi']: 
      VOC['stoi'][t] = VOC['nbwords']
      VOC['counts'][t] = 1
      VOC['nbwords'] += 1
    else:
      VOC['counts'][t] = VOC['counts'][t] + 1


for t in special_symbols: 
  VOC['stoi'][t] = VOC['nbwords']
  VOC['counts'][t] = 1
  VOC['nbwords'] += 1

for entry in tqdm(corpus): 
  parse_entry(entry)

VOC['itos'] = { v:k for k,v in VOC['stoi'].items()}

100%|██████████| 2058/2058 [00:13<00:00, 148.19it/s]


In [105]:
def data_process(corpus, vocab):
    data = list()
    for text in corpus:
        token_list = [vocab['stoi'][str(token)] for token in tok(text.lower().strip())]#tok.tokenize(text.lower().strip())]
        for i in range(1, len(token_list)):
            n_gram_seq = torch.tensor(token_list[:i+1], dtype=torch.long)
            data.append(n_gram_seq)
    return data

train_data = data_process(corpus, VOC)

In [106]:
X = [i[:-1] for i in train_data]   # taking all the words except the last in the input set
y = [i[-1] for i in train_data]    # taking last words in the output set

In [107]:
class CustomDataset(Dataset):
    """custom dataset."""

    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.maxlen = 0
        for u in X:
          self.maxlen = max(self.maxlen, len(u))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
      # We pad the X dynamically, so that it has the same length

        sample = F.pad(self.X[idx], (0, self.maxlen - len(self.X[idx])), "constant", VOC['stoi']['<pad>']), self.y[idx]

        return sample

In [108]:
train_dataset = CustomDataset(X, y)#TensorDataset(X, y)

In [109]:
VOCAB_SIZE = max(VOC['stoi'].values())+1
EMBEDDING_DIM = 256
HIDDEN_SIZE = 256
NUM_LAYERS = 3

BATCH_SIZE = 64
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [110]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout=0.15):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, X, h=None, c=None):
        if h is None:
            h, c = self.init_state(X.size(0))
        out = self.embedding(X)
        out, (h, c) = self.lstm(out, (h, c))
        out = out.contiguous().view(-1, self.hidden_size)
        out = self.fc1(out)
        out = out.view(-1, X.size(1), self.vocab_size)
        out = out[:, -1]
        
        return out, h, c
    
    def init_state(self, batch_size):
        num_l = self.num_layers
        hidden = torch.zeros(num_l, batch_size, self.hidden_size).to(DEVICE)
        cell = torch.zeros(num_l, batch_size, self.hidden_size).to(DEVICE)
        return hidden, cell

In [111]:
model = Net(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [112]:
EPOCHS = 100

for epoch in range(EPOCHS):
    epoch_loss = 0
    for X, y in train_dataloader:
        X = X.to(DEVICE)
        y = y.to(DEVICE)
        
        optimizer.zero_grad()
        output, h, c = model(X)
        loss = criterion(output, y)
        epoch_loss += loss
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5) # Clipping Gradients
        optimizer.step()
    if epoch%5 == 0:
        print(f"Epoch: {epoch+1} Loss:{epoch_loss/len(train_dataloader)}")

Epoch: 1 Loss:6.804762363433838
Epoch: 6 Loss:6.373279094696045
Epoch: 11 Loss:5.57279109954834
Epoch: 16 Loss:4.659511089324951
Epoch: 21 Loss:3.8566181659698486
Epoch: 26 Loss:3.096052646636963
Epoch: 31 Loss:2.3997631072998047
Epoch: 36 Loss:1.7979499101638794
Epoch: 41 Loss:1.3026727437973022
Epoch: 46 Loss:1.0171420574188232
Epoch: 51 Loss:0.8731924891471863
Epoch: 56 Loss:0.7847435474395752
Epoch: 61 Loss:0.7471028566360474
Epoch: 66 Loss:0.6975546479225159
Epoch: 71 Loss:0.6815626621246338
Epoch: 76 Loss:0.6685712933540344
Epoch: 81 Loss:0.6910117268562317
Epoch: 86 Loss:0.6432894468307495
Epoch: 91 Loss:0.6282094717025757
Epoch: 96 Loss:0.6376839280128479


In [113]:
torch.save(model.state_dict(), "loneliness_nopunc_256.pth")

In [114]:
seed_text = "i am such a lonely man" #Starting of a song
next_words = 50

for i in range(next_words):
    token_list = np.ones(21, dtype=int)
    text_token = np.array([VOC['stoi'][str(token)] for token in tok(seed_text)])
    if len(text_token)>21:text_token = text_token[-21:]
    token_list[:len(text_token)] = text_token
    token_list = torch.from_numpy(token_list).unsqueeze(0).to(DEVICE)
    
    
    out,h,c = model(token_list)
    
    idx = torch.multinomial(nn.Softmax()(out.flatten()), 1)#torch.argmax(out)
    seed_text += " " + VOC['itos'][int(idx)]
    
for i,word in enumerate(seed_text.split()):
    print(word,end=" "),
    if i!=0 and (i+1)%5==0:
        print("\n")

  idx = torch.multinomial(nn.Softmax()(out.flatten()), 1)#torch.argmax(out)


i am such a lonely 

man hiss happily existence feeling 

visitors think always room feeling 

want suddenly glide imported gleam 

gosh rhyme speak frames cloudy 

comfort ones crown jesus dear 

enlightenment merely shine stared air 

meant shut touch help manage 

gleam creep shoulderlength speak blood 

bittersweet hates glinting drown younger 

looking hand mouths sipping feeling 

disappear 