In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import os

In [2]:
data_df = pd.read_csv('/kaggle/input/slogan-dataset/sloganlist.csv')
data_df.head()

Unnamed: 0,Company,Slogan
0,Costa Coffee,For coffee lovers.
1,Evian,Evian. Live young.
2,Dasani,Designed to make a difference.
3,Heineken,It's all about the beer.
4,Gatorade,The Legend Continues.


In [3]:
data_df['slogan_len'] = data_df['Slogan'].apply(lambda x: len(x.split()))

In [4]:
data_df['slogan_len'].describe()

count    1162.000000
mean        3.839071
std         1.280244
min         2.000000
25%         3.000000
50%         3.000000
75%         5.000000
max        10.000000
Name: slogan_len, dtype: float64

In [5]:
unique_words = set()
for i, row in data_df.iterrows():
    for word in row['Slogan'].split():
        unique_words.add(word.lower())
        

In [6]:
words_sorted = sorted(unique_words)
word2int = {word:i for i,word in enumerate(words_sorted)}
word_array = np.array(words_sorted)
words_encoded = np.array([word2int[ch] for ch in unique_words],dtype=np.int32)

In [7]:
text = "For coffee lovers."
text = text.lower()
encoding = [word2int[word] for word in text.split()]
reverse = " ".join(word_array[enc] for enc in encoding)
print(encoding)
print(reverse)

[362, 187, 594]
for coffee lovers.


In [8]:
import torch
from torch.utils.data import Dataset
seq_length = 1
chunk_size = seq_length + 1
text_chunks = [words_encoded[i:i+chunk_size] for i in range(len(words_encoded)-chunk_size)]


In [9]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()

In [10]:
seq_dataset = TextDataset(torch.tensor(text_chunks))

  """Entry point for launching an IPython kernel.


In [11]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x): ',repr(''.join(word_array[seq])))
    print('Target (y): ',repr(''.join(word_array[target])))
    print()
    if i == 1:
        break

 Input (x):  'passion'
Target (y):  "everyone's"

 Input (x):  "everyone's"
Target (y):  'floaty'



In [12]:
from torch.utils.data import DataLoader
batch_size = 32
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size,
shuffle=True, drop_last=True)

In [13]:
import torch.nn as nn
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
        batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell

In [14]:
vocab_size = len(word_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)

In [15]:
model

RNN(
  (embedding): Embedding(1132, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=1132, bias=True)
)

In [16]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [17]:
num_epochs = 5000
torch.manual_seed(1)
for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 7.0425
Epoch 500 loss: 0.0241
Epoch 1000 loss: 0.0053
Epoch 1500 loss: 0.0022
Epoch 2000 loss: 0.0012
Epoch 2500 loss: 0.0008
Epoch 3000 loss: 0.0005
Epoch 3500 loss: 0.0003
Epoch 4000 loss: 0.0003
Epoch 4500 loss: 0.0002


In [18]:
from torch.distributions.categorical import Categorical
torch.manual_seed(1)
logits = torch.tensor([[1.0, 1.0, 3.0]])
print('Probabilities:',nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples.numpy())

Probabilities: [0.10650698 0.10650698 0.78698605]
[[0]
 [2]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


In [26]:
def sample(model, starting_str,len_generated_text=5, scale_factor=1.0):
    encoded_input = torch.tensor([word2int[s] for s in starting_str.split()])
    encoded_input = torch.reshape(encoded_input, (1, -1))
    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
  
    for c in range(len(starting_str.split())-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
         logits, hidden, cell = model(last_char.view(1), hidden, cell)
         logits = torch.squeeze(logits, 0)
         scaled_logits = logits * scale_factor
         m = Categorical(logits=scaled_logits)
         last_char = m.sample()
    generated_str += str(word_array[last_char])

    return generated_str


In [33]:
torch.manual_seed(1)
print(sample(model, starting_str='best'))

bestmake.
