In [1]:
import re
import pickle 
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset , DataLoader

In [2]:
pickle_file = open("/content/plots_text.pickle",'rb')

In [3]:
movie_file = pickle.load(pickle_file)

In [4]:
movie_file[0]

'barry is a private with the 101st airborne division of the united states army, stationed at fort campbell, kentucky. calpernia works as a showgirl at a transgender revue in nashville, tennessee when the two met in 1999. barry\'s roommate justin fisher  brings barry to the club where she performs. when barry and calpernia begin seeing each other regularly, fisher begins spreading rumors on base about their relationship, which appeared to be a violation of the military\'s "don\'t ask, don\'t tell" policy about discussing the sexual orientation of military personnel. barry faces increasing harassment and pressure, which explode into violence over fourth of july weekend. while calpernia performs in a pageant in nashville, barry is beaten to death in his sleep with a baseball bat by calvin glover, who had been goaded by fisher into committing the crime. the film ends with a discussion of the aftermath.'

In [5]:
len(movie_file)

500

In [6]:
clean_data = [re.sub("[^a-z 0-9]","",i) for i in movie_file]

In [7]:
clean_data[0]

'barry is a private with the 101st airborne division of the united states army stationed at fort campbell kentucky calpernia works as a showgirl at a transgender revue in nashville tennessee when the two met in 1999 barrys roommate justin fisher  brings barry to the club where she performs when barry and calpernia begin seeing each other regularly fisher begins spreading rumors on base about their relationship which appeared to be a violation of the militarys dont ask dont tell policy about discussing the sexual orientation of military personnel barry faces increasing harassment and pressure which explode into violence over fourth of july weekend while calpernia performs in a pageant in nashville barry is beaten to death in his sleep with a baseball bat by calvin glover who had been goaded by fisher into committing the crime the film ends with a discussion of the aftermath'

In [8]:
#create sequence 
def create_seq(text,seq_len=8):

  sequence = []

  if len(text.split()) > seq_len:
    for i in range(seq_len,len(text.split())): #(8,50)
      #select the sequnece of tokens
      seq = text.split()[i-seq_len:i+1] #(0-9,1-10,2-11,3-12,4-13)
      sequence.append(" ".join(seq))

    return sequence


  else:
    return [text]

In [9]:
seqs = [create_seq(i) for i in clean_data]

In [10]:
#create seq into a single list

seq_list = []

for i in seqs:
    for j in i:
        seq_list.append(j)
        

In [11]:
#Alternative methods

# 1.seq_list = sum(seq,[])

# 2.seq_list = [j for i in seq for j in i]

In [12]:
len(seq_list)

151501

In [13]:
#Create input and target sequence

input_seq = []
target_seq = []

for i in seq_list:
    input_seq.append(" ".join(i.split()[:-1]))
    target_seq.append(" ".join(i.split()[1:]))

In [14]:
unique_words = set(" ".join(clean_data).split())
len(unique_words)

16531

In [15]:
#Create int_to_word and word_to_int

int_to_word = {index:word for index,word in enumerate(unique_words)}
word_to_int = {word:index for index,word in int_to_word.items()}

In [16]:
vocab_size = len(int_to_word)

In [17]:
def get_int_seq(seq):
  return [word_to_int[w] for w in seq.split()]

In [18]:
input_int_seq = [get_int_seq(i) for i in input_seq]
target_int_seq = [get_int_seq(j) for j in target_seq]

In [19]:
#converting above lists into numpy arrays

input_array = np.array(input_int_seq)
target_array = np.array(target_int_seq)

In [20]:
input_array[0]

array([ 3789,  9566,   357, 14635,  8334, 12594, 16144,  8284])

In [21]:
inputs = input_array
targets = target_array

In [22]:
#train & test validation

train_ratio = 0.8
valid_ratio = (1-train_ratio)/2

total = len(input_array)

train_cutoff = int(total*train_ratio)
valid_cutoff = int((total*(1-valid_ratio))) #(50*(0.9)) 45

#numpy to tensor
train_x, train_y = torch.Tensor(inputs[:train_cutoff]).long() , torch.Tensor(targets[:train_cutoff]).long()
valid_x , valid_y = torch.Tensor(inputs[train_cutoff:valid_cutoff]).long() , torch.Tensor(targets[train_cutoff:valid_cutoff]).long()
test_x , test_y = torch.Tensor(inputs[valid_cutoff:]).long() , torch.Tensor(targets[valid_cutoff:]).long()


train_data = TensorDataset(train_x,train_y)
valid_data = TensorDataset(valid_x ,valid_y)
test_data = TensorDataset(test_x,test_y)

batch_size = 32


train_loader = DataLoader(train_data,batch_size = batch_size,shuffle=True,drop_last=True)
valid_loader = DataLoader(valid_data,batch_size = batch_size,shuffle=True,drop_last=True)
test_loader = DataLoader(test_data,batch_size = batch_size,shuffle=True,drop_last=True)

In [23]:
for x , y in train_loader:
  print(x.shape,y.shape)
  break

torch.Size([32, 8]) torch.Size([32, 8])


In [24]:
class TextGenerationLSTM(nn.Module):

  def __init__(self,n_vocab,n_embed,n_hidden,n_layers,drop_pr = 0.25):
    super().__init__()
    self.n_vocab = n_vocab
    self.n_embed = n_embed
    self.n_hidden = n_hidden 
    self.drop_pr = drop_pr 
    self.n_layers = n_layers

    self.embedding = nn.Embedding(n_vocab,n_embed)
    self.lstm = nn.LSTM(n_embed,n_hidden,n_layers,batch_first = True)
    self.dropout = nn.Dropout(drop_pr)
    self.fc = nn.Linear(n_hidden,n_vocab)

  #(input , (hidden_state & cell state))
  def forward(self,x,hidden):
    
    #pass the input through embedding layers
    embedded = self.embedding(x)

    #Get the output and new hidden state and cell state
    n_out , hidden = self.lstm(embedded,hidden) #(lstm) (input,(hidden_state,cell_state)) 

    #pass the droupout
    out = self.dropout(n_out)

    #reshape
    #out = out.contingous().view(-1,self.n_hidden)
    out = out.reshape(-1,self.n_hidden)

    out = self.fc(out)

    return out,hidden


  def init_hidden(self,batch_size):

    '''initialize the hidden and cell state with size (n_layers * batch_size * n_hidden) ,
    initialize to zero for both hidden and cell state of LSTM model '''

    #hidden layer = Total no of neurons in hidden state (hidden_state value)

    # LSTM (hidden state and cell state) (hidden state , cell (memory))
    #120 neurons state ()

    weights = next(self.parameters()).data

    if (torch.cuda.is_available()):
      hidden = (weights.new(self.n_layers , batch_size,self.n_hidden).zero_().cuda(),
         weights.new(self.n_layers,batch_size,self.n_hidden).zero_().cuda())

    else:
      hidden = (weights.new(self.n_layers , batch_size,self.n_hidden).zero_(),
         weights.new(self.n_layers,batch_size,self.n_hidden).zero_())
      
    return hidden

In [25]:
#n_vocab,n_embed,n_hidden,n_layers,drop_pr = 0.25
n_vocab = len(int_to_word)
n_embed = 200
n_layers = 3
n_hidden = 128

net = TextGenerationLSTM(n_vocab,n_embed,n_hidden,n_layers,drop_pr = 0.25)

In [26]:
net
net.cuda()

TextGenerationLSTM(
  (embedding): Embedding(16531, 200)
  (lstm): LSTM(200, 128, num_layers=3, batch_first=True)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=128, out_features=16531, bias=True)
)

In [27]:
optimizer = torch.optim.Adam(net.parameters(),lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [28]:
def acc(pred,label):
  pred = torch.round(pred)
  return torch.sum(pred==label).item()

In [29]:
def train_sentiment(net,loss_fn,optimizer,epochs=20):

  for epoch in range(epochs):
    h = net.init_hidden(batch_size=32)
    #print(h[0].shape)
    train_loss = 0.0
    train_acc = 0.0
    valid_loss = 0.0
    valid_acc = 0.0


    net.train()

    for input,labels in train_loader:
      h = tuple([i.data for i in h]) #tuple([hidden_state,cell_state])
      input , labels = input.cuda() , labels.cuda()
      #print(h[0].shape)
      # print(input.shape,labels.shape)
     
    
      output,h = net(input,h)
      optimizer.zero_grad()

      loss = loss_fn(output,labels.view(-1))
      loss.backward()
      train_loss += loss.item()

      # print(labels.shape)
      #print(output.shape)
      #accuracy = acc(output,labels.view(-1))
      #train_acc += accuracy

      optimizer.step()

    net.eval()

    for input,labels in valid_loader:
      input , labels = input.cuda() , labels.cuda()
      h = tuple([i.data for i in h]) #tuple([hidden_state,cell_state])
      output,h = net(input,h)
      

      loss = loss_fn(output,labels.view(-1))
      valid_loss += loss.item()

      # accuracy = acc(output,labels)

      # valid_acc += accuracy

    print("Epoch {} , Train_loss : {:.2f},valid_loss : {:.2f}".format(
        epoch+1 , train_loss/len(test_loader),valid_loss/len(valid_loader)))

In [31]:
train_sentiment(net,loss_fn,optimizer,epochs=20)

Epoch 1 , Train_loss : 37.10,valid_loss : 9.51
Epoch 2 , Train_loss : 34.94,valid_loss : 10.08
Epoch 3 , Train_loss : 33.28,valid_loss : 10.52
Epoch 4 , Train_loss : 31.93,valid_loss : 10.93
Epoch 5 , Train_loss : 30.82,valid_loss : 11.25
Epoch 6 , Train_loss : 29.88,valid_loss : 11.49
Epoch 7 , Train_loss : 29.05,valid_loss : 11.74
Epoch 8 , Train_loss : 28.31,valid_loss : 12.01
Epoch 9 , Train_loss : 27.64,valid_loss : 12.20
Epoch 10 , Train_loss : 27.02,valid_loss : 12.38
Epoch 11 , Train_loss : 26.49,valid_loss : 12.57
Epoch 12 , Train_loss : 26.00,valid_loss : 12.70
Epoch 13 , Train_loss : 25.55,valid_loss : 12.91
Epoch 14 , Train_loss : 25.14,valid_loss : 13.03
Epoch 15 , Train_loss : 24.77,valid_loss : 13.15
Epoch 16 , Train_loss : 24.41,valid_loss : 13.25
Epoch 17 , Train_loss : 24.08,valid_loss : 13.37
Epoch 18 , Train_loss : 23.77,valid_loss : 13.52
Epoch 19 , Train_loss : 23.48,valid_loss : 13.63
Epoch 20 , Train_loss : 23.20,valid_loss : 13.74


In [38]:
def predict(net,tkn,h = None):

  # tensor inputs
  # net.cuda()
  # net.eval()
  # h = net.init_hidden(1)

  x = np.array([[word_to_int[tkn]]])
  inputs = torch.from_numpy(x)

  # To gpu
  inputs = inputs.cuda()

  # detech hidden & cell state 
  h = tuple([i.data for i in h])

  # get the output of the model
  out , h = net(inputs,h)

  # get the token proba
  p = F.softmax(out,dim=1).data # cuda

  p = p.cpu()
  p = p.numpy()

  p = p.reshape(p.shape[1],)

  #get indices of next or top 3 values
  top_n_indx = p.argsort()[-1:][::-1][0]

  #random selct one of the two indecies 

  return int_to_word[top_n_indx] , h

In [39]:
#function generate text
def sample(net,size,prime="it is good"):

  # To GPU
  net.cuda()

  net.eval()

  #batch size as 1
  h = net.init_hidden(1)

  tokens = prime.split() # ["it","is","good"]

  #predict next token
  for t in tokens:
    token , h = predict(net,t.lower(),h)

  tokens.append(token)

  #predict subseuit toekns
  for i in range(size):
    token , h  = predict(net,tokens[-1],h)
    tokens.append(token)

  return " ".join(tokens)

In [40]:
sample(net,100,"we have to find the")

'we have to find the sword of justice and the two men back to the campfire that he will not be a violation of the militarys dont make a tooth while on the way to the park and tells him that she has been in love with her but lacks the confidence or social skills to get a chemical substance that julie developed that he is in love with her but lacks the confidence or social skills to get a chemical substance that julie developed that he is in love with her but lacks the confidence or social skills to get a chemical substance that julie'

In [41]:
sample(net,50,"calvin glover who")

'calvin glover who had been a martial arts competition the two men chase chicago to the park and tells him that she has been in love with her but lacks the confidence or social skills to get a chemical substance that julie developed that he is in love with her but lacks the confidence'