# INM706 Deep Learning for Sequence Analysis
### Sarah Rhalem (190051884) & Stelios Kliafas (########)

Draft Notes/ Working comments:
Dataset >> Problem +Evaluation Metric >> Model

In [1]:
!pip install transformers



You should consider upgrading via the 'c:\python\python38\python.exe -m pip install --upgrade pip' command.





In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import os
import json
import torch
import re
import numpy as np
import pandas as pd
import csv
import random
import time
from torch.utils.data import Dataset, DataLoader, random_split, SequentialSampler, RandomSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AdamW, get_linear_schedule_with_warmup

In [4]:
# Set to use GPU on device if available:
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [5]:
# Working directory
os.getcwd()

'C:\\Users\\public2\\Desktop\\INM706_DL_Sequence_Analysis'

In [6]:
# Load csv dataset, create listing column
raw_dataset_df= pd.read_csv(os.path.join("Data\\testing_dataset.csv") , encoding="utf8")
raw_dataset_df["listing"]= raw_dataset_df["listed_in"].str.split(pat=",", n=1).str.get(0)

# Cleanse Data
raw_dataset_df["description"].isna().sum() # Check null entries for description - None
raw_dataset_df["plot_description"]=raw_dataset_df["description"].map(lambda x: re.sub( r'"', '', x)) # (TO BE UPDATED REMOVES QUOTATION MARKS)

raw_dataset_df.listing.value_counts()

Dramas                    24
Comedies                  15
Action & Adventure        11
Documentaries             11
Horror Movies             11
International TV Shows     8
Crime TV Shows             7
British TV Shows           2
Movies                     2
Sports Movies              1
Reality TV                 1
TV Comedies                1
Anime Series               1
International Movies       1
Sci-Fi & Fantasy           1
Docuseries                 1
Independent Movies         1
Name: listing, dtype: int64

In [7]:
raw_dataset_df["description"].to_list()[0]

'In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.'

In [8]:
# Map each data sample listing to a generic genre
 
 # Identify the show listings for mapping to summarised genres
raw_dataset_df.listing.value_counts()

# map show listing to a specific genre. Note: Listing types with under ~100 data samples are classified under the genre "Other"
genre_mapping= { "<romance>": {"Romantic TV Shows", "Romantic Movies"} ,
                "<drama>": {"Dramas", "TV Dramas"}  ,
                 "<comedy>": {"Comedies", "Stand-Up Comedy", "TV Comedies", "Stand-Up Comedy & Talk Shows"},
                 "<documentary>": {"Documentaries", "Docuseries"},
                 "<action>": {"Action & Adventure", "TV Action & Adventure"} ,
                 "<international>": {"International TV Shows", "International Movies", "Spanish-Language TV Shows"},
                 "<children>": {"Children & Family Movies", "Kids' TV"},
                 "<crime>": {"Crime TV Shows"},
                 "<horror>": {"Horror Movies", "TV Horror"} ,
                 "<anime>" : {"Anime Series", "Anime Features"},
                 "<other>" : {"Thrillers", "British TV Shows", "Reality TV", "Classic & Cult TV", "TV Shows", "TV Sci-Fi & Fantasy",
                         "Classic Movies", "Movies", "Independent Movies", "Cult Movies", "Sports Movies", "LGBTQ Movies", "Music & Musicals",
                         "Sci-Fi & Fantasy"} }

# function to map listings to genres by dictionary key
def map_function(dictionary):
    def my_map(x):
        res = ""
        for key in dictionary.keys():
            if (x in dictionary[key]):
                res = key
                break
        return res
    return my_map

# Add genre column based on listing mapping
raw_dataset_df["genre"] = raw_dataset_df["listing"].map(map_function(genre_mapping))

# Write to txt file
plot_dataset_df= raw_dataset_df[["genre","plot_description"]].copy()
plot_dataset= plot_dataset_df.to_csv('Data\\netflix_plot_dataset.txt', index=False, header=None, sep=" ")

# Sense check - view data header and check all descriptions were mapped
print(plot_dataset_df.head())
print(plot_dataset_df.genre.unique())



             genre                                   plot_description
0  <international>  In a future where the elite inhabit an island ...
1          <drama>  After a devastating earthquake hits Mexico Cit...
2         <horror>  When an army recruit is found dead, his fellow...
3         <action>  In a postapocalyptic world, rag-doll robots hi...
4          <drama>  A brilliant group of students become card-coun...
['<international>' '<drama>' '<horror>' '<action>' '<crime>'
 '<documentary>' '<other>' '<comedy>' '<anime>']


In [9]:
 # Load model and Tokenizer
configuration = GPT2Config.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

configuration.pad_token_id = configuration.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

model= GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
model= model.to(device) 


special_tokens_dict = {
                "bos_token": "<|startoftext|>",
                "eos_token": "<|endoftext|>",
                "additional_special_tokens": [
                    "<romance>",
                    "<drama>",
                    "<comedy>",
                    "<documentary>",
                    "<action>",
                    "<international>",
                    "<children>",
                    "<crime>",
                    "<horror>",
                    "<anime>",
                    "<other>",
                ],
            }

num_of_toks= tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_of_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))



We have added 12 tokens


Embedding(50269, 768)

In [10]:
# Dataset Class
class NetflixPlotDataset(Dataset):
          def __init__(self, tokenizer=tokenizer, dataset_path=os.path.join("Data\\netflix_plot_dataset.txt"), block_size=768): # block_size missing
   
              with open(dataset_path, encoding="utf-8") as f:
                      lines = ["<|startoftext|>"+line+"<|endoftext|>" for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

              self.input_ids = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size, truncation=True, padding='max_length')["input_ids"] 
                
        
          def __len__(self):
            return len(self.input_ids)

          def __getitem__(self, i):
             return torch.tensor(self.input_ids[i], dtype=torch.long)
        

In [11]:
# Instantiate dataset and return length
dataset=NetflixPlotDataset()
dataset.__len__()

99

In [12]:
def split_datasets_and_create_dataloaders(dataset):
  
  training_validation_proportion = int(0.8 * len(dataset))
                              
  train_valid_dataset, test_dataset = random_split(dataset, [training_validation_proportion, len(dataset)- training_validation_proportion])
                              
  validation_proportion = int(0.25 * len(train_valid_dataset))

  training_dataset, validation_dataset = random_split(train_valid_dataset, [len(train_valid_dataset) - validation_proportion, validation_proportion])

  print("Number of Testing samples: ", len(test_dataset))
  print("Number of Validation samples: ", len(validation_dataset) )
  print("Number of Training samples: ", len(training_dataset))

  train_dataloader = DataLoader(
              training_dataset,
              batch_size = 5,
              shuffle = True
          )

  validation_dataloader = DataLoader(
              validation_dataset,
              batch_size = 5,
              shuffle = True
          )
  
  dataloaders = {'train_dataloader': train_dataloader, 'validation_dataloader': validation_dataloader}
  
  return dataloaders

In [13]:
dataloaders = split_datasets_and_create_dataloaders(dataset)

Number of Testing samples:  20
Number of Validation samples:  19
Number of Training samples:  60


In [14]:
def save_checkpoint(state, checkpoint_path):
  print("Saving checkpoint ... ")
  torch.save(state, checkpoint_path)
  print("Checkpoint:", checkpoint_path, "saved.")


def load_checkpoint(model, optimizer, scheduler, load_checkpoint_path):
  print("Loading checkpoint ... ")
  checkpoint = torch.load(load_checkpoint_path)
  start_epoch = checkpoint['epoch']
  model.load_state_dict(checkpoint['state_dict'])
  scheduler.load_state_dict(checkpoint['scheduler'])
  optimizer.load_state_dict(checkpoint['optimizer'])
  return model, optimizer, scheduler, start_epoch

In [15]:
def format_time(start_time,end_time):
  hours, remainder = divmod(end_time - start_time, 3600)
  minutes, seconds = divmod(remainder, 60)
  return ("{:0>2}:{:0>2}:{:0>2}".format(int(hours), int(minutes), int(seconds)))

def set_seed():
  random.seed(100)
  np.random.seed(100)
  torch.manual_seed(100)
  torch.cuda.manual_seed_all(100)

In [16]:
def training_and_validation(model, optimizer, scheduler, dataloaders, starting_epoch, epochs = 5):
  
  print("\n\n" + "-" * 15)
  print("| TRAINING... |")
  print("-" * 15)
  set_seed()

  for epoch in range(starting_epoch, epochs):
      print("")
      print('Training...')
      print("Epoch:", epoch + 1, "/", epochs )
      

      start_training_time = time.time()
      training_loss = 0
      model.train()

      for step, batch in enumerate(dataloaders['train_dataloader']):
          input_ids = batch[0].to(device)
          model.zero_grad()        
          outputs = model(input_ids, labels=input_ids)

          loss = outputs[0]
          batch_loss = loss.item()  
          training_loss += batch_loss

          if step % 10 == 0 and step != 0:
              print("Batch ", step, "/", len(dataloaders['train_dataloader']), ", Loss: ", batch_loss) 

              model.eval()
              if step % 250 == 0 and step != 0:
                samples = model.generate(
                                        bos_token_id=random.randint(1,30000),
                                        do_sample=True,   
                                        top_k=50,
                                        max_length = 200,
                                        top_p = 0.95,
                                        num_return_sequences=1,
                                        no_repeat_ngram_size = 2,
                                    )

                for i, sample in enumerate(samples):
                  print("{}".format(tokenizer.decode(sample, skip_special_tokens=True)))
              
              model.train()

          loss.backward()
          optimizer.step()
          scheduler.step()

      epoch_loss = training_loss / len(dataloaders['train_dataloader'])       
      end_epoch_time = time.time()
      epoch_time = format_time(start_training_time, end_epoch_time)

      print("Epoch Training time: ", epoch_time)
      print("")
      print("Mean Training loss: ", epoch_loss)

      print("")
      print("Validating...")

      start_validation_time = time.time()
      
      model.eval()
      validation_loss = 0
      validation_steps = 0

      for batch in dataloaders['validation_dataloader']:
          input_ids = batch[0].to(device)
          
          with torch.no_grad():        

              outputs  = model(input_ids, labels=input_ids)          
              loss = outputs[0]
              batch_loss = loss.item()  
              
          validation_loss += batch_loss        

      mean_validation_loss = validation_loss / len(dataloaders['validation_dataloader'])
      
      end_validation_time = time.time()
      epoch_validation_time = format_time(start_validation_time, end_validation_time) 
      print("Validation time: ", epoch_validation_time)  
      print("Mean Validation Loss: ", mean_validation_loss)

      if epoch % 1 == 0:
        checkpoint = {
          'state_dict': model.state_dict(),
          'optimizer': optimizer.state_dict(),
          'scheduler': scheduler.state_dict(),
          'validation_loss': mean_validation_loss,
          'training_loss': epoch_loss,
          'epoch': epoch + 1,
          }
        save_checkpoint(checkpoint, f"./checkpoint_{checkpoint['epoch']}.pth.tar")
  print("")
  print("Training Finished")

In [17]:
# First Training time... no checkpoint

steps = len(dataloaders['train_dataloader']) * 5

optimizer = AdamW(model.parameters(),
                lr = 4e-3,
                eps = 1e-8)

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                          num_warmup_steps = 1e3, 
                                          num_training_steps = steps)

training_and_validation(model, optimizer, scheduler, dataloaders, 0)



---------------
| TRAINING... |
---------------

Training...
Epoch: 1 / 5
Batch  10 / 12 , Loss:  5.0036234855651855
Epoch Training time:  00:06:26

Mean Training loss:  35.321122209231056

Validating...
Validation time:  00:00:33
Mean Validation Loss:  33.611897468566895
Saving checkpoint ... 




Checkpoint: ./saved/checkpoint_1.pth.tar saved.

Training...
Epoch: 2 / 5
Batch  10 / 12 , Loss:  1.4097317457199097
Epoch Training time:  00:05:28

Mean Training loss:  2.8084890246391296

Validating...
Validation time:  00:00:39
Mean Validation Loss:  1.2599167078733444
Saving checkpoint ... 
Checkpoint: ./saved/checkpoint_2.pth.tar saved.

Training...
Epoch: 3 / 5
Batch  10 / 12 , Loss:  0.6160365343093872
Epoch Training time:  00:04:10

Mean Training loss:  0.5852350716789564

Validating...
Validation time:  00:00:29
Mean Validation Loss:  0.3207813985645771
Saving checkpoint ... 
Checkpoint: ./saved/checkpoint_3.pth.tar saved.

Training...
Epoch: 4 / 5
Batch  10 / 12 , Loss:  0.42547041177749634
Epoch Training time:  00:04:39

Mean Training loss:  0.36747600014011067

Validating...
Validation time:  00:00:39
Mean Validation Loss:  0.29345547035336494
Saving checkpoint ... 
Checkpoint: ./saved/checkpoint_4.pth.tar saved.

Training...
Epoch: 5 / 5
Batch  10 / 12 , Loss:  0.301925688

In [19]:
# Load checkpoint from last epoch and continue training

model, optimizer, scheduler, start_epoch = load_checkpoint(model, optimizer, scheduler, "./checkpoint_1.pth.tar")

training_and_validation(model, optimizer, scheduler, dataloaders, start_epoch)

Loading checkpoint ... 






---------------
| TRAINING... |
---------------

Training...
Epoch: 2 / 5


KeyboardInterrupt: 

In [None]:
# Generate Movie Plots

model.eval()

prompt = "<|startoftext|> <drama>"

outputs = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
outputs = outputs.to(device)

movie_plots = model.generate(outputs, do_sample=True, 
                             max_length=300, min_length=100,
                             top_k=50,  top_p=0.95, 
                             num_return_sequences=10, no_repeat_ngram_size = 2, 
                             repetition_penalty = 1.2
                             )

for index, movie_plot in enumerate(movie_plots):
  print("\n Sample movie plot: ", tokenizer.decode(movie_plot, skip_special_tokens=True))