# INM706 Deep Learning for Sequence Analysis
### Sarah Rhalem (190051884) & Stelios Kliafas (########)

Draft Notes/ Working comments:
Dataset >> Problem +Evaluation Metric >> Model

In [317]:
import os
import json
import torch
import re
import numpy as np
import pandas as pd
import csv
from torch.utils.data import Dataset, dataloader

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset


In [9]:
#Set to use GPU on device if available:
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [10]:
# Working directory
os.getcwd()

'C:\\Users\\sarah\\Documents\\MSc AI 2020_2021\\INM706\\INM706_DL_Sequence_Analysis'

In [11]:
# Load csv dataset, create listing column
raw_dataset_df= pd.read_csv(os.path.join("Data\\netflix_titles.csv") , encoding="utf8")
raw_dataset_df["listing"]= raw_dataset_df["listed_in"].str.split(pat=",", n=1).str.get(0)

# Cleanse Data
raw_dataset_df["description"].isna().sum() # Check null entries for description - None
raw_dataset_df["plot_description"]=raw_dataset_df["description"].map(lambda x: re.sub( r'"', '', x)) # (TO BE UPDATED REMOVES QUOTATION MARKS)

raw_dataset_df.listing.value_counts()

Dramas                          1384
Comedies                        1074
Documentaries                    751
Action & Adventure               721
International TV Shows           690
Children & Family Movies         502
Crime TV Shows                   369
Kids' TV                         359
Stand-Up Comedy                  321
Horror Movies                    244
British TV Shows                 232
Docuseries                       194
Anime Series                     148
International Movies             114
TV Comedies                      110
Reality TV                       102
Classic Movies                    77
TV Dramas                         62
Movies                            56
Thrillers                         49
TV Action & Adventure             37
Stand-Up Comedy & Talk Shows      33
Romantic TV Shows                 28
Classic & Cult TV                 21
Independent Movies                20
Anime Features                    19
Music & Musicals                  17
C

In [298]:
raw_dataset_df["description"].to_list()[0]

'In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.'

In [349]:
# Map each data sample listing to a generic genre
 
 # Identify the show listings for mapping to summarised genres
raw_dataset_df.listing.value_counts()

# map show listing to a specific genre. Note: Listing types with under ~100 data samples are classified under the genre "Other"
genre_mapping= { "<romance>": {"Romantic TV Shows", "Romantic Movies"} ,
                "<drama>": {"Dramas", "TV Dramas"}  ,
                 "<comedy>": {"Comedies", "Stand-Up Comedy", "TV Comedies", "Stand-Up Comedy & Talk Shows"},
                 "<documentary>": {"Documentaries", "Docuseries"},
                 "<action>": {"Action & Adventure", "TV Action & Adventure"} ,
                 "<international>": {"International TV Shows", "International Movies", "Spanish-Language TV Shows"},
                 "<children>": {"Children & Family Movies", "Kids' TV"},
                 "<crime>": {"Crime TV Shows"},
                 "<horror>": {"Horror Movies", "TV Horror"} ,
                 "<anime>" : {"Anime Series", "Anime Features"},
                 "<other>" : {"Thrillers", "British TV Shows", "Reality TV", "Classic & Cult TV", "TV Shows", "TV Sci-Fi & Fantasy",
                         "Classic Movies", "Movies", "Independent Movies", "Cult Movies", "Sports Movies", "LGBTQ Movies", "Music & Musicals",
                         "Sci-Fi & Fantasy"} }

# function to map listings to genres by dictionary key
def map_function(dictionary):
    def my_map(x):
        res = ""
        for key in dictionary.keys():
            if (x in dictionary[key]):
                res = key
                break
        return res
    return my_map

# Add genre column based on listing mapping
raw_dataset_df["genre"] = raw_dataset_df["listing"].map(map_function(genre_mapping))

# Write to txt file
plot_dataset_df= raw_dataset_df[["genre","plot_description"]].copy()
plot_dataset= plot_dataset_df.to_csv('Data\\netflix_plot_dataset.txt', index=False, header=None, sep=" ")

# Sense check - view data header and check all descriptions were mapped
print(plot_dataset_df.head())
print(plot_dataset_df.genre.unique())



             genre                                   plot_description
0  <international>  In a future where the elite inhabit an island ...
1          <drama>  After a devastating earthquake hits Mexico Cit...
2         <horror>  When an army recruit is found dead, his fellow...
3         <action>  In a postapocalyptic world, rag-doll robots hi...
4          <drama>  A brilliant group of students become card-coun...
['<international>' '<drama>' '<horror>' '<action>' '<crime>'
 '<documentary>' '<other>' '<comedy>' '<anime>' '<children>' '<romance>']


In [355]:
 # Load model and Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model= GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)
model= model.to(device) 


special_tokens_dict = {
                "bos_token": "<|startoftext|>",
                "eos_token": "<|endoftext|>",
               # "pad_token": "<PAD>", # NOTE: REVIEW CAN BE REMOVED AS PAD TOKEN HAS BEEN SET TO EOS TOKEN
                "additional_special_tokens": [
                    "<romance>",
                    "<drama>",
                    "<comedy>",
                    "<documentary>",
                    "<action>",
                    "<international>",
                    "<children>",
                    "<crime>",
                    "<horror>",
                    "<anime>",
                    "<other>",
                ],
            }

num_of_toks= tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_of_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))

We have added 12 tokens


Embedding(50269, 768)

In [356]:
# Dataset Class
class NetflixPlotDataset(Dataset):
          def __init__(self, tokenizer=tokenizer, dataset_path=os.path.join("Data\\netflix_plot_dataset.txt"), block_size=128): # block_size missing
   
              with open(dataset_path, encoding="utf-8") as f:
                      lines = ["<|startoftext|>"+line+"<|endoftext|>" for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

              self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size, truncation=True)["input_ids"] 
                
        
          def __len__(self):
            return len(self.examples)

          def __getitem__(self, i):
             return torch.tensor(self.examples[i], dtype=torch.long)
        

In [357]:
# Instantiate dataset and return length
dataset=NetflixPlotDataset()
dataset.__len__()

7788

In [358]:
# Test 1- Sample 90 from the dataset
print(dataset[90])
print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(dataset[90])))

tensor([50257, 50262,     1, 15137,  9397,  2193,   326,   484,   423, 19552,
        34843,  1956,    11,   475,  1276,  1907,   284,  1624,   340,   618,
          257, 31828, 37591, 15876, 20201,   284,  1011,   340,   625,   526,
        50256])
<|startoftext|><action>"Four brothers learn that they have inherited ancestral land, but must fight to claim it when a greedy feudal lord threatens to take it over."<|endoftext|>


In [359]:
# Test 2- Sample 2 from the dataset
print(dataset[2])
print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(dataset[2])))

tensor([50257, 50266,     1,  2215,   281,  5428, 10960,   318,  1043,  2636,
           11,   465,  5891,  5795,   389,  4137,   284,  7239,   257, 17623,
         3200,   326,   338, 36660,   511, 20712,  7022,  3047,  1413,   526,
        50256])
<|startoftext|><horror>"When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp."<|endoftext|>


##### IGNORE BELOW (WORKING NOTES:

In [252]:
tokenizer.decode(tokenizer.eos_token_id)

'<|endoftext|>'

In [61]:
input_ids[0]

tensor([   40,   716, 48718])

In [62]:
output = model.generate(input_ids, max_length=100, num_beans=5, no_repeat_ngram_size=2, early_stopping=True)

In [None]:
# try:
#     f =open('Data\\netflix_plot_dataset2.txt','x')
# except:
#     f =open('Data\\netflix_plot_dataset2.txt','x')
# for idx in range(raw_dataset_df.shape[0]):
#     f.write(raw_dataset_df.iloc[idx]['genre']+' '+raw_dataset_df.iloc[idx]['plot_description']+'\n')