# INM706 Deep Learning for Sequence Analysis
### Sarah Rhalem (190051884) & Stelios Kliafas (########)

Draft Notes/ Working comments:
Dataset >> Problem +Evaluation Metric >> Model

In [8]:
import os
import json
import torch
import re
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, dataloader

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset


In [9]:
#Set to use GPU on device if available:
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [10]:
# Working directory
os.getcwd()

'C:\\Users\\sarah\\Documents\\MSc AI 2020_2021\\INM706\\INM706_DL_Sequence_Analysis'

In [11]:
# Load csv dataset, create listing column
raw_dataset_df= pd.read_csv(os.path.join("Data\\netflix_titles.csv") , encoding="utf8")
raw_dataset_df["listing"]= raw_dataset_df["listed_in"].str.split(pat=",", n=1).str.get(0)

# Cleanse Data
raw_dataset_df["description"].isna().sum() # Check null entries for description - None
raw_dataset_df["plot_description"]=raw_dataset_df["description"].map(lambda x: re.sub( r'"', '', x)) # (TO BE UPDATED REMOVES QUOTATION MARKS)

raw_dataset_df.listing.value_counts()

Dramas                          1384
Comedies                        1074
Documentaries                    751
Action & Adventure               721
International TV Shows           690
Children & Family Movies         502
Crime TV Shows                   369
Kids' TV                         359
Stand-Up Comedy                  321
Horror Movies                    244
British TV Shows                 232
Docuseries                       194
Anime Series                     148
International Movies             114
TV Comedies                      110
Reality TV                       102
Classic Movies                    77
TV Dramas                         62
Movies                            56
Thrillers                         49
TV Action & Adventure             37
Stand-Up Comedy & Talk Shows      33
Romantic TV Shows                 28
Classic & Cult TV                 21
Independent Movies                20
Anime Features                    19
Music & Musicals                  17
C

In [12]:
# Map each data sample listing to a generic genre
 
 # Identify the show listings for mapping to summarised genres
raw_dataset_df.listing.value_counts()

# map show listing to a specific genre. Note: Listing types with under ~100 data samples are classified under the genre "Other"
genre_mapping= { "romance": {"Romantic TV Shows", "Romantic Movies"} ,
                "drama": {"Dramas", "TV Dramas"}  ,
                 "comedy": {"Comedies", "Stand-Up Comedy", "TV Comedies", "Stand-Up Comedy & Talk Shows"},
                 "documentary": {"Documentaries", "Docuseries"},
                 "action": {"Action & Adventure", "TV Action & Adventure"} ,
                 "international": {"International TV Shows", "International Movies", "Spanish-Language TV Shows"},
                 "children": {"Children & Family Movies", "Kids' TV"},
                 "crime": {"Crime TV Shows"},
                 "horror": {"Horror Movies", "TV Horror"} ,
                 "anime" : {"Anime Series", "Anime Features"},
                 "other" : {"Thrillers", "British TV Shows", "Reality TV", "Classic & Cult TV", "TV Shows", "TV Sci-Fi & Fantasy",
                         "Classic Movies", "Movies", "Independent Movies", "Cult Movies", "Sports Movies", "LGBTQ Movies", "Music & Musicals",
                         "Sci-Fi & Fantasy"} }

# function to map listings to genres by dictionary key
def map_function(dictionary):
    def my_map(x):
        res = ""
        for key in dictionary.keys():
            if (x in dictionary[key]):
                res = key
                break
        return res
    return my_map

# Add genre column based on listing mapping
raw_dataset_df["genre"] = raw_dataset_df["listing"].map(map_function(genre_mapping))

# Write to txt file
plot_dataset_df= raw_dataset_df[["genre","plot_description"]].copy()
plot_dataset= plot_dataset_df.to_csv('Data\\netflix_plot_dataset.txt', index=False, header=None, sep=';' )

# Sense check - view data header and check all descriptions were mapped
print(plot_dataset_df.head())
print(plot_dataset_df.genre.unique())

           genre                                   plot_description
0  international  In a future where the elite inhabit an island ...
1          drama  After a devastating earthquake hits Mexico Cit...
2         horror  When an army recruit is found dead, his fellow...
3         action  In a postapocalyptic world, rag-doll robots hi...
4          drama  A brilliant group of students become card-coun...
['international' 'drama' 'horror' 'action' 'crime' 'documentary' 'other'
 'comedy' 'anime' 'children' 'romance']


In [15]:
 # Load model and Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model= GPT2LMHeadModel.from_pretrained('gpt2')
model= model.to(device)   
    
special_tokens_dict = {
                "bos_token": "<BOS>",
                "eos_token": "<EOS>",
                "pad_token": "<PAD>",
                "additional_special_tokens": [
                   "<romance>",
                    "<drama>",
                    "<comedy>",
                    "<documentary>",
                    "<action>",
                    "<international>",
                    "<children>",
                    "<crime>",
                    "<horror>",
                    "<anime>",
                    "<other>",
                ],
            }

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))

We have added 14 tokens


Embedding(50271, 768)

In [21]:
# Dataset Class
class NetflixPlotDataset(Dataset):
          def __init__(self, tokenizer=tokenizer, dataset_path=os.path.join("Data\\netflix_plot_dataset.txt"), block_size=100): # block_size missing
   
              with open(dataset_path, encoding="utf-8") as f:
                      lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
   
              self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
   
          def __len__(self):
            return len(self.examples)

          def __getitem__(self, i):
             return torch.tensor(self.examples[i], dtype=torch.long)

In [22]:
dataset=NetflixPlotDataset()

In [25]:
dataset[:1]

tensor([[45609,    26,   818,   257,  2003,   810,   262,  9085, 14527,   281,
          7022, 31354,  1290,   422,   262, 18012,  1017,  5700,    11,   345,
           651,   530,  2863,   284,  4654,   262,   513,     4,  7448,   422,
          2809,   282,   273,    13]])

##### IGNORE BELOW (WORKING NOTES:

In [5]:
tokenizer.decode(tokenizer.eos_token_id)

'<|endoftext|>'

In [60]:
import torch

sentence = "I am french"
input_ids= tokenizer.encode(sentence, return_tensors='pt')

In [61]:
input_ids[0]

tensor([   40,   716, 48718])

In [62]:
output = model.generate(input_ids, max_length=100, num_beans=5, no_repeat_ngram_size=2, early_stopping=True)

In [63]:
tokenizer.decode(output[0], skip_special_tokens=True)

"I am french, I am a French person. I have a lot of friends in France, and I'm very proud of them.\n\nI'm a very good person, but I don't know how to express myself. It's not easy to do. But I know that I can do it. And I want to be able to. So I've been doing this for a long time."