In [2]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\salwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#Load model
model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else 'cpu'

In [5]:
def load_model(device):
    theme_classifier = pipeline("zero-shot-classification", model=model_name, device=device)
    return theme_classifier


In [6]:
theme_classifier = load_model(device)






In [7]:
theme_list = ["love","friendship","hope","sacrifice","success","betrayal","discipline","battle"]

In [8]:
theme_classifier("i love my friend", theme_list, multi_label=True) 

{'sequence': 'i love my friend',
 'labels': ['friendship',
  'love',
  'success',
  'sacrifice',
  'hope',
  'battle',
  'discipline',
  'betrayal'],
 'scores': [0.9966062903404236,
  0.9817240238189697,
  0.48883700370788574,
  0.33532094955444336,
  0.19325266778469086,
  0.01196574792265892,
  0.008551559410989285,
  0.007199863903224468]}

Load Dataset


In [9]:
from glob import glob

In [10]:
files = glob("..\data\subtitles\*.ass")

In [11]:
files[:5]

['..\\data\\subtitles\\Hunter x Hunter - 01.enUS.ass',
 '..\\data\\subtitles\\Hunter x Hunter - 02.enUS.ass',
 '..\\data\\subtitles\\Hunter x Hunter - 03.enUS.ass',
 '..\\data\\subtitles\\Hunter x Hunter - 04.enUS.ass',
 '..\\data\\subtitles\\Hunter x Hunter - 05.enUS.ass']

In [12]:
with open(files[0],'r') as my_file:
    lines = my_file.readlines()
    lines = lines[30:]
    lines = [",".join(line.split(',')[9:]) for line in lines ]

In [13]:
lines[:5]

['Fearsome monsters... Exotic creatures...\n',
 'Vast riches... Hidden treasures...\n',
 'Evil enclaves... Unexplored lands...\n',
 'The word "unknown" holds magic.\n',
 'And some incredible people are \\Ndrawn to that magic.\n']

In [14]:
lines = [ line.replace('\n',' ') for line in lines ]
lines = [ line.replace('\\N',' ') for line in lines ]


In [15]:
lines[:10]

['Fearsome monsters... Exotic creatures... ',
 'Vast riches... Hidden treasures... ',
 'Evil enclaves... Unexplored lands... ',
 'The word "unknown" holds magic. ',
 'And some incredible people are  drawn to that magic. ',
 'They are known... ',
 '...as Hunters! ',
 'Departure x And x Friends ',
 '{\\an8\\fad(601,580)}Whale Island ',
 "Gon's still trying to catch the Lord of the Lake? "]

In [16]:
" ".join(lines[:10])

'Fearsome monsters... Exotic creatures...  Vast riches... Hidden treasures...  Evil enclaves... Unexplored lands...  The word "unknown" holds magic.  And some incredible people are  drawn to that magic.  They are known...  ...as Hunters!  Departure x And x Friends  {\\an8\\fad(601,580)}Whale Island  Gon\'s still trying to catch the Lord of the Lake? '

In [17]:
int(files[0].split('-')[-1].split('.')[0].strip())

1

In [18]:
import pandas as pd

def load_subtitles(dataset_path):
 scripts= []
 episode_numbers=[]
 paths = glob(dataset_path+'/*.ass')
 for path in paths:
    with open(path,'r') as my_file:
      lines = my_file.readlines()
      lines = lines[25:]
      lines = [",".join(line.split(',')[9:]) for line in lines ]
     
    lines = [ line.replace('\n',' ') for line in lines ]
    lines = [ line.replace('\\N',' ') for line in lines ]
    script = " ".join(lines)
    episode_num= int(path.split('-')[-1].split('.')[0].strip())

    scripts.append(script)
    episode_numbers.append(episode_num)
  
 df = pd.DataFrame.from_dict({"episode": episode_numbers,"scripts": scripts})

 return df 




In [19]:
dataset_path = "../data/subtitles"
df = load_subtitles(dataset_path)

In [20]:
df.head()

Unnamed: 0,episode,scripts
0,1,Fearsome monsters... Exotic creatures... Vast...
1,2,Text Fearsome monsters... Exotic creatures......
2,3,Text Fearsome monsters... Exotic creatures......
3,4,Text Fearsome monsters... Exotic creatures......
4,5,Text Fearsome monsters... Exotic creatures.....


In [21]:
#Getting the first row of the scripts column
script = df.iloc[0].scripts
script

'Fearsome monsters... Exotic creatures...  Vast riches... Hidden treasures...  Evil enclaves... Unexplored lands...  The word "unknown" holds magic.  And some incredible people are  drawn to that magic.  They are known...  ...as Hunters!  Departure x And x Friends  {\\an8\\fad(601,580)}Whale Island  Gon\'s still trying to catch the Lord of the Lake?  Huh?  Y-Yes...  He\'s been at it for a week, non-stop.  You promised he could take the Hunter Exam  if he caught the Lord of the Lake?  What a fool!  Five adults couldn\'t manage to  reel in that monster.  How is a child supposed to catch that beast?  Mito-san doesn\'t want him  taking the Hunter Exam.  But Gon\'s old man was the same age  when he caught it...  He can\'t do it.  Got him!  Yes! Yes! Yes! Yes! Yes!  I got him good!  It\'s huge!  This is the Lord of the Lake?  It\'s been twenty years!  I know! It was Gon\'s father that time, right?  He\'s his father\'s son.  Gon actually caught the Lord!  Look, Mito-san!  Mito-san!  I caught 

In [22]:
#Splitting the script into sentences
script_sentences = sent_tokenize(script)
script_sentences[:10]

['Fearsome monsters...',
 'Exotic creatures...',
 'Vast riches... Hidden treasures...',
 'Evil enclaves... Unexplored lands...',
 'The word "unknown" holds magic.',
 'And some incredible people are  drawn to that magic.',
 'They are known...  ...as Hunters!',
 "Departure x And x Friends  {\\an8\\fad(601,580)}Whale Island  Gon's still trying to catch the Lord of the Lake?",
 'Huh?',
 'Y-Yes...']

In [23]:
#Batch sentences into a list of sentences of 20 words
sentences_batch_size = 20
script_batches = []
for i in range(0,len(script_sentences),sentences_batch_size):
    sent = " ".join(script_sentences[i:i+sentences_batch_size])
    script_batches.append(sent)
    

               

In [24]:
script_batches[:100]

['Fearsome monsters... Exotic creatures... Vast riches... Hidden treasures... Evil enclaves... Unexplored lands... The word "unknown" holds magic. And some incredible people are  drawn to that magic. They are known...  ...as Hunters! Departure x And x Friends  {\\an8\\fad(601,580)}Whale Island  Gon\'s still trying to catch the Lord of the Lake? Huh? Y-Yes... He\'s been at it for a week, non-stop. You promised he could take the Hunter Exam  if he caught the Lord of the Lake? What a fool! Five adults couldn\'t manage to  reel in that monster. How is a child supposed to catch that beast? Mito-san doesn\'t want him  taking the Hunter Exam. But Gon\'s old man was the same age  when he caught it... He can\'t do it. Got him! Yes!',
 "Yes! Yes! Yes! Yes! I got him good! It's huge! This is the Lord of the Lake? It's been twenty years! I know! It was Gon's father that time, right? He's his father's son. Gon actually caught the Lord! Look, Mito-san! Mito-san! I caught the Lord, as promised... So 

In [25]:
#Run the sentences through the model
theme_output = theme_classifier(script_batches[:2], theme_list, multi_label=True)

In [26]:
theme_output

[{'sequence': 'Fearsome monsters... Exotic creatures... Vast riches... Hidden treasures... Evil enclaves... Unexplored lands... The word "unknown" holds magic. And some incredible people are  drawn to that magic. They are known...  ...as Hunters! Departure x And x Friends  {\\an8\\fad(601,580)}Whale Island  Gon\'s still trying to catch the Lord of the Lake? Huh? Y-Yes... He\'s been at it for a week, non-stop. You promised he could take the Hunter Exam  if he caught the Lord of the Lake? What a fool! Five adults couldn\'t manage to  reel in that monster. How is a child supposed to catch that beast? Mito-san doesn\'t want him  taking the Hunter Exam. But Gon\'s old man was the same age  when he caught it... He can\'t do it. Got him! Yes!',
  'labels': ['betrayal',
   'sacrifice',
   'discipline',
   'hope',
   'success',
   'battle',
   'friendship',
   'love'],
  'scores': [0.9673540592193604,
   0.31436631083488464,
   0.29842954874038696,
   0.257778525352478,
   0.2138640284538269,
 

In [27]:
themes = {}
for output in theme_output:
    for label,score in zip(output['labels'],output['scores']):
        if label not in themes:
            themes[label]= []
        themes[label].append(score)

In [28]:
themes

{'betrayal': [0.9673540592193604, 0.07269688695669174],
 'sacrifice': [0.31436631083488464, 0.8510117530822754],
 'discipline': [0.29842954874038696, 0.8240034580230713],
 'hope': [0.257778525352478, 0.9597513675689697],
 'success': [0.2138640284538269, 0.9823290705680847],
 'battle': [0.17349372804164886, 0.8410558104515076],
 'friendship': [0.12347804754972458, 0.7791010141372681],
 'love': [0.0040105110965669155, 0.3383287787437439]}

In [29]:
import numpy as np
def get_themes(script):
    
    script_sentences = sent_tokenize(script)
    
    #Batch sentences into a list of sentences of 20 words
    sentences_batch_size = 20
    script_batches = []
    for i in range(0,len(script_sentences),sentences_batch_size):
       sent = " ".join(script_sentences[i:i+sentences_batch_size])
       script_batches.append(sent)
   
    #Run the model
    theme_output = theme_classifier(script_batches[:2], theme_list, multi_label=True)

    #Modify the ouput format
    themes = {}
    for output in theme_output:
     for label,score in zip(output['labels'],output['scores']):
        if label not in themes:
            themes[label]= []
        themes[label].append(score)

    #Calculating the mean value of all episodes of each theme 
    themes = {key: np.mean(np.array(value)) for key,value in themes.items()}
    
    return themes
    

In [30]:
df

Unnamed: 0,episode,scripts
0,1,Fearsome monsters... Exotic creatures... Vast...
1,2,Text Fearsome monsters... Exotic creatures......
2,3,Text Fearsome monsters... Exotic creatures......
3,4,Text Fearsome monsters... Exotic creatures......
4,5,Text Fearsome monsters... Exotic creatures.....
...,...,...
143,95,Text Looks like you guys have been waiting ...
144,96,"0,0,100,100,0,0,1,2,0,8,0040,0040,0013,0 0,0,..."
145,97,"0,0,100,100,0,0,1,2,0,8,0040,0040,0018,0 Te..."
146,98,"Text To become kings, the Chimera Ants have ..."
