In [None]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch
from glob import glob
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load the Model

In [5]:
model_name = 'facebook/bart-large-mnli'
# 0 means it will choose GPU if any otherwise it will choose CPU
device = 0 if torch.cuda.is_available() else 'cpu'

In [None]:
def load_model(device):
    # theme_classifier is library form hugging face 
    theme_classifier = pipeline(
        "zero-shot-classification",
        model=model_name,
        device=device
    )
    return theme_classifier

In [7]:
theme_classifier = load_model(device)



In [8]:
# Lets specify the theme classification based on which we want to filter 
theme_list = ["friendship", "hope", "sacrifice", "battle", "self development", "betrayal", "love", "dialogue"]

In [None]:
# This is to test that for any given sentence is classified in which section and are we able to classify from the above list 
# theme_classifier is library form hugging face 
theme_classifier(
    "I like his move and his physique and his sense of humor",
    theme_list,
    multi_label=True
)

{'sequence': 'I like his move and his physique and his sense of humor',
 'labels': ['self development',
  'hope',
  'battle',
  'friendship',
  'love',
  'dialogue',
  'sacrifice',
  'betrayal'],
 'scores': [0.7576901316642761,
  0.2395360916852951,
  0.1464158147573471,
  0.06116967648267746,
  0.05320629104971886,
  0.0400259904563427,
  0.01041721273213625,
  3.602292781579308e-05]}

# Loading the Data

In [10]:
# We are using glob as we have multiple files 
import glob
import os
mydir = r'C:\Users\rohit\Desktop\Rohit\Data Science Projects\NLP Projects\NLP TV Series\data\Subtitles'
# files = [print(file) for file in glob.glob(mydir+"/*.ass")]
files = glob.glob(mydir + '/*.ass')
# The below code is to get the base name of the file like "Nauruto Season 1 - 01.ass" 
# files = [os.path.basename(f) for f in glob.glob(mydir + '/*.ass')]



In [11]:
files[:5]

['C:\\Users\\rohit\\Desktop\\Rohit\\Data Science Projects\\NLP Projects\\NLP TV Series\\data\\Subtitles\\Naruto Season 1 - 01.ass',
 'C:\\Users\\rohit\\Desktop\\Rohit\\Data Science Projects\\NLP Projects\\NLP TV Series\\data\\Subtitles\\Naruto Season 1 - 02.ass',
 'C:\\Users\\rohit\\Desktop\\Rohit\\Data Science Projects\\NLP Projects\\NLP TV Series\\data\\Subtitles\\Naruto Season 1 - 03.ass',
 'C:\\Users\\rohit\\Desktop\\Rohit\\Data Science Projects\\NLP Projects\\NLP TV Series\\data\\Subtitles\\Naruto Season 1 - 04.ass',
 'C:\\Users\\rohit\\Desktop\\Rohit\\Data Science Projects\\NLP Projects\\NLP TV Series\\data\\Subtitles\\Naruto Season 1 - 05.ass']

In [12]:
# Now lets open the file and read the text content from the file 
with open(files[0], 'r') as file:
    lines = file.readlines()
    # I want to read only after line number 27 as in content above things are useless 
    lines = lines[27:]
    # Now i want the text only so that can be obtain after 9 commas 
    lines = [",".join(line.split(",")[9:]) for line in lines]


In [13]:
lines[:2]

['A long time ago, a powerful demon fox\\Nappeared with nine tails.\n',
 'With its powerful tails,\n']

In [14]:
# In the above output we can see we have \\N in the output 
lines = [line.replace("\\N", " ") for line in lines]

In [15]:
lines[:2]

['A long time ago, a powerful demon fox appeared with nine tails.\n',
 'With its powerful tails,\n']

In [16]:
# Now lets try to combine the sentence which is seperated by a space 
# This is the text what we are going to feed to neural network (theme classifier) as we cant feed more than 512 as theme classifier has max of 512 token 
# So we cannot feed the whole subtitles so we need to divide them in batches of 10 or 20 or anything so that it can fit in the model 
" ".join(lines[:10])

"A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can't let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known asâ€¦ the Fourth Hokage.\n Naruto!\n"

In [17]:
# Lets now extract the episode number 
# But here the output is in list format i want the format in string so try second way 
# episode  = [file.split("-")[-1].split(".")[0].strip() for file in files[:1]]
# print(episode)
int(files[0].split("-")[-1].split(".")[0].strip())



1

In [18]:
# Lets now create the complete function from the above code 

scripts = []
episode_num = []
def load_subtitles_dataset(dataset_path):
    file_path = glob.glob(dataset_path + '/*.ass')
    for path in file_path:
        #Read the file 
        with open(path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            # I want to read only after line number 27 as in content above things are useless 
            lines = lines[27:]
            # Now i want the text only so that can be obtain after 9 commas 
            lines = [",".join(line.split(",")[9:]) for line in lines]
            # print("opened the file")

        # In the above output we can see we have \\N in the output 
        lines = [line.replace("\\N", " ") for line in lines]

        # Now lets try to combine the sentence which is seperated by a space 
        script  = " ".join(lines)
        # print("scripts joined")

        # Lets get the episode number 
        episode = int(files[0].split("-")[-1].split(".")[0].strip())
        # print("episode value")

        scripts.append(script)
        episode_num.append(episode)

        df = pd.DataFrame.from_dict({'episode':episode_num, 'script':scripts})
        # print("df value")

    return df

        



In [19]:
dataset = r'C:\Users\rohit\Desktop\Rohit\Data Science Projects\NLP Projects\NLP TV Series\data\Subtitles'
df = load_subtitles_dataset(dataset)


In [20]:
df.head()

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,1,"C'mon!\n Running like a fugitive,\n Being chas..."
2,1,"C'mon!\n Running like a fugitive,\n Being chas..."
3,1,"C'mon!\n Running like a fugitive,\n Being chas..."
4,1,"C'mon!\n Running like a fugitive,\n Being chas..."


# Run the Model

In [None]:
# Get the Scripts value 
scripts = df.iloc[0]['script']

In [None]:
# We will the complete scripts 
scripts

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [25]:
# Now lets break down to sentence
script_sentences = sent_tokenize(scripts)
script_sentences[:3]

['A long time ago, a powerful demon fox appeared with nine tails.',
 'With its powerful tails,\n it could smash mountains and create tidal waves.',
 'A band of Ninjas rose to defend their village from attack.']

In [28]:
# Batch Sentence 
# lets say batch size is 20 
sentence_batch_size = 20
scripts_batches = []
# Running a loop form 1 st sentence to last sentence and making a batch of 20 i.e sentence_batch_size
for index in range(0, len(script_sentences), sentence_batch_size):
    # Now joining all the sentence from 1st to 20th in 1st go and so on and so forth increment of 20 everytime
    sent = " ".join(script_sentences[index:sentence_batch_size+index])
    scripts_batches.append(sent)

In [33]:
scripts_batches[:5]

["A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
 'Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them! Wait! Ha ha…\n Why should I? Hey, Naruto! How did you suddenly get here, lruka Sensei? The question is what are you doing here when you should be in class now? Now listen, Naruto. You failed t

In [None]:
# Lets now try to classify it based on the theme_list where we run on the theme_classifier for one sentence 
# theme_classifier is library form hugging face 
theme_output = theme_classifier(
    scripts_batches[:2],
    theme_list, 
    multi_label = True
)
# Below we will the classification on 2 batch so on and so forth we can classify the sentence 

In [61]:
theme_output

[{'sequence': "A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
  'labels': ['dialogue',
   'betrayal',
   'battle',
   'sacrifice',
   'self development',
   'hope',
   'friendship',
   'love'],
  'scores': [0.9800739288330078,
   0.9396896362304688,
   0.8546879291534424,
   0.7349799871444702,
   0.7284946441650391,
   0.199098497629165

In [None]:
# Now lets convert the output in table format so that we can use it any way we want 
# Example for {'dialogue': [0.9800739288330078, 0.9370127320289612],
# 'betrayal': [0.9396896362304688, 0.6457238793373108]}

themes = {}
for output in theme_output:
    for label, score in zip(output['labels'], output['scores']):
        # print(label, score)
        if label not in themes:
            themes[label] = []
        themes[label].append(score)

        # print(index)
    # if output['labels'][2] == 'battle':
    #     battle.append(output['scores'][2])
    # print(output['labels'][2])
    # print(output['scores'][2])

# print(battle)

In [84]:
themes

{'dialogue': [0.9800739288330078, 0.9370127320289612],
 'betrayal': [0.9396896362304688, 0.6457238793373108],
 'battle': [0.8546879291534424, 0.6581310033798218],
 'sacrifice': [0.7349799871444702, 0.6258825063705444],
 'self development': [0.7284946441650391, 0.8678194880485535],
 'hope': [0.19909849762916565, 0.2042413204908371],
 'friendship': [0.059223175048828125, 0.08603251725435257],
 'love': [0.040261998772621155, 0.028020663186907768]}

In [96]:
# Lets have the mean of the value 
import numpy as np
themes = {key : np.mean(value) for key, value in themes.items()}

In [98]:
themes

{'dialogue': np.float64(0.9585433304309845),
 'betrayal': np.float64(0.7927067577838898),
 'battle': np.float64(0.7564094662666321),
 'sacrifice': np.float64(0.6804312467575073),
 'self development': np.float64(0.7981570661067963),
 'hope': np.float64(0.20166990906000137),
 'friendship': np.float64(0.07262784615159035),
 'love': np.float64(0.03414133097976446)}

In [None]:
# Now convert this dictionary to DataFrameme 
df1 = pd.DataFrame.from_dict(themes, orient='index', columns=['Value'])
df1.head()


Unnamed: 0,Value
dialogue,0.958543
betrayal,0.792707
battle,0.756409
sacrifice,0.680431
self development,0.798157


In [None]:
# Now lets create a function where we can have all the above roles in to a single function 
def get_themes_inference(scripts):
    # Now lets break down to sentence
    script_sentences = sent_tokenize(scripts)

    # Batch Sentence 
    # lets say batch size is 20 
    sentence_batch_size = 20
    scripts_batches = []
    # Running a loop form 1 st sentence to last sentence and making a batch of 20 i.e sentence_batch_size
    for index in range(0, len(script_sentences), sentence_batch_size):
        # Now joining all the sentence from 1st to 20th in 1st go and so on and so forth increment of 20 everytime
        sent = " ".join(script_sentences[index:sentence_batch_size+index])
        scripts_batches.append(sent)

    # Run the Model 
    # Lets now try to classify it based on the theme_list where we run on the theme_classifier for one sentence 
    # theme_classifier is library form hugging face 
    theme_output = theme_classifier(
        scripts_batches[:2],
        theme_list, 
        multi_label = True
    )
    # Wrangle the output 
    # Now lets convert the output in table format so that we can use it any way we want 
    # Example for {'dialogue': [0.9800739288330078, 0.9370127320289612],
    # 'betrayal': [0.9396896362304688, 0.6457238793373108]}

    themes = {}
    for output in theme_output:
        for label, score in zip(output['labels'], output['scores']):
            # print(label, score)
            if label not in themes:
                themes[label] = []
            themes[label].append(score)

    # Lets have the mean of the value 
    themes = {key : np.mean(value) for key, value in themes.items()}

    return themes

In [112]:
# Get the Scripts value 
scripts = df.iloc[0]['script']
get_themes_inference(scripts)

{'dialogue': np.float64(0.9585433304309845),
 'betrayal': np.float64(0.7927067577838898),
 'battle': np.float64(0.7564094662666321),
 'sacrifice': np.float64(0.6804312467575073),
 'self development': np.float64(0.7981570661067963),
 'hope': np.float64(0.20166990906000137),
 'friendship': np.float64(0.07262784615159035),
 'love': np.float64(0.03414133097976446)}

In [104]:
# Earlier we have got the script value 
df = df.head(2)
df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,1,"C'mon!\n Running like a fugitive,\n Being chas..."


In [113]:
# Now next task is to get each sentence and then apply the fucntion by using which we can classiify and get score for each sentence 
Output_themes = df['script'].apply(get_themes_inference)

In [114]:
# Now we have got the score for first 2 sentence and we are able to classify it based on scores also 
# we are using CPU so we ran 2 sentence in google golab we can ru the complete scripts
Output_themes

0    {'dialogue': 0.9585433304309845, 'betrayal': 0...
1    {'dialogue': 0.9606050252914429, 'sacrifice': ...
Name: script, dtype: object

In [117]:
# Convert to Dataframe 
theme_df = pd.DataFrame(Output_themes.tolist())

In [None]:
# pd.DataFrame.from_dict(Output_themes)

Unnamed: 0,script
0,"{'dialogue': 0.9585433304309845, 'betrayal': 0..."
1,"{'dialogue': 0.9606050252914429, 'sacrifice': ..."


In [118]:
theme_df

Unnamed: 0,dialogue,betrayal,battle,sacrifice,self development,hope,friendship,love
0,0.958543,0.792707,0.756409,0.680431,0.798157,0.20167,0.072628,0.034141
1,0.960605,0.429944,0.684843,0.570702,0.482808,0.154535,0.046261,0.17326


In [119]:
df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,1,"C'mon!\n Running like a fugitive,\n Being chas..."


In [123]:
# Now lets combine both of them 
# The below line of code means we are adding all the column in df for all theme_df both way we can do 
# theme_df[df.columns] = df
# theme_df
df[theme_df.columns] = theme_df
df


Unnamed: 0,episode,script,dialogue,betrayal,battle,sacrifice,self development,hope,friendship,love
0,1,"A long time ago, a powerful demon fox appeared...",0.958543,0.792707,0.756409,0.680431,0.798157,0.20167,0.072628,0.034141
1,1,"C'mon!\n Running like a fugitive,\n Being chas...",0.960605,0.429944,0.684843,0.570702,0.482808,0.154535,0.046261,0.17326
