<a href="https://colab.research.google.com/github/siddadel/kalidas/blob/main/Film_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem so the Drive paths below resolve.
from google.colab import drive
drive.mount('/content/drive')
# Directory whose entries run() lists and reads as UTF-8 text.
data_root = "/content/drive/MyDrive/ANLP21/scripts_txt"
#data_root = "/content/drive/MyDrive/ANLP21/scripts_sample"
# Output directory for experiments; not referenced by the cells visible here.
output_dir= "/content/drive/MyDrive/ANLP21/exp"

Mounted at /content/drive


In [2]:
class Film:
    """A film: its name, its protagonist, and its scenes in insertion order."""

    def __init__(self, name, protagonist):
        # Scenes start out empty and are filled through add().
        self.scenes = list()
        self.protagonist = protagonist
        self.name = name

    def add(self, scene):
        """Append ``scene`` to the end of this film's scene list."""
        self.scenes += [scene]
        
class Scene:
    """An ordered collection of scene items."""

    def __init__(self):
        # Items are stored in the order they are added.
        self.items = list()

    def add(self, item):
        """Append ``item`` to the end of this scene's item list."""
        self.items += [item]
        
class SceneItem:
    """A block of consecutive text lines belonging to one scene."""

    def __init__(self):
        # Lines are kept in the order they are added.
        self.lines = list()

    def add(self, line):
        """Append ``line`` to this item."""
        self.lines += [line]

    def __str__(self):
        """Return all lines joined by single spaces."""
        separator = " "
        return separator.join(self.lines)

    def is_empty(self):
        """Return True when no line has been added yet."""
        return not self.lines
    
class Dialogue(SceneItem):
    """A scene item whose lines are attributed to a speaking character."""

    def __init__(self, character):
        # The speaker label is stored as given; lines start out empty.
        self.character = character
        self.lines = list()

    def __str__(self):
        """Return ``<character> says, "<lines joined by spaces>"``."""
        spoken = " ".join(self.lines)
        return self.character + " says, \"" + spoken + "\""

In [3]:
import random
import os
import re
import spacy
from collections import Counter
from joblib import Parallel, delayed
import pandas as pd
# Load the spaCy pipeline once at module level; get_data() calls it on each script's text.
nlp = spacy.load("en_core_web_sm")

In [4]:
def run(method, root=None):
    """Apply ``method`` to every file in a directory and collect the results.

    Args:
        method: Callable invoked as ``method(filename, text)``, where
            ``filename`` is the bare entry name and ``text`` is the file's
            full content decoded as UTF-8.
        root: Directory to scan. Defaults to the module-level ``data_root``.

    Returns:
        A list with one ``method`` return value per file, in the (shuffled)
        order in which the files were processed.
    """
    if root is None:
        root = data_root

    # Skip sub-directories: open() would raise on them.
    filenames = [
        entry for entry in os.listdir(root)
        if os.path.isfile(os.path.join(root, entry))
    ]
    random.shuffle(filenames)

    results = []
    for filename in filenames:
        # The context manager closes each handle as soon as it has been read,
        # instead of leaving one open file per script.
        with open(os.path.join(root, filename), 'r', encoding="utf-8") as handle:
            text = handle.read()
        results.append(method(filename, text))
    return results


def sentence_tokens(doc):
    """Return the ``text`` of every sentence in ``doc.sents``, in order."""
    texts = []
    for sentence in doc.sents:
        texts.append(sentence.text)
    return texts
    

def uppercase(txt):
    """Return the set of distinct all-caps words (two or more A-Z letters) in ``txt``."""
    # https://stackoverflow.com/questions/4598315/regex-to-match-only-uppercase-words-with-some-exceptions
    pattern = re.compile(r"\b[A-Z][A-Z]+\b")
    return {match.group(0) for match in pattern.finditer(txt)}


def get_scenes(txt):
    """Split a script into chunks at its INT./EXT. scene headings.

    The heading markers themselves are removed. The first element is
    whatever precedes the first marker (possibly an empty string).

    Args:
        txt: Full text of the script.

    Returns:
        A list of strings, one per chunk between markers.
    """
    # The dots are escaped so that only a literal "INT." / "EXT." splits the
    # text; an unescaped "." also matched words such as "INTO", "POINTS" or
    # "EXTRA". The optional "/..." tail consumes combined headings such as
    # "INT./EXT." as a single marker, and \b rejects matches inside a longer
    # word (e.g. "PRINT.").
    return re.split(r"\b(?:INT|EXT)\.(?:/(?:INT|EXT)\.)?", txt)

def get_characters(doc):
    """Count the PERSON entities in ``doc.ents``.

    Each entity is converted with ``str``, stripped of surrounding
    whitespace and lower-cased before counting.

    Returns:
        A ``Counter`` mapping normalized entity text to its number of occurrences.
    """
    names = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            names.append(str(ent).strip().lower())
    return Counter(names)
    
def get_data(txt):
    """Return ``(scenes, counts)`` for one script.

    ``scenes`` is the result of ``get_scenes(txt)`` and ``counts`` is the
    result of ``get_characters`` on the spaCy document built from the same,
    unmodified text.
    """
    scene_chunks = get_scenes(txt)
    person_counts = get_characters(nlp(txt))
    return scene_chunks, person_counts
  
def get_primary_character(characters):
    """Return the most frequent key of ``characters`` as a string.

    Args:
        characters: A ``collections.Counter`` of character names.

    Returns:
        The key with the highest count, converted with ``str``. An empty
        string is returned when ``characters`` is empty, so that a script
        with no counted characters does not abort processing with an
        ``IndexError``.
    """
    # most_common(1) avoids sorting the whole counter just to read its head.
    top = characters.most_common(1)
    if not top:
        return ""
    return str(top[0][0])

In [5]:
#scene processing can not be parallelized because of speaking flag
def process_scene(scene_txt, characters, film):
    """Parse one scene's text into items and attach the scene to ``film``.

    The text is read line by line. A blank line ends the item being built.
    A line that is entirely upper-case, or whose lower-cased text is a key
    of ``characters``, starts a ``Dialogue`` for that speaker. Any other
    line is appended to the current item.

    Args:
        scene_txt: Raw text of a single scene.
        characters: Mapping whose keys are lower-cased character names.
        film: ``Film`` that receives the new ``Scene`` via ``film.add``.

    Returns:
        None. The scene is added to ``film`` as a side effect.
    """
    scene = Scene()
    film.add(scene)

    def _flush(item, starts_speaking):
        # Sometimes an upper-case line describes things and is not dialogue:
        # a speaker cue with no following lines is kept as a plain item
        # holding the cue text.
        if starts_speaking and item.is_empty():
            heading = SceneItem()
            heading.add(item.character)
            item = heading
        if not item.is_empty():
            scene.add(item)

    starts_speaking = False
    item = SceneItem()
    for line in scene_txt.split("\n"):
        stripped = line.strip()
        if stripped == "":
            _flush(item, starts_speaking)
            starts_speaking = False
            item = SceneItem()
        elif stripped.lower() in characters or stripped.isupper():
            starts_speaking = True
            item = Dialogue(stripped)
        else:
            item.add(stripped)

    # Without this final flush, the last item of a scene whose text does not
    # end in a blank line would be silently dropped.
    _flush(item, starts_speaking)

def process_film(filename, txt):
    """Build a ``Film`` from one script.

    Prints ``filename`` as a progress marker, derives the scene chunks and
    character counts from ``txt``, uses the most frequent character as the
    protagonist, and processes each scene chunk in order.

    Returns:
        The populated ``Film``.
    """
    print(filename, end=", ")
    scene_txts, characters = get_data(txt)
    film = Film(filename, get_primary_character(characters))
    # Scenes are processed one after another, in script order.
    for scene_txt in scene_txts:
        process_scene(scene_txt, characters, film)
    return film

# Run process_film over the scripts found by run(); results holds one Film per script.
results = run(process_film)

batman_returns.txt, you_can_count_on_me.txt, 

KeyboardInterrupt: ignored

In [None]:
# Flatten the parsed films into one row per scene item.
# scene ids restart at 1 for every film; item ids restart at 1 for every scene.
data = []
for film in results:
    # Upper-cased once per film; the protagonist name is stored lower-cased.
    protagonist = film.protagonist.upper()
    for scene_id, scene in enumerate(film.scenes, start=1):
        for item_id, item in enumerate(scene.items, start=1):
            # Only Dialogue items carry a speaker; other items get None.
            character = item.character if isinstance(item, Dialogue) else None
            data.append((film.name, scene_id, item_id, item, character, protagonist))


df = pd.DataFrame(data, columns=['film', 'scene', 'item_ids', 'item', 'character', 'protagonist'])
df

In [None]:
# Keep only the dialogue rows spoken by the film's protagonist.
# 'protagonist' is stored upper-cased while 'character' is the raw cue text,
# which may be mixed-case, so the cue is upper-cased before comparing.
df1 = df[df['character'].notnull() & (df['character'].str.upper() == df['protagonist'])]
df1

In [None]:
# Write the 'film' and 'item' columns of the protagonist rows to data.csv
# in the current working directory (the DataFrame index is written too).
df1[['film','item']].to_csv('data.csv')
# Colab-only helper used to download the file that was just written.
from google.colab import files
files.download("data.csv")