<a href="https://colab.research.google.com/github/siddadel/kalidas/blob/main/Comet_Scene_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive; the paths configured further down point into it.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): none of the installs below is version-pinned, so a fresh
# runtime may resolve different package versions than the recorded run.
!pip install transformers --quiet
# Source of the `generation_example` module imported later in the notebook.
!git clone https://github.com/allenai/comet-atomic-2020
!pip install -r ./comet-atomic-2020/requirements.txt --quiet
# Presumably the pretrained COMET BART checkpoint; unpacked into the working directory.
!wget https://storage.googleapis.com/ai2-mosaic-public/projects/mosaic-kgs/comet-atomic_2020_BART.zip
!unzip comet-atomic_2020_BART.zip

Mounted at /content/drive
[K     |████████████████████████████████| 3.1 MB 5.1 MB/s 
[K     |████████████████████████████████| 596 kB 48.3 MB/s 
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
[K     |████████████████████████████████| 895 kB 50.5 MB/s 
[K     |████████████████████████████████| 3.3 MB 46.7 MB/s 
[?25hCloning into 'comet-atomic-2020'...
remote: Enumerating objects: 166, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 166 (delta 42), reused 84 (delta 27), pack-reused 56[K
Receiving objects: 100% (166/166), 7.15 MiB | 37.37 MiB/s, done.
Resolving deltas: 100% (48/48), done.
[K     |████████████████████████████████| 43 kB 1.0 MB/s 
[K     |████████████████████████████████| 90 kB 5.0 MB/s 
[K     |████████████████████████████████| 313 kB 54.6 MB/s 
[K     |████████████████████████████████| 9.1 MB 52.2 MB/s 
[K     |████████████████████████████████| 379 kB 57.0 MB/s 
[K     |█

In [None]:
# Report the GPU attached to this runtime.
!nvidia-smi

Tue Nov 30 09:20:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Directory whose files run() lists and reads, one script per file.
data_root = "/content/drive/MyDrive/ANLP21/scripts_txt"
#data_root = "/content/drive/MyDrive/ANLP21/scripts_sample"
# NOTE(review): output_dir is not referenced anywhere else in the visible notebook.
output_dir= "/content/drive/MyDrive/ANLP21/exp"

In [None]:
import random
import os
import re
import spacy
from collections import Counter
from joblib import Parallel, delayed
import pandas as pd
# Shared spaCy pipeline: supplies PERSON entities (get_characters) and the
# stop-word list (process_results).
nlp = spacy.load("en_core_web_sm")

In [None]:
def run(method, root=None):
    """Apply ``method`` to every file of the corpus directory, in random order.

    Args:
        method: Callable invoked as ``method(filename, text)``, where
            ``filename`` is the bare file name and ``text`` is the whole
            file decoded as UTF-8.
        root: Directory to read from. When omitted, the module-level
            ``data_root`` is used.

    Returns:
        A list holding one ``method`` return value per file, in the
        shuffled processing order.
    """
    if root is None:
        root = data_root

    # Sub-directories cannot be opened as text, so only regular files are kept.
    files = [name for name in os.listdir(root)
             if os.path.isfile(os.path.join(root, name))]
    random.shuffle(files)

    results = []
    for filename in files:
        # The context manager closes each handle as soon as the file is read,
        # instead of leaving it open until the garbage collector gets to it.
        with open(os.path.join(root, filename), 'r', encoding="utf-8") as handle:
            txt = handle.read()
        results.append(method(filename, txt))
    return results


def sentence_tokens(doc):
    """Return the ``text`` of every sentence in ``doc.sents``, in order."""
    texts = []
    for sentence in doc.sents:
        texts.append(sentence.text)
    return texts
    

def uppercase(txt):
    """Collect the distinct all-caps words (two or more A-Z letters) in ``txt``."""
    # Pattern taken from:
    # https://stackoverflow.com/questions/4598315/regex-to-match-only-uppercase-words-with-some-exceptions
    return {match.group(0) for match in re.finditer(r"\b[A-Z][A-Z]+\b", txt)}


def get_scenes(txt):
    """Split a script into chunks at its scene-heading markers.

    The recognised markers are ``INT.``, ``EXT.``, ``INT./EXT.`` and
    ``EXT./INT.``. The marker itself is consumed by the split.

    Args:
        txt: Full text of the script.

    Returns:
        A list of strings. The first element is whatever precedes the first
        marker (possibly empty); each following element is the text after
        one marker, up to the next marker or the end of the text.
    """
    # The dots are escaped so they only match a literal ".", and \b keeps the
    # pattern from firing inside longer words (PAINTING, EXTRA, POINT.).
    # The combined forms come first so they are consumed as a single marker
    # instead of leaving a stray "/" chunk behind.
    return re.split(r"\b(?:INT\./EXT\.|EXT\./INT\.|INT\.|EXT\.)", txt)

def get_characters(doc):
    """Count the PERSON entities found in ``doc.ents``.

    Each entity's text is stripped and lower-cased before it is counted.

    Returns:
        A ``Counter`` mapping normalised name to number of occurrences.
    """
    tally = Counter()
    for entity in doc.ents:
        if entity.label_ == "PERSON":
            tally[str(entity).strip().lower()] += 1
    return tally
    
def get_data(txt):
    """Derive the scene chunks and the PERSON-name counts of one script.

    Returns:
        ``(scenes, counts)``: the result of ``get_scenes(txt)`` and the
        result of ``get_characters`` applied to the spaCy-parsed text.
    """
    scene_chunks = get_scenes(txt)
    name_counts = get_characters(nlp(txt))
    return scene_chunks, name_counts
  
def get_primary_character(characters):
    """Return the most frequent name in ``characters`` as a string.

    Args:
        characters: A ``collections.Counter`` of names.

    Returns:
        The key with the highest count, or ``""`` when the counter is empty,
        so that one script without any counted name does not abort a whole
        corpus run with ``IndexError``.
    """
    # most_common(1) reads the top entry without sorting the whole counter.
    top = characters.most_common(1)
    return str(top[0][0]) if top else ""

In [None]:
class Film:
    """One script: its name, its protagonist and its scenes in order."""

    def __init__(self, name, protagonist):
        self.name, self.protagonist = name, protagonist
        self.scenes = list()

    def add(self, scene):
        """Append ``scene`` after the scenes already stored."""
        self.scenes.append(scene)
        
class Scene:
    """An ordered collection of the items that make up one scene."""

    def __init__(self):
        self.items = list()

    def add(self, item):
        """Append ``item`` after the items already stored."""
        self.items.append(item)
        
class SceneItem:
    """A run of consecutive script lines inside a scene."""

    def __init__(self):
        self.lines = list()

    def add(self, line):
        """Append one line of text to this item."""
        self.lines.append(line)

    def is_empty(self):
        """Return True while no line has been added."""
        return not self.lines

    def __str__(self):
        # All lines joined by single spaces.
        return " ".join(self.lines)
    
class Dialogue(SceneItem):
    """A scene item attributed to a speaking character."""

    def __init__(self, character):
        # The base initialiser creates the empty ``lines`` list.
        super().__init__()
        self.character = character

    def __str__(self):
        spoken = " ".join(self.lines)
        return self.character + " says, \"" + spoken + "\""

In [None]:
# Scene lines must be walked in order: whether a line counts as speech depends
# on the cue line that precedes it, so this parsing is inherently sequential.
def _flush_item(scene, item):
    """Store ``item`` in ``scene`` unless it carries no text at all."""
    if isinstance(item, Dialogue) and item.is_empty():
        # Sometimes an uppercase line describes things and is not dialogue:
        # a cue with nothing spoken under it is kept as plain narration.
        heading = item.character
        item = SceneItem()
        item.add(heading)
    if not item.is_empty():
        scene.add(item)


def process_scene(scene_txt, characters, film):
    """Parse one scene's text into items and attach the scene to ``film``.

    A new ``Scene`` is added to ``film`` first, then filled line by line:

    * a blank line closes the item being built;
    * a line that is entirely upper-case, or whose lower-cased text is a key
      of ``characters``, opens a ``Dialogue`` for that speaker;
    * any other line is appended to the item being built.

    Whatever is being built is stored before a new item starts and once more
    at the end of the text, so a description directly followed by a cue, or
    a scene that does not end in a blank line, loses nothing.

    Args:
        scene_txt: Raw text of a single scene.
        characters: Mapping whose keys are lower-cased character names.
        film: Object with an ``add(scene)`` method; receives the new scene.

    Returns:
        None. The result is the ``Scene`` appended to ``film``.
    """
    scene = Scene()
    film.add(scene)

    item = SceneItem()
    for raw_line in scene_txt.split("\n"):
        line = raw_line.strip()
        if line == "":
            _flush_item(scene, item)
            item = SceneItem()
        elif line.lower() in characters or line.isupper():
            _flush_item(scene, item)
            item = Dialogue(line)
        else:
            item.add(line)
    # The text may end without a trailing blank line.
    _flush_item(scene, item)

def process_film(filename, txt):
    """Build a ``Film`` from one script.

    Prints ``filename`` followed by ", " as a progress marker, takes the most
    frequent counted name as the protagonist, and parses every scene chunk
    into the film.

    Args:
        filename: Name stored on the resulting ``Film``.
        txt: Full text of the script.

    Returns:
        The populated ``Film``.
    """
    print(filename, end=", ")
    scene_chunks, name_counts = get_data(txt)
    film = Film(filename, get_primary_character(name_counts))
    # Scenes are parsed one after another, in script order.
    for chunk in scene_chunks:
        process_scene(chunk, name_counts, film)
    return film

# Parse every file under data_root; `results` ends up with one Film per file.
results = run(process_film)

snow_falling_on_cedars.txt, fear_and_loathing_in_las_vegas.txt, the_fisher_king.txt, curse_of_the_cat_people.txt, terminator.txt, ghost_ship.txt, blue_velvet.txt, u_turn.txt, 1492_conquest_of_paradise.txt, pirates_of_the_caribbean.txt, star_trek_07_generations.txt, devil_wears_prada_the.txt, on_the_waterfront.txt, kids.txt, basic.txt, alien.txt, fargo.txt, swingers.txt, gandhi.txt, peggy_sue_got_married.txt, rush_hour_2.txt, hannibal.txt, crime_spree.txt, clerks.txt, last_of_the_mohicans.txt, annie_hall.txt, being_there.txt, excalibur.txt, wild_at_heart.txt, midnight_cowboy.txt, meet_john_doe.txt, coco.txt, batman_2_unproduced.txt, one_saliva_bubble.txt, metro.txt, klute.txt, mash.txt, all_about_eve.txt, made_for_each_other.txt, boy_who_never_slept.txt, crash_1996.txt, drop_dead_gorgeous.txt, predator.txt, l.a._confidential.txt, ride_the_high_country.txt, true_believer.txt, pet_sematary.txt, the_night_of_the_hunter.txt, croupier.txt, independence_day.txt, blade_ii.txt, kafka.txt, letha

In [None]:
import sys
# Make the cloned repository's BART example directory importable, so that
# `generation_example` resolves.
sys.path.insert(1,r'./comet-atomic-2020/models/comet_atomic2020_bart')
from generation_example import Comet
print("model loading ...")
# Presumably the directory unpacked from comet-atomic_2020_BART.zip in the setup cell.
comet = Comet("comet-atomic_2020_BART")
comet.model.zero_grad()
print("model loaded")

model loading ...
model loaded


In [None]:
# Flatten every film into one row per scene item. Scene numbers restart at 1
# for each film and item numbers restart at 1 for each scene.
data = []
for film in results:
    # Same value for every row of the film, so it is computed once.
    protagonist = film.protagonist.upper()
    for scene_id, scene in enumerate(film.scenes, start=1):
        for item_id, item in enumerate(scene.items, start=1):
            # Only Dialogue items carry a speaker; other rows get None.
            character = item.character if isinstance(item, Dialogue) else None
            data.append((film.name, scene_id, item_id, item, character, protagonist))


df = pd.DataFrame(data, columns=['film', 'scene', 'item_ids', 'item', 'character', 'protagonist'])
df

Unnamed: 0,film,scene,item_ids,item,character,protagonist
0,youve_got_mail.txt,1,1,You've Got Mail,,JOE
1,youve_got_mail.txt,1,2,by Nora Ephron & Delia Ephron,,JOE
2,youve_got_mail.txt,1,3,Based on:,,JOE
3,youve_got_mail.txt,1,4,The Shop Around The corner,,JOE
4,youve_got_mail.txt,1,5,by Nikolaus Laszlo,,JOE
...,...,...,...,...,...,...
905246,nurse_betty.txt,161,4,"BETTY says, ""Could I get some service here, pl...",BETTY,BETTY
905247,nurse_betty.txt,161,5,"Without looking, the waiter approaches, tops o...",,BETTY
905248,nurse_betty.txt,161,6,"POSTSCRIPT: says, ""Betty Sizemore appeared in ...",POSTSCRIPT:,BETTY
905249,nurse_betty.txt,161,7,FADE OUT:,,BETTY


In [None]:
# Keep only the rows whose speaker is the film's protagonist.
df1 = df.loc[df['character'].notna() & df['character'].eq(df['protagonist'])]
df1

Unnamed: 0,film,scene,item_ids,item,character,protagonist
62,youve_got_mail.txt,8,5,"JOE says, ""Mmmmmhmmm --""",JOE,JOE
65,youve_got_mail.txt,8,8,"JOE says, ""Am I going?""",JOE,JOE
67,youve_got_mail.txt,8,10,"JOE says, ""Can't I just give them money? That...",JOE,JOE
69,youve_got_mail.txt,8,12,"JOE says, ""All right, I'll go. You're late.""",JOE,JOE
174,youve_got_mail.txt,23,4,"JOE says, ""That sounds great.""",JOE,JOE
...,...,...,...,...,...,...
905222,nurse_betty.txt,156,19,"BETTY says, ""... it's too bad you're such an a...",BETTY,BETTY
905227,nurse_betty.txt,157,3,"BETTY says, ""... there's always a chance, David.""",BETTY,BETTY
905230,nurse_betty.txt,157,6,"BETTY says, ""(whispering to him) Doctor, if yo...",BETTY,BETTY
905232,nurse_betty.txt,157,8,"BETTY says, ""No, it's up to us. I love you, D...",BETTY,BETTY


In [None]:
# Relation names; infer() builds one "<item> <relation> [GEN]" query per entry.
relations = ["CapableOf",  "Desires",  "MotivatedByGoal",  "xAttr", "xNeed", "xReact", "xReason", "xWant"]

# Every distinct token registered so far, across all films and relations.
vocabulary = set()

# film -> relation -> set of tokens registered for that film under that relation.
film_bags = {}

def register_in_bag(film, token, relation):
    """Record ``token`` for ``film`` under ``relation`` and in the vocabulary.

    Missing levels of the module-level ``film_bags`` mapping are created on
    demand; the token is also added to the module-level ``vocabulary`` set.
    """
    film_bags.setdefault(film, {}).setdefault(relation, set()).add(token)
    vocabulary.add(token)
    
def process_results(film, results, relations):
    """Register the non-stop-word tokens of every generation for ``film``.

    Args:
        film: Key under which the tokens are stored.
        results: Sequence aligned with ``relations``: ``results[i]`` is an
            iterable of generated strings for ``relations[i]``.
        relations: Relation names, in the same order as ``results``.

    Raises:
        IndexError: if ``results`` has fewer entries than ``relations``.
    """
    # Looked up once instead of once per token.
    stop_words = nlp.Defaults.stop_words
    for index, relation in enumerate(relations):
        for result in results[index]:
            # split() without an argument splits on any whitespace and never
            # yields "", whereas split(' ') returns empty strings for leading,
            # trailing or repeated spaces and those ended up in the vocabulary.
            for token in result.split():
                if token not in stop_words:
                    register_in_bag(film, token, relation)

# Films already announced on stdout, in first-seen order.
log = []
def infer(film, item, relations):
    """Query COMET about ``item`` for every relation and store the tokens.

    One query of the form "<item> <relation> [GEN]" is built per relation and
    passed to ``comet.generate`` with ``decode_method="beam"`` and
    ``num_generate=5``. The film name is printed the first time the film is
    seen, then the generations are handed to ``process_results``.
    """
    queries = []
    for relation in relations:
        queries.append("{} {} [GEN]".format(item, relation))
    generations = comet.generate(queries, decode_method="beam", num_generate=5)
    if film not in log:
        print(film, end=",")
        log.append(film)
    process_results(film, generations, relations)

In [None]:
# Write the protagonist lines to data.csv in the working directory. The frame's
# index is written as well, as an unnamed first column.
df1[['film','item']].to_csv('data.csv')
from google.colab import files
# Hand the file written above to the browser as a download.
files.download("data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): this reads data.csv from Drive, whereas the preceding cell wrote
# it to the working directory and downloaded it; presumably the file was copied
# to Drive by hand. Confirm before relying on Restart & Run All.
# Rebinds df1: after the CSV round trip the 'item' column holds plain text.
df1 = pd.read_csv("/content/drive/MyDrive/ANLP21/data.csv")
# Row count per film.
df1.groupby(['film']).count()

Unnamed: 0_level_0,Unnamed: 0,item
film,Unnamed: 1_level_1,Unnamed: 2_level_1
10_things_i_hate_about_you.txt,218,218
12_monkeys.txt,176,176
13_days.txt,183,183
1492_conquest_of_paradise.txt,19,19
15_minutes.txt,145,145
...,...,...
wild_things.txt,144,144
willow.txt,229,229
witness.txt,146,146
xxx.txt,7,7


In [None]:
# NOTE(review): unpinned install. In the visible notebook, `dd` and `get` are
# only referenced by commented-out code.
!pip install dask[dataframe] --upgrade 
import dask.dataframe as dd
from dask.multiprocessing import get

Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 5.1 MB/s 
Collecting locket
  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Installing collected packages: locket, partd, fsspec
Successfully installed fsspec-2021.11.1 locket-0.2.1 partd-1.2.0


In [None]:
# Call infer() once per row, sequentially. The useful result is the side effect:
# infer() fills film_bags and vocabulary through process_results().
df1.apply(lambda row: infer(row['film'], row['item'], relations), axis=1)

# data = df1[['film','item']]
# ddata = dd.from_pandas(data, npartitions=30)

# def myfunc(row): 
#   return infer(row['film'], row['item'], relations)


# res = ddata.map_partitions(lambda df: df.apply((lambda row: myfunc(*row)), axis=1)).compute(get=get)  


youve_got_mail.txt,the_hebrew_hammer.txt,the_cat_people.txt,backdraft.txt,trainspotting.txt,mighty_morphin_power_rangers.txt,the_english_patient.txt,pleasantville.txt,american_outlaws.txt,five_easy_pieces.txt,tremors.txt,the_woodsman.txt,crazy_love_was_committed.txt,the_game.txt,star_trek_01_the_motion_picture.txt,the_bourne_identity.txt,the_lord_of_the_rings_the_return_of_the_king.txt,the_time_machine.txt,pitch_black.txt,eight_millimeter.txt,the_african_queen.txt,mumford.txt,cast_away.txt,how_to_train_your_dragon.txt,stalag_17.txt,dragon_slayer.txt,the_princess_bride.txt,spiderman.txt,juno.txt,enemy_of_the_state.txt,it_happened_one_night.txt,platoon.txt,twelve_monkeys.txt,conquest_of_paradise_1492.txt,the_battle_of_shaker_heights.txt,new_nightmare.txt,el_mariachi.txt,some_like_it_hot.txt,deep_cover.txt,erik_the_viking.txt,a_nightmare_on_elm_street.txt,happy_birthday_wanda_june.txt,10_things_i_hate_about_you.txt,the_sting.txt,bruce_almighty.txt,ferris_buellers_day_off.txt,investigation

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

# Fix the column order once. Sorting makes it reproducible from run to run,
# which the iteration order of a set of strings is not.
vocabulary = sorted(vocabulary)
film_vectors = []
for film in film_bags:
    # Walk `relations` rather than the film's own keys, so every film yields
    # the same number of rows in the same order even when one relation never
    # produced a token for it. Otherwise X is ragged or has misaligned rows.
    relation_bag = []
    for relation in relations:
        bag = film_bags[film].get(relation, set())
        relation_bag.append([1 if word in bag else 0 for word in vocabulary])
    film_vectors.append(relation_bag)

# Binary presence tensor: one (relation x vocabulary) matrix per film.
X = np.array(film_vectors)
w, h, d = len(film_vectors), len(relations), len(vocabulary)

NameError: ignored

In [None]:
def plot_3d(X, w, h, d, perplexity=40, random_state=23):
    """Embed the rows of ``X`` in three dimensions with t-SNE and plot them.

    Args:
        X: Array of shape ``(w, h, d)``; it is flattened to ``(w, h * d)``
            before being embedded.
        w: Expected size of the first axis of ``X``.
        h: Expected size of the second axis of ``X``.
        d: Expected size of the third axis of ``X``.
        perplexity: Passed to ``TSNE``; defaults to the value used so far.
        random_state: Passed to ``TSNE``; defaults to the value used so far.

    Raises:
        AssertionError: if ``X.shape`` is not ``(w, h, d)``.
    """
    assert X.shape == (w, h, d)
    flat = X.reshape((w, h * d))

    tsne_model = TSNE(perplexity=perplexity, n_components=3, init='pca', n_iter=2500, random_state=random_state)
    new_values = tsne_model.fit_transform(flat)

    plt.figure(figsize=(16, 16))
    ax = plt.axes(projection='3d')
    # No per-point colour values are supplied, so a colormap argument would be
    # ignored (and warned about); it is therefore not passed.
    ax.scatter3D(new_values[:, 0], new_values[:, 1], new_values[:, 2])
    # Shown explicitly, as plot_2d does.
    plt.show()


def plot_2d(X, w, h, d, perplexity=40, random_state=23):
    """Embed the rows of ``X`` in two dimensions with t-SNE and plot them.

    Args:
        X: Array of shape ``(w, h, d)``; it is flattened to ``(w, h * d)``
            before being embedded.
        w: Expected size of the first axis of ``X``.
        h: Expected size of the second axis of ``X``.
        d: Expected size of the third axis of ``X``.
        perplexity: Passed to ``TSNE``; defaults to the value used so far.
        random_state: Passed to ``TSNE``; defaults to the value used so far.

    Raises:
        AssertionError: if ``X.shape`` is not ``(w, h, d)``.
    """
    assert X.shape == (w, h, d)
    flat = X.reshape((w, h * d))
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=random_state)
    new_values = tsne_model.fit_transform(flat)

    plt.figure(figsize=(16, 16))
    # No per-point colour values are supplied, so a colormap argument would be
    # ignored (and warned about); it is therefore not passed.
    plt.scatter(new_values[:, 0], new_values[:, 1])
    plt.show()

In [None]:
# 3-D t-SNE view of the film vectors built above.
plot_3d(X, w, h, d)

In [None]:
# 2-D t-SNE view of the same film vectors.
plot_2d(X, w, h, d)