#### converting the movie corpus into Corpus format 
- data source = https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html 

In [36]:
from convokit import Corpus, User, Utterance
from tqdm import tqdm

#### creating users
Each character in a movie is considered a user, metadata for them are in movie_characters_metadata.txt. There are 9,035 characters in total. 

Note that since character names can overlap, the user_id provided in the original dataset is used as the username, whereas the actual character name is saved in the user metadata.

For each user, metadata include the following information: 
    * name of the character.
    * idx and name of the movie this character is from
    * gender(available for 3,774 characters)
    * position on movie credits (3,321 characters available)

In [30]:
# Directory holding the raw Cornell Movie-Dialogs files. The trailing slash
# matters: file names are appended to this value by plain string concatenation.
data_dir = "../cornell_movie_dialogs_corpus/"

In [31]:
# Read the raw character metadata, one record per line (line endings kept).
# errors="ignore" means bytes that are not valid UTF-8 are silently dropped.
characters_path = data_dir + "movie_characters_metadata.txt"
with open(characters_path, mode="r", encoding="utf-8", errors="ignore") as characters_file:
    user_data = list(characters_file)

In [25]:
# Map each user id (first field of a record) to a dict of its remaining fields.
# Fields are separated by "+++$+++" and stripped of surrounding whitespace.
parsed_user_records = (
    [field.strip() for field in record.split("+++$+++")]
    for record in user_data
)
user_meta = {
    fields[0]: {
        "character_name": fields[1],
        "movie_idx": fields[2],
        "movie_name": fields[3],
        "gender": fields[4],
        "credit_pos": fields[5],
    }
    for fields in parsed_user_records
}

- create a User object for each unique character in the data, which will be used for the Utterance objects later

In [26]:
# One User per character: the dataset's user id is the name, the parsed
# fields are attached as metadata.
corpus_users = {}
for user_id, user_fields in user_meta.items():
    corpus_users[user_id] = User(name=user_id, meta=user_fields)

In [21]:
# Sanity check: number of User objects created (one per unique user id).
len(corpus_users)

9035

- checking on meta data for an example user

In [28]:
# Show the metadata dict stored on one example user.
corpus_users['u0'].meta

{'character_name': 'BIANCA',
 'movie_idx': 'm0',
 'movie_name': '10 things i hate about you',
 'gender': 'f',
 'credit_pos': '4'}

#### creating utterance objects
Utterances can be found in movie_lines.txt file. There are 304,713 utterances in total in this dataset. 

An utterance object normally expects at least:
- id: the unique id of the utterance. 
- user: the user giving the utterance.
- root: the id of the root utterance of the conversation.
- reply_to: id of the utterance this was a reply to.
- timestamp: timestamp of the utterance. 
- text: text of the utterance.

Additional information associated with the utterance, e.g., in this case, the movie this utterance is coming from, may be saved as utterance level metadata.

In [38]:
# Read the raw utterance records, one per line (line endings kept).
# errors="ignore" means bytes that are not valid UTF-8 are silently dropped.
lines_path = data_dir + "movie_lines.txt"
with open(lines_path, mode="r", encoding="utf-8", errors="ignore") as lines_file:
    utterance_data = list(lines_file)

In [46]:
# Build one Utterance per raw record, keyed by the utterance id.
utterance_corpus = {}
for raw_utterance in tqdm(utterance_data):
    fields = [field.strip() for field in raw_utterance.split("+++$+++")]

    # fields[3] (the character name) is skipped: the User object already has it
    line_id = fields[0]
    speaker_id = fields[1]
    movie_id = fields[2]
    text = fields[4]

    # root and reply_to are filled in by a later cell; there is no timestamp
    # in this dataset, so all three positional slots start out as None
    utterance_corpus[line_id] = Utterance(
        line_id, corpus_users[speaker_id], None, None, None, text,
        meta={'movie_id': movie_id},
    )

100%|██████████| 304713/304713 [00:02<00:00, 108504.86it/s]


In [52]:
# Sanity check: number of Utterance objects created. The dict built above is
# named `utterance_corpus`; `utter_corpus` is not defined anywhere in this
# notebook and would raise NameError on a fresh kernel.
len(utterance_corpus)

304713

- the utterance object now contains utterance idx, user, and text, with movie idx as meta data

In [60]:
# Inspect one example utterance; root and reply_to have not been filled in yet.
utterance_corpus['L1044'] 

Utterance({'id': 'L1044', 'user': User([('name', 'u2')]), 'root': None, 'reply_to': None, 'timestamp': None, 'text': 'They do to!', 'other': None, 'meta': {'movie_id': 'm0'}})

In [61]:
# Utterance-level metadata for the same example: only the movie id is stored.
utterance_corpus['L1044'].meta

{'movie_id': 'm0'}

#### updating root and reply_to information to utterances
movie_conversations.txt provides the structure of conversations that organizes the above utterances. This will allow us to add the missing root and reply_to information to individual utterances. 

In [63]:
# Read the raw conversation records, one per line (line endings kept).
# errors="ignore" means bytes that are not valid UTF-8 are silently dropped.
conversations_path = data_dir + "movie_conversations.txt"
with open(conversations_path, mode="r", encoding="utf-8", errors="ignore") as conversations_file:
    convo_data = list(conversations_file)

In [64]:
import ast

In [65]:
# Fill in `root` and `reply_to` on every utterance using the conversation
# structure. Each record holds four "+++$+++"-separated fields: the two user
# ids of the speakers, a third field (bound to `m`, unused here), and the
# list of utterance ids written as a Python list literal.
for convo_record in tqdm(convo_data):

    user1, user2, m, convo = [field.strip() for field in convo_record.split("+++$+++")]

    # the id list is stored as text; literal_eval parses it without executing code
    convo_seq = ast.literal_eval(convo)

    # the first utterance of the sequence is the root of the conversation
    root = convo_seq[0]

    # convo_seq is a list of utterance ids, arranged in conversational order
    for i, line in enumerate(convo_seq):

        # sanity checking: user giving the utterance is indeed in the pair of characters provided
        # (fixed: the utterance dict is `utterance_corpus`; `utter_corpus` was undefined)
        if utterance_corpus[line].user.name not in [user1, user2]:
            print("user mismatch in line {0}".format(i))

        utterance_corpus[line].root = root

        # NOTE(review): the root utterance is recorded as a reply to itself;
        # confirm whether downstream ConvoKit code expects None for roots instead.
        if i == 0:
            utterance_corpus[line].reply_to = root
        else:
            utterance_corpus[line].reply_to = convo_seq[i-1]

100%|██████████| 83097/83097 [00:02<00:00, 29433.29it/s]


- check now that utterances have root and reply_to information

In [71]:
# Inspect an utterance after the update: root and reply_to should now be set.
utterance_corpus['L666499']

Utterance({'id': 'L666499', 'user': User([('name', 'u9028')]), 'root': 'L666497', 'reply_to': 'L666498', 'timestamp': None, 'text': 'How quickly can you move your artillery forward?', 'other': None, 'meta': {'movie_id': 'm616'}})

In [69]:
# Metadata of another example utterance, to confirm the movie id is still attached.
utterance_corpus['L666497'].meta

{'movie_id': 'm616'}

#### creating corpus from list of utterances

In [72]:
# Flatten the id -> Utterance mapping into a plain list (insertion order kept).
utterance_list = list(utterance_corpus.values())

In [74]:
# Assemble the Corpus from the flat list of Utterance objects.
movie_corpus = Corpus(utterances=utterance_list)

In [75]:
# Number of conversations the Corpus contains.
len(movie_corpus.get_conversation_ids())

83097

In [76]:
# Materialize the conversation ids as a list so they can be sliced below.
convo_ids = list(movie_corpus.get_conversation_ids())

- example conversations, note that there is no order guarantee

In [78]:
# Print the utterance ids of the first five conversations.
for convo_id in convo_ids[:5]:
    conversation = movie_corpus.get_conversation(convo_id)
    print(conversation.get_utterance_ids())

['L1045', 'L1044']
['L985', 'L984']
['L925', 'L924']
['L872', 'L871', 'L870']
['L869', 'L868', 'L867', 'L866']


In [120]:
# Sanity check: the Corpus should hold as many utterances as were read from
# movie_lines.txt.
len(movie_corpus.get_utterance_ids())

304713

#### adding parses for utterances

In [95]:
from convokit import Parser

In [96]:
# Instantiate ConvoKit's Parser with its default arguments.
annotator = Parser()

- this is a good place to get a progress bar

In [97]:
# Run the parser over the corpus and rebind movie_corpus to the result.
# NOTE(review): presumably this is what adds the 'parsed' utterance field
# listed by meta_index later in the notebook — confirm against ConvoKit docs.
movie_corpus = annotator.fit_transform(movie_corpus)

#### updating Corpus level metadata:
In this dataset, there are a few sets of additional information about a total of 617 movies, e.g., genres, release year, url from which the raw sources are retrieved. These will all be saved as Corpus level meta data (since these are not utterance or conversational level information)

In [146]:
# Record a human-readable dataset name in the corpus-level metadata.
movie_corpus.meta['name'] = "Cornell Movie-Dialogs Corpus"

- urls

In [152]:
# Read the raw script-URL records, one per line (line endings kept).
# errors="ignore" means bytes that are not valid UTF-8 are silently dropped.
urls_path = data_dir + "raw_script_urls.txt"
with open(urls_path, mode="r", encoding="utf-8", errors="ignore") as urls_file:
    urls = list(urls_file)

In [153]:
# Start the per-movie metadata table from the URL records: each record must
# split into exactly three "+++$+++"-separated fields (id, title, url).
movie_meta = {}
for url_record in urls:
    fields = [field.strip() for field in url_record.split("+++$+++")]
    movie_id, title, url = fields
    movie_meta[movie_id] = dict(title=title, url=url)

In [154]:
# Number of movies that have a script-URL record.
len(movie_meta)

617

- additional movie meta

In [155]:
# Read the raw movie-title metadata records, one per line (line endings kept).
# errors="ignore" means bytes that are not valid UTF-8 are silently dropped.
titles_path = data_dir + "movie_titles_metadata.txt"
with open(titles_path, mode="r", encoding="utf-8", errors="ignore") as titles_file:
    movie_extra = list(titles_file)

In [156]:
# Merge release year, rating, votes and genre into the existing per-movie
# entries. Each record must split into exactly six fields, and its movie id
# must already be present in movie_meta (a missing id raises KeyError).
# All values are kept as the raw strings read from the file.
for title_record in movie_extra:
    fields = [field.strip() for field in title_record.split("+++$+++")]
    movie_id, title, year, rating, votes, genre = fields
    movie_meta[movie_id].update({
        'release_year': year,
        'rating': rating,
        'votes': votes,
        'genre': genre,
    })

In [157]:
# Inspect the merged metadata for one example movie.
movie_meta['m23']

{'title': 'the avengers',
 'url': 'http://www.dailyscript.com/scripts/Avengers.html',
 'release_year': '1998',
 'rating': '3.40',
 'votes': '21519',
 'genre': "['action', 'adventure', 'thriller']"}

In [159]:
# Attach the per-movie dictionary (keyed by movie id) as corpus-level metadata.
movie_corpus.meta['movie_metadata'] = movie_meta

#### saving created datasets

- checking available information of the data

In [170]:
from convokit import meta_index

In [171]:
# Summarize which metadata fields exist at each level of the in-memory corpus.
meta_index(corpus = movie_corpus)

{'utterances-index': {'movie_id': "<class 'str'>", 'parsed': 'bin'},
 'users-index': {'character_name': "<class 'str'>",
  'movie_idx': "<class 'str'>",
  'movie_name': "<class 'str'>",
  'gender': "<class 'str'>",
  'credit_pos': "<class 'str'>"},
 'conversations-index': {},
 'overall-index': {}}

- saving dataset

In [173]:
# probably another good place to add progress bar

In [172]:
# Save the corpus under the name "movie_corpus", prefixed with data_dir.
movie_corpus.dump(data_dir + "movie_corpus")

- after saving, the available info from the dataset can be checked directly, without loading

In [193]:
# Same summary, but read from the saved corpus by file name instead of from
# the in-memory Corpus object.
meta_index(filename = data_dir + "movie_corpus")

{'utterances-index': {'movie_id': "<class 'str'>", 'parsed': 'bin'},
 'users-index': {'character_name': "<class 'str'>",
  'movie_idx': "<class 'str'>",
  'movie_name': "<class 'str'>",
  'gender': "<class 'str'>",
  'credit_pos': "<class 'str'>"},
 'conversations-index': {},
 'overall-index': {'name': "<class 'str'>",
  'movie_metadata': "<class 'dict'>"}}