In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [5]:
# Field delimiter used by the corpus files: the literal text ' +++$+++ '.
# It is passed to pandas as a regular expression, so '+' and '$' are escaped.
# A raw string keeps the backslashes verbatim and avoids Python's
# invalid-escape-sequence warning for '\+' and '\$'.
sep = r' \+\+\+\$\+\+\+ '

In [6]:
def tolist(x):
    """Parse a list rendered as text, e.g. "['a', 'b']", into ['a', 'b'].

    The surrounding square brackets are removed, the remainder is split on
    ', ' and the single quotes around each item are stripped.

    Parameters
    ----------
    x : str
        Text of the form "['item1', 'item2', ...]".

    Returns
    -------
    list of str
        The items without brackets or quotes. An empty list ("[]") yields
        [] rather than a list holding one empty string.
    """
    # Tolerate stray whitespace around the brackets before removing them.
    inner = x.strip().lstrip('[]').rstrip(']')
    if not inner:
        # ''.split(', ') would give [''], i.e. one bogus empty item.
        return []
    return [w.strip("'") for w in inner.split(', ')]

# Reset database (drop existing collections)

In [1]:
import pymongo

In [7]:
mydb = pymongo.MongoClient()['movie-dialogs']

# Start from a clean slate: drop every collection this notebook fills later on,
# so that re-running the notebook does not insert the same documents twice.
for collection_name in ("movies", "characters", "lines", "conversations"):
    mycol = mydb[collection_name]
    mycol.drop()

# Movies

In [8]:
movies_file = 'dati/CornellMDCorpus/movie_titles_metadata.txt'
columns = ['title', 'year', 'rating', 'votes', 'genres']
# The first field of each record becomes the index; `columns` names the rest.
movies_raw = pd.read_csv(
    movies_file,
    names=columns,
    header=None,
    index_col=0,
    sep=sep,
    engine='python',
    encoding='latin1',
)
# Scalar attributes only: every column except the trailing 'genres'.
movies = movies_raw[columns[:-1]]
# One (movie id, genre) pair per genre of each movie.
genres = [
    (movie_id, genre)
    for movie_id, genre_field in movies_raw['genres'].items()
    for genre in tolist(genre_field)
]

In [9]:
# Peek at the raw movie table; `genres` is still one string per row at this point.
movies_raw.head()

Unnamed: 0,title,year,rating,votes,genres
m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
m1,1492: conquest of paradise,1992,6.2,10421,"['adventure', 'biography', 'drama', 'history']"
m2,15 minutes,2001,6.1,25854,"['action', 'crime', 'drama', 'thriller']"
m3,2001: a space odyssey,1968,8.4,163227,"['adventure', 'mystery', 'sci-fi']"
m4,48 hrs.,1982,6.9,22289,"['action', 'comedy', 'crime', 'drama', 'thrill..."


In [10]:
# (rows, columns) of the raw movie table.
movies_raw.shape

(617, 5)

In [11]:
# Same rows as `movies_raw`, without the `genres` column.
movies.head()

Unnamed: 0,title,year,rating,votes
m0,10 things i hate about you,1999,6.9,62847
m1,1492: conquest of paradise,1992,6.2,10421
m2,15 minutes,2001,6.1,25854
m3,2001: a space odyssey,1968,8.4,163227
m4,48 hrs.,1982,6.9,22289


In [12]:
# (rows, columns) of `movies`.
movies.shape

(617, 4)

In [13]:
# First ten (movie id, genre) pairs of the flattened genre list.
genres[:10]

[('m0', 'comedy'),
 ('m0', 'romance'),
 ('m1', 'adventure'),
 ('m1', 'biography'),
 ('m1', 'drama'),
 ('m1', 'history'),
 ('m2', 'action'),
 ('m2', 'crime'),
 ('m2', 'drama'),
 ('m2', 'thriller')]

# Characters 

In [14]:
characters_file = 'dati/CornellMDCorpus/movie_characters_metadata.txt'
columns = ['name', 'movie', 'title', 'gender', 'pos']
# The first field of each record becomes the index; `columns` names the rest.
characters = pd.read_csv(
    characters_file,
    names=columns,
    header=None,
    index_col=0,
    sep=sep,
    engine='python',
    encoding='latin1',
)

In [15]:
# Peek at the character table; the index holds the character id.
characters.head()

Unnamed: 0,name,movie,title,gender,pos
u0,BIANCA,m0,10 things i hate about you,f,4
u1,BRUCE,m0,10 things i hate about you,?,?
u2,CAMERON,m0,10 things i hate about you,m,3
u3,CHASTITY,m0,10 things i hate about you,?,?
u4,JOEY,m0,10 things i hate about you,m,6


In [16]:
# (rows, columns) of the character table.
characters.shape

(9035, 5)

# Movie lines

In [17]:
lines_file = 'dati/CornellMDCorpus/movie_lines.txt'
columns = ['character', 'movie', 'name', 'text']
# The first field of each record becomes the index; `columns` names the rest.
movie_lines = pd.read_csv(
    lines_file,
    names=columns,
    header=None,
    index_col=0,
    sep=sep,
    engine='python',
    encoding='latin1',
)

In [18]:
# Peek at the dialogue lines; the index holds the line id.
movie_lines.head()

Unnamed: 0,character,movie,name,text
L1045,u0,m0,BIANCA,They do not!
L1044,u2,m0,CAMERON,They do to!
L985,u0,m0,BIANCA,I hope so.
L984,u2,m0,CAMERON,She okay?
L925,u0,m0,BIANCA,Let's go.


In [19]:
# (rows, columns) of the dialogue-line table.
movie_lines.shape

(304713, 4)

# Conversations

In [20]:
conversations_file = 'dati/CornellMDCorpus/movie_conversations.txt'
columns = ['character_a', 'character_b', 'movie', 'lines']
# No index column here: rows keep pandas' default integer index.
conversations = pd.read_csv(
    conversations_file,
    names=columns,
    header=None,
    sep=sep,
    engine='python',
    encoding='latin1',
)

In [21]:
# Peek at the conversations; `lines` is a stringified list of line ids.
conversations.head()

Unnamed: 0,character_a,character_b,movie,lines
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


In [22]:
# (rows, columns) of the conversation table.
conversations.shape

(83097, 4)

# To MongoDB

In [23]:
import pymongo

In [24]:
# Handle on the target database; MongoClient() is called with its default settings.
mongo_client = pymongo.MongoClient()
db = mongo_client['movie-dialogs']

In [25]:
# Dry run on the first movie only: print the row as read from the file,
# then the document it is turned into.
for movie_id, movie_row in movies_raw.head(1).iterrows():
    record = dict(movie_row)
    print(record)
    record.update(genres=tolist(movie_row.genres), id=movie_id)
    try:
        record['year'] = int(movie_row.year)
    except ValueError:
        # A year that is not a plain integer is left out of the document.
        record.pop('year')
    print(record)

{'title': '10 things i hate about you', 'year': '1999', 'rating': 6.9, 'votes': 62847, 'genres': "['comedy', 'romance']"}
{'title': '10 things i hate about you', 'year': 1999, 'rating': 6.9, 'votes': 62847, 'genres': ['comedy', 'romance'], 'id': 'm0'}


In [26]:
# Movie documents keyed by movie id.
movie_records = {}
for movie_id, movie_row in movies_raw.iterrows():
    fields = dict(movie_row)
    # Genres become a real list and the index label is stored as 'id'.
    fields.update(genres=tolist(movie_row.genres), id=movie_id)
    try:
        fields['year'] = int(movie_row.year)
    except ValueError:
        # A year that is not a plain integer is left out of the document.
        fields.pop('year')
    movie_records[movie_id] = fields

In [27]:
# Write all movie documents to the 'movies' collection in one call.
movie_collection = db['movies']
movie_collection.insert_many(list(movie_records.values()))

<pymongo.results.InsertManyResult at 0x1cbbe0fe2e0>

In [28]:
# Inspect one record after the insert: the dict now also carries an `_id`.
movie_records['m0']

{'title': '10 things i hate about you',
 'year': 1999,
 'rating': 6.9,
 'votes': 62847,
 'genres': ['comedy', 'romance'],
 'id': 'm0',
 '_id': ObjectId('63e0d1b8e78cf63e11c5e9c7')}

In [29]:
# Character documents keyed by character id, each embedding its movie.
character_records = {}
for char_id, char_row in characters.iterrows():
    entry = dict(char_row)
    # The movie dict is embedded as-is (same object as in movie_records, no copy),
    # so removing '_id' here removes it from movie_records as well.
    movie_doc = movie_records[char_row.movie]
    entry['movie'] = movie_doc
    # The title is already part of the embedded movie.
    entry.pop('title')
    movie_doc.pop('_id', None)
    try:
        entry['pos'] = int(entry['pos'])
    except ValueError:
        # A non-numeric position (shown as '?' in the table preview) is left out.
        entry.pop('pos')
    character_records[char_id] = entry

In [30]:
# Write all character documents to the 'characters' collection in one call.
characters_collection = db['characters']
characters_collection.insert_many(list(character_records.values()))

<pymongo.results.InsertManyResult at 0x1cbc9982fd0>

In [31]:
# Look at a single dialogue line as a plain dict.
for line_id, line_row in movie_lines.head(1).iterrows():
    c = dict(line_row)
    print(c)

{'character': 'u0', 'movie': 'm0', 'name': 'BIANCA', 'text': 'They do not!'}


In [32]:
# Inspect one character after the insert: the embedded movie has no `_id`,
# while the character dict itself now carries one.
character_records['u0']

{'name': 'BIANCA',
 'movie': {'title': '10 things i hate about you',
  'year': 1999,
  'rating': 6.9,
  'votes': 62847,
  'genres': ['comedy', 'romance'],
  'id': 'm0'},
 'gender': 'f',
 'pos': 4,
 '_id': ObjectId('63e0d1bbe78cf63e11c5ec30')}

In [33]:
# Line documents keyed by line id, each embedding its speaking character.
line_records = {}
for line_id, line_row in movie_lines.iterrows():
    entry = dict(line_row)
    speaker_id = entry['character']
    # The character dict is embedded as-is (same object as in character_records,
    # no copy): adding 'id' and removing '_id' below also changes character_records.
    speaker = character_records[speaker_id]
    speaker['id'] = speaker_id
    entry['character'] = speaker
    # Movie and name are already available through the embedded character.
    for redundant_key in ('movie', 'name'):
        entry.pop(redundant_key)
    speaker.pop('_id', None)
    entry['id'] = line_id
    line_records[line_id] = entry

In [34]:
# Inspect one line record with its embedded character (and that character's movie).
line_records['L1045']

{'character': {'name': 'BIANCA',
  'movie': {'title': '10 things i hate about you',
   'year': 1999,
   'rating': 6.9,
   'votes': 62847,
   'genres': ['comedy', 'romance'],
   'id': 'm0'},
  'gender': 'f',
  'pos': 4,
  'id': 'u0'},
 'text': 'They do not!',
 'id': 'L1045'}

In [35]:
# Write all line documents to the 'lines' collection in one call.
line_collection = db['lines']
line_collection.insert_many(list(line_records.values()))

<pymongo.results.InsertManyResult at 0x1cbcbcc0340>

In [36]:
# The character dict now has 'id' and no '_id': the loop that built line_records
# edited the character_records entries in place.
character_records['u0'].items()

dict_items([('name', 'BIANCA'), ('movie', {'title': '10 things i hate about you', 'year': 1999, 'rating': 6.9, 'votes': 62847, 'genres': ['comedy', 'romance'], 'id': 'm0'}), ('gender', 'f'), ('pos', 4), ('id', 'u0')])

In [37]:
# Same character with the nested movie left out.
{k: v for k, v in character_records['u0'].items() if k not in ['movie']}

{'name': 'BIANCA', 'gender': 'f', 'pos': 4, 'id': 'u0'}

In [38]:
def _character_summary(character_id):
    """Return a character's fields for embedding in a conversation document.

    The nested 'movie' document and any MongoDB '_id' are left out, and 'id'
    is always set to `character_id`, so the result does not depend on whether
    an earlier cell already edited the character dict in place.
    """
    summary = {k: v for k, v in character_records[character_id].items()
               if k not in ('movie', '_id')}
    summary['id'] = character_id
    return summary


# Conversation documents keyed by the row index of `conversations`.
conversation_records = {}
for i, row in conversations.iterrows():
    c = dict(row)
    c['character_a'] = _character_summary(c['character_a'])
    c['character_b'] = _character_summary(c['character_b'])
    # Embed a copy of the movie without '_id' instead of the shared dict, so
    # movie_records is left untouched and no movie '_id' leaks into the
    # conversation.
    c['movie'] = {k: v for k, v in movie_records[c['movie']].items() if k != '_id'}
    # Resolve the stringified list of line ids into the line documents.
    lines_raw = [line_records[line_id] for line_id in tolist(c['lines'])]
    # Keep a compact view of each line: id, text, speaker id and speaker gender.
    lines = [{'line': x['id'], 'text': x['text'],
              'character': x['character']['id'],
              'gender': x['character']['gender']} for x in lines_raw]
    c['lines'] = lines
    c['len'] = len(lines)
    conversation_records[i] = c

In [39]:
# Write all conversation documents to the 'conversations' collection in one call.
conversations_collection = db['conversations']
conversations_collection.insert_many(list(conversation_records.values()))

<pymongo.results.InsertManyResult at 0x1cbcfd6eee0>