In [58]:
import pandas as pd
import urllib2
from bs4 import BeautifulSoup, Tag, UnicodeDammit
import re
import sys
import os
import time

In [2]:
METADATA_FILE = '../data/data/movies_metadata.csv'
metadata_frame = pd.read_csv(METADATA_FILE)

In [3]:
english_movies = list(metadata_frame[metadata_frame['original_language']=='en'].title.unique())

In [4]:
# male: he,him,his,himself,man,boy,lord,sir,father,grandfather,son,grandson,brother,husband,boyfriend,uncle,nephew
# female: she,her,hers,herself,woman,girl,lady,madam,mother,grandmother,daughter,granddaughter,sister,wife,girlfriend,aunt,niece

gender_patterns = [r"(?i)\bhe\b",
                   r"(?i)\bhim\b",
                   r"(?i)\bhis\b",
                   r"(?i)\bhimself\b",
                   r"(?i)\bman\b",
                   r"(?i)\bboy\b",
                   r"(?i)\blord\b",
                   r"(?i)\bsir\b",
                   r"(?i)\bfather\b",
                   r"(?i)\bgrandfather\b",
                   r"(?i)\bson\b", 
                   r"(?i)\bgrandson\b", 
                   r"(?i)\bbrother\b",
                   r"(?i)\bhusband\b",
                   r"(?i)\bboyfriend\b",
                   r"(?i)\buncle\b",
                   r"(?i)\bnephew\b",
                   r"(?i)\bshe\b",
                   r"(?i)\bher\b",
                   r"(?i)\bhers\b",
                   r"(?i)\bherself\b",
                   r"(?i)\bwoman\b",
                   r"(?i)\bgirl\b",
                   r"(?i)\blady\b",
                   r"(?i)\bmadam\b",
                   r"(?i)\bmother\b",
                   r"(?i)\bgrandmother\b",
                   r"(?i)\bdaughter\b", 
                   r"(?i)\bgranddaughter\b", 
                   r"(?i)\bsister\b",
                   r"(?i)\bwife\b",
                   r"(?i)\bgirlfriend\b",
                   r"(?i)\baunt\b",
                   r"(?i)\bniece\b"]

In [24]:
gender_data = []
completed_movies = []

In [34]:
for i in range(len(english_movies)):
    
    # get movie name and script url
    movie = english_movies[i]
    if movie in completed_movies:
        continue
    if movie.split()[0] == 'The':
        movie_name = '-'.join(movie.split()[1:])+',-The'
    else:
        movie_name = '-'.join(movie.split())
    script_url = 'http://www.imsdb.com/scripts/' + movie_name + '.html'
    
    # get script from IMSDB website
    try:
        html_doc = urllib2.urlopen(script_url).read()
    except:
        continue
    soup = BeautifulSoup(html_doc, 'lxml')
    script = soup.find("pre")
    if script == None:
        #write movie_name in file
        continue
    if script.find("pre"):
        if script.find('pre').text != u'\n':
            script = script.find("pre")
    script_text = u''
    for block in script.descendants:
        if(isinstance(block, Tag)):
            continue
        block = UnicodeDammit(block, soup.original_encoding).unicode_markup
        block = block.strip('\n')
        script_text += block
    
    # create bag of words for gender terms
    bow_vector = [0] * len(gender_patterns)
    for j in range(len(gender_patterns)):
        pattern = gender_patterns[j]
        bow_vector[j] = len(re.findall(pattern, script_text))
    if bow_vector == [0] * len(gender_patterns):
        #write movie_name in file
        continue
    
    # write results
    completed_movies.append(movie)
    gender_data.append([movie, script_url, bow_vector])
    sys.stdout.write('\rCompleted '+str(len(completed_movies))+' of '+str(len(english_movies))+'. ')
    
    # pause for a minute after 8 runs
    if (len(completed_movies)%8 == 0):
        time.sleep(60)

Completed 304 of 30357. 

KeyboardInterrupt: 

In [35]:
gender_dataframe = pd.DataFrame(gender_data, columns=['movie', 'url', 'vector'])

In [40]:
gender_dataframe.to_csv('../data/processed/gender_bow.csv')

In [89]:
from scipy import spatial
from sklearn.cluster import SpectralClustering
import numpy as np

In [46]:
vectors = list(gender_dataframe['vector'][:300])

In [91]:
affinity_matrix = [[0] * 300]*300

In [92]:
for i in range(len(vectors)):
    for j in range(len(vectors)):
        affinity_matrix[i][j] = 1 - spatial.distance.cosine(vectors[i], vectors[j])

In [93]:
len(affinity_matrix)

300

## Toy Example

In [72]:
v1 = list(gender_dataframe[gender_dataframe['movie']=='Toy Story']['vector'])[0]

In [73]:
v2 = list(gender_dataframe[gender_dataframe['movie']=='Braveheart']['vector'])[0]

In [83]:
v3 =  list(gender_dataframe[gender_dataframe['movie']=='Notting Hill']['vector'])[0]

In [75]:
1 - spatial.distance.cosine(v1, v2)

0.9572930887746768

In [84]:
1 - spatial.distance.cosine(v1, v3)

0.6683392206816687

In [85]:
1 - spatial.distance.cosine(v2, v3)

0.8171069568540178