In [1]:
import pandas as pd
from tqdm import tqdm as pbar
import numpy as np
from itertools import islice
import string
from scipy.sparse import lil_matrix, csc_matrix
from itertools import combinations
from tqdm import tqdm_notebook as timer
from scipy.misc import comb
import re
from collections import defaultdict

# Generate the graph

## Step 1 : Iterate through the input files, discarding lines with fewer than 10 movies. Generate a actor to movies mapping, and a reverse movies to actor mapping

The lines are cleaned by:
- Removing unprintable characters
- Removing stuff in brackets () or {}

In [2]:
A = {}
M = {}

lc = {"data/actress_movies.txt" : 1182813, "data/actor_movies.txt" : 2167653}

printable = set(string.printable)

def clean_string(s):
    return(re.sub(r'\(.*\)|\{.*\}|\'|\"', "", s).lstrip().rstrip())

for fname in ["data/actor_movies.txt", "data/actress_movies.txt"]:
    with open(fname, "r") as f:
        for line in timer(f, total = lc[fname], desc=fname.split("/")[1]):
            if line.count('\t\t') < 10:
                pass
            else:
                line = filter(lambda x : x in printable, line.decode('latin1')).encode('ascii')
                splits = line.split("\t\t")
                actor_name = splits[0]
                movies = set(map(clean_string, splits[1:]))
                
                if actor_name in A:
                    A[actor_name] = A[actor_name].union(movies)
                else:
                    A[actor_name] = movies

                for movie in movies:
                    if movie not in M:
                        M[movie] = [actor_name]
                    else:
                        M[movie].append(actor_name)





# Next find movies with less than 10 actors and drop these movies. This could easily be a cyclic process, so we STOP here!

In [3]:
for (m, alist) in M.iteritems():
    if len(alist) <= 10 or m == '':
        for actor in alist:
            A[actor].remove(m)

A = {actor : set(filter(lambda x : x in M, movies)) for (actor, movies) in A.iteritems() if len(movies) > 10}
M = {m : filter(lambda x : x in A, alist) for (m, alist) in M.iteritems() if len(alist) >= 10}
del M['']

## Number of Actors

In [4]:
len(A.keys())

57121

## Number of Movies

In [5]:
len(M.keys())

105216

In [6]:
# Create a map from vertices to numbers (to speed up the edge creation process)
vmap = {actor : i for (i, actor) in enumerate(A.keys())}

for (i, actor) in enumerate(A.keys()):
    vmap[i] = actor

## Step 2: Generate a graph from the actor data

In [7]:
import igraph

In [8]:
with open('elist.txt', 'w') as f:
    for a in timer(A.keys(), desc="Edgelist"):
        edges = set()
        for movie in A[a]:
            for b in M[movie]:
                edges.add(vmap[b])
        
        u = vmap[a]
        for v in sorted(edges):
            f.write("%d %d\n" % (u, v))




In [9]:
g = igraph.read("elist.txt", format="edgelist")

In [10]:
for (i,V) in enumerate(g.vs):
    V["name"] = vmap[i]

In [11]:
# Weights
for e in timer(g.es, total=g.ecount(), desc="Weights"):
    u = vmap[e.source]
    v = vmap[e.target]
    e["weight"] = float( len( A[u].intersection(A[v]) ) ) / len( A[u] )




# Part 3 : Page Rank

In [12]:
res = g.personalized_pagerank(weights=g.es["weight"])

In [13]:
# TOP 10 in PageRank
for i in np.argsort(res)[::-1][:10]:
    print g.vs[i]["name"], res[i], len(A[g.vs[i]["name"]])

Flowers, Bess 0.000472505463301 806
Harris, Sam (II) 0.000387865005965 597
Miller, Harold (I) 0.000340242839069 540
Phelps, Lee (I) 0.000324947231728 627
O'Connor, Frank (I) 0.000291386463035 601
Farnum, Franklyn 0.000286517084469 500
Steers, Larry 0.00027674253104 504
Sullivan, Charles (I) 0.00027489830587 496
Sayre, Jeffrey 0.000268062042728 427
Holmes, Stuart (I) 0.00025927339796 436


## Why don't popular actors show up?

Popularity is a far more abstract concept, and requires additional information to determine.
For example, an actor may be considered to be popular, when he/she plays the lead role in several films.

Bess Flowers is considered to be the most prolific actress in hollywood, because she was an "extra" in a very large number of films. Naturally her page rank was quite high, but her popularity didn't correspond to her high page rank value.

# Part 4 : Jaccard Indices

In [14]:
g.to_undirected()

In [15]:
pairs = [(e.source, e.target) for e in g.es]
g.es["weight"] = g.similarity_jaccard(pairs = pairs)

In [16]:
len(g.es["weight"]) == g.ecount()

True