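Similarly, remove all movies with fewer than 5 actors/actresses listed, then
construct a movie network from the sets of actors/actresses: two movies are
linked when they share at least one actor/actress, and the edge weight is the
Jaccard index of the two actor sets, $J(X, Y) = |X \cap Y| / |X \cup Y|$.
Because the Jaccard index is symmetric, this time we have an undirected
network instead.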

In [1]:
from tqdm import tqdm_notebook as timer
import re
import string

In [2]:
# A: actor name -> set of movie titles the actor appears in
# M: movie title -> actors appearing in it (a list while loading, turned into a set afterwards)
A, M = dict(), dict()

def clean_string(s):
    """Normalise a raw title string and return the cleaned copy.

    The substitutions run in this order:
      1. remove parenthesised text that contains no digits, so a year such
         as "(2005)" is kept while "(voice)" is dropped;
      2. remove everything from the first "{" to the last "}" on a line;
      3. collapse every run of whitespace into a single space.
    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r'\([^0-9]+\)', ''),
        (r'\{.*\}', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)

    return s.strip()

with open("movie_map.txt", "r") as f:
    for line in timer(f, total = 30396, desc="movie_map"):
        # Record layout: actor name first, then one movie title per "\t\t"-separated field
        fields = line.split("\t\t")
        actor_name = fields[0]
        movies = set(clean_string(title) for title in fields[1:])

        # A: merge with whatever was already recorded for this actor
        A.setdefault(actor_name, set()).update(movies)

        # M: register the actor under each of the cleaned titles
        for movie in movies:
            M.setdefault(movie, []).append(actor_name)




In [3]:
# Turn every cast list into a set so the later cells can intersect casts directly
M = dict((movie, set(cast)) for (movie, cast) in M.items())

# Create a movie network

In [4]:
# Write one "m1<TAB>m2<TAB>weight" line for every ordered pair of movies that
# share at least one actor. The weight is the Jaccard index of the two casts:
#     |intersection| / |union|,  with  |union| = |X| + |Y| - |intersection|
# Every movie is also paired with itself (weight 1.0), because each of its
# actors lists it in A.
with open("movie_network.txt", "w") as f:
    for (m1, actors) in timer(M.items(), total=len(M)):
        # Collect the distinct neighbours first so that each weight is
        # computed once per pair rather than once per shared actor.
        neighbours = set()
        for a in actors:
            neighbours.update(A[a])

        for m2 in neighbours:
            common = len(actors.intersection(M[m2]))

            # The union size subtracts the overlap; adding it (as before)
            # yields 1/3 instead of 1 for two identical casts.
            w = float(common) / (len(actors) + len(M[m2]) - common)

            f.write("%s\t%s\t%f\n" % (m1, m2, w))




In [5]:
%%bash
# number of lines in the edge list written by the movie-network cell.
# Each line is one (m1, m2, weight) record, so this counts directed records:
# a pair of movies is listed once from each side, and every movie is also
# paired with itself. It is therefore larger than the number of undirected edges.
wc -l movie_network.txt

 55035189 movie_network.txt


# Process the movie genres

In [6]:
# G: movie title -> genre string
G = dict()

# Characters kept when sanitising raw genre lines (string.printable includes "\t")
printable = set(string.printable)

def clean_string(s):
    """Return `s` with annotations removed and whitespace normalised.

    NOTE: this re-defines (and shadows) the identical `clean_string` from the
    movie_map cell; the two must stay in sync, otherwise the titles cleaned
    here stop matching the keys of M.

    Steps: drop parenthesised text that contains no digits, drop everything
    between the first "{" and the last "}", squeeze whitespace runs into one
    space, then strip both ends.
    """
    without_parens = re.sub(r'\([^0-9]+\)', '', s)
    without_braces = re.sub(r'\{.*\}', '', without_parens)
    single_spaced = re.sub(r'\s+', ' ', without_braces)

    return single_spaced.strip()

with open("data/movie_genre.txt") as f:
    for line in timer(f, total=1010991, desc="Movie Genres"):
        # Python 2 idiom: decode the raw bytes as latin-1, keep only the
        # characters found in `printable`, then re-encode as plain ASCII.
        # Tabs are printable, so the "\t\t" separator survives the filter.
        line = filter(lambda x : x in printable, line.decode('latin1')).encode('ascii')

        splits = line.split('\t\t')
        if len(splits) < 2:
            # No "\t\t" separator (blank or malformed line): there is no genre
            # field, and indexing splits[1] would abort the whole load.
            continue

        movie = clean_string(splits[0])
        genre = clean_string(splits[1])

        # Only movies present in the network are kept; if a movie occurs on
        # several lines, the last genre read overwrites the earlier ones.
        if movie in M:
            G[movie] = genre




In [7]:
# Movies of the network that received no genre get an explicit placeholder;
# entries that already have a genre are left untouched.
for m in M:
    G.setdefault(m, "Unknown")

In [8]:
# Dump the genre table as "movie<TAB>genre", one movie per line
with open("genres.txt", "w") as f:
    f.writelines("%s\t%s\n" % entry for entry in G.items())

In [9]:
# rmap: 0-based vertex id -> movie title, numbered in the iteration order of M
rmap = dict(enumerate(M.keys()))

# vmap: movie title -> 0-based vertex id (the inverse of rmap)
vmap = dict((movie, i) for (i, movie) in rmap.items())

# Write graph in Metis format

In [17]:
# METIS graph-file rules this cell has to satisfy:
#   * header "n m fmt": m counts every undirected edge ONCE; fmt 001 means
#     that edge weights are present
#   * line i (1-based) holds the adjacency list of vertex i as "id weight" pairs
#   * vertex ids start at 1, a vertex must not list itself, and weights must
#     be positive integers
# The Jaccard index lies in (0, 1] for movies sharing an actor, so it is
# scaled to an integer in [1, WEIGHT_SCALE].
WEIGHT_SCALE = 1000


def metis_neighbours(m1):
    """Return the movies sharing at least one actor with `m1`, without `m1` itself."""
    neighbours = set()
    for a in M[m1]:
        neighbours.update(A[a])
    neighbours.discard(m1)  # no self-loops
    return neighbours


def metis_weight(m1, m2):
    """Return the Jaccard index of the casts of `m1` and `m2` as a positive integer."""
    common = len(M[m1].intersection(M[m2]))
    # |union| = |X| + |Y| - |intersection|
    jaccard = float(common) / (len(M[m1]) + len(M[m2]) - common)
    return max(1, int(round(jaccard * WEIGHT_SCALE)))


n_vertices = len(M)

# First pass: count the edges for the header. Every undirected edge shows up
# in the adjacency list of both of its end points, hence the division by two.
n_adjacency_entries = 0
for i in timer(range(n_vertices), desc="Counting edges"):
    n_adjacency_entries += len(metis_neighbours(rmap[i]))
n_edges = n_adjacency_entries // 2

# Second pass: write the rows in vertex-id order (via rmap) so that row i + 1
# always describes the movie whose id is vmap[movie] == i.
with open("metis/graph.txt", "w") as f:
    f.write("%d %d 001\n" % (n_vertices, n_edges))

    for i in timer(range(n_vertices), desc="Writing METIS graph"):
        m1 = rmap[i]
        row = ["%d %d" % (vmap[m2] + 1, metis_weight(m1, m2))
               for m2 in metis_neighbours(m1)]
        f.write(' '.join(row) + "\n")


