In [1]:
import string
from tqdm import tqdm as timer
import re
import copy
import pandas as pd

# Filter Data

## Step 1 : Iterate through the input files, discarding lines that list 10 or fewer movies. Generate an actor-to-movies mapping, and a reverse movie-to-actors mapping

The lines are cleaned by:
- Removing unprintable characters
- Removing text in parentheses `()` that contains no digits (so years such as `(2015)` are kept), and anything in curly braces `{}`

In [2]:
# Actor name -> set of cleaned movie titles
A = dict()
# Movie title -> list of actor names appearing in it
M = dict()

# Line count of each input file; used only as the progress-bar total
lc = {
    "data/actress_movies.txt": 1182813,
    "data/actor_movies.txt": 2167653,
}

# Characters allowed to survive the ASCII clean-up of each input line
printable = set(string.printable)

def clean_string(s):
    """Normalise one raw movie entry into a compact title string.

    Drops parenthesised groups that contain no digits, drops anything
    wrapped in curly braces, squeezes every whitespace run to a single
    space and trims both ends of the result.
    """
    # Applied in this order: (*) groups without digits, {*} groups, whitespace runs
    substitutions = (
        (r'\([^0-9]+\)', ''),
        (r'\{.*\}', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)

    return s.strip()

for fname in ["data/actor_movies.txt", "data/actress_movies.txt"]:
    with open(fname, "r") as f:
        progress = timer(f, total = lc[fname], desc=fname.split("/")[1])
        for line in progress:
            # Skip lines that contain at most 10 "\t\t" separators
            if not line.count('\t\t') > 10:
                continue

            # Keep only printable ASCII characters
            line = filter(lambda x : x in printable, line.decode('latin1')).encode('ascii')

            # First field is the actor, the remaining fields are movie entries
            fields = line.split("\t\t")
            actor_name, raw_movies = fields[0], fields[1:]
            movies = set(clean_string(entry) for entry in raw_movies)

            # Merge with any movies already recorded under the same actor name
            A[actor_name] = A[actor_name].union(movies) if actor_name in A else movies

            # Maintain the reverse movie -> actors index
            for movie in movies:
                M.setdefault(movie, []).append(actor_name)

actor_movies.txt: 100%|██████████| 2167653/2167653 [00:38<00:00, 55789.30it/s]
actress_movies.txt: 100%|██████████| 1182813/1182813 [00:15<00:00, 78299.95it/s]


In [3]:
# Number of distinct movie titles currently in M
len(M)

458005

Define a relaxation to be one iteration of removing:
- Movies with 10 or fewer actors (update actors accordingly)
- Actors with 10 or fewer movies (update movies accordingly)

Clearly this is a cyclic process. We notice that the actors and their movies stabilize in about 35 iterations!

In [5]:
# Start throwing movies away
def relax(A, M, threshold=10, keep=("Mission: Impossible - Rogue Nation (2015)",)):
    """Perform one relaxation pass over the actor <-> movie mappings.

    A pass keeps only movies with more than ``threshold`` actors, then only
    actors with more than ``threshold`` of those movies, and finally prunes
    every movie's cast down to the surviving actors and drops movies whose
    cast no longer exceeds ``threshold``.  The input dicts are not modified.

    Parameters
    ----------
    A : dict
        Maps actor name -> iterable of movie titles.
    M : dict
        Maps movie title -> iterable of actor names.
    threshold : int, optional
        An entry needs strictly more than this many members to survive.
    keep : iterable of str, optional
        Movie titles that stay in the returned ``M`` regardless of cast size,
        provided they are present in the incoming ``M``.  Their cast is still
        restricted to surviving actors.  These titles get no special
        treatment in ``A``: an actor's movie set only contains them if they
        pass the ``threshold`` filter like any other movie.

    Returns
    -------
    (dict, dict)
        The new ``A`` (actor -> set of titles) and the new ``M`` (title ->
        set of actors; titles listed in ``keep`` map to a list instead).

    Side effects
    ------------
    Prints the number of surviving actors and movies, separated by a space.
    """
    # Remember the cast of protected movies before anything is filtered.
    # Titles missing from M are skipped instead of raising KeyError.
    protected = {movie: M[movie] for movie in keep if movie in M}

    pop_movies = set(movie for movie in M if len(M[movie]) > threshold)
    A = {actor: set(movies).intersection(pop_movies) for (actor, movies) in A.items()}
    A = {actor: movies for (actor, movies) in A.items() if len(movies) > threshold}

    good_actors = set(A)
    M = {movie: set(actors).intersection(good_actors) for (movie, actors) in M.items()}
    M = {movie: actors for (movie, actors) in M.items() if len(actors) > threshold}

    # Re-insert protected movies, limited to actors that survived this pass
    for movie, cast in protected.items():
        M[movie] = [actor for actor in cast if actor in A]

    # A single pre-formatted argument prints identically on Python 2 and 3
    print("%d %d" % (len(A), len(M)))

    return A, M

In [6]:
# Relax until a pass removes nothing.  Each pass only filters A and M, so
# their sizes can never grow and the loop terminates; this replaces a
# hand-tuned pass count that would under-relax if the input data changed.
while True:
    sizes_before = (len(A), len(M))
    A, M = relax(A, M)
    if (len(A), len(M)) == sizes_before:
        break

49783 68161
40490 60799
36672 57451
34844 55684
33790 54588
33079 53860
32580 53304
32202 52870
31891 52506
31640 52232
31429 51980
31257 51746
31112 51580
30995 51440
30882 51312
30775 51228
30714 51163
30660 51101
30600 51048
30564 51011
30533 50982
30508 50955
30488 50932
30472 50918
30459 50903
30447 50891
30433 50873
30422 50865
30416 50857
30409 50847
30400 50837
30398 50834
30396 50834
30396 50834
30396 50834


In [6]:
# Count of actors currently held in A
len(A)

30396

In [7]:
# Count of movies currently held in M
len(M)

50834

# Write to file for next phase

In [8]:
# One line per actor: the name, then that actor's movie titles joined by "\t\t"
with open('movie_map.txt', 'w') as out:
    out.writelines(
        "%s \t\t %s \n" % (name, '\t\t'.join(titles))
        for (name, titles) in A.iteritems()
    )

# Clean the ratings

In [10]:
# A separator longer than one character is not supported by pandas' C parser.
# Naming the python engine explicitly gives the same parse without the
# ParserWarning that the silent fallback emits.
table = pd.read_csv("data/movie_rating.txt", sep="\t\t", header=None, engine="python")

  if __name__ == '__main__':


In [11]:
# Characters allowed to survive the ASCII clean-up of each title
printable = {ch for ch in string.printable}

def clean_string(s):
    """Clean one raw title: ASCII-only, bracket groups removed, spaces squeezed.

    The input is decoded as latin-1, every character that is not in
    ``printable`` is discarded and the remainder is re-encoded as ASCII.
    Afterwards parenthesised groups without digits and anything in curly
    braces are removed, whitespace runs collapse to a single space and both
    ends are trimmed.
    """
    decoded = s.decode('latin1')
    s = ''.join(ch for ch in decoded if ch in printable).encode('ascii')

    # Applied in this order: (*) groups without digits, {*} groups, whitespace runs
    for pattern, replacement in ((r'\([^0-9]+\)', ''), (r'\{.*\}', ''), (r'\s+', ' ')):
        s = re.sub(pattern, replacement, s)

    return s.strip()

# Replace every title in the first column with its cleaned form
table.iloc[:,0] = [clean_string(title) for title in table.iloc[:,0]]

In [12]:
# Keep only the rows whose (cleaned) title is a key of M
title_is_known = table[0].map(lambda title : title in M)
table = table.loc[title_is_known]

In [15]:
# Tab-separated output, written without a header row or index column
table.to_csv(
    "ratings.txt",
    sep="\t",
    index=False,
    header=False,
)