# IMDB PROJECT: DATA PREPROCESSING

### Execution style

In [1]:
startFromCleanData = True #Start with the raw data imported or the cleaned files
fastExecution = False     #Use the stored graph, position and DF of rebuild them
savingFigures = True      #Whether to save or not the figures produced

### Libraries

In [2]:
# Import Libraries
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import fa2
import math
import community
import matplotlib.cm as cm
from __future__ import division
import matplotlib.image as mpimg
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import io
from collections import Counter
from wordcloud import WordCloud
from scipy.special import zeta
import pickle
# Rendering Parameters
title_font = {'family': 'sans-serif',
        'color':  '#000000',
        'weight': 'normal',
        'size': 16,
        }
#COLORS
mBlue = "#55638A"     # For actor
fRed = "#9E1030"    # For actress

## Object Storage

In [3]:
#PICKLE
def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
        

## Initialise Actors and Movie Dictionnaries

In [4]:
###################################
# Initialise a movie dictionnary
###################################

# Function. to convert movie or actor id to sting key
def idToString(iD, base): # base = "tt" for movies or "nm" for actors
    if iD<10:
        return base+"000000"+str(iD)
    if iD<100:
        return base+"00000"+str(iD)
    if iD<1000:
        return base+"0000"+str(iD)
    if iD<10000:
        return base+"000"+str(iD)
    if iD<100000:
        return base+"00"+str(iD)
    if iD<1000000:
        return base+"0"+str(iD)
    else:
        return base+str(iD)
    
# Create movie dictionnary
movieDict = {}
lastMovie = 9999999 #last movie ID
if not fastExecution:
    for i in range(lastMovie):
        movieDict[idToString(i+1,"tt")] = False
    print "Movie Dictionnary initialised"
else:
    print "Fast execution mode, movie dictionnary will be initialised later"
    

Movie Dictionnary initialised


In [5]:
###################################
# Get the movies to keep
###################################

# List of the reviews documents
listReviewsDocuments = ["train/urls_neg.txt","test/urls_neg.txt","train/urls_pos.txt","test/urls_pos.txt","train/urls_unsup.txt"]

# Fill in the dictionnary
for document in listReviewsDocuments:
    files = io.open("aclImdb/"+document, mode="r", encoding="utf-8")
    for row in files:
        w = re.findall(r'http://www.imdb.com/title/(\w*)/usercomments',row)
        movieDict[w[0]] = True


In [6]:
###################################
# Create an Actor Dict
###################################
actorDict = {}
lastActor = 29999999 #last movie ID
for i in range(lastActor):
    actorDict[idToString(i+1,"nm")] = False
print "Actor Dictionnary initialised"

Actor Dictionnary initialised


## Clean Data 
Start from the raw databases we download

In [7]:
###################################
# key to movie name file
###################################

if not startFromCleanData:
    path = "DATA/title.basics.txt"
    cleanPath = "DATA/title.basics.clean.txt"
    files = io.open(path, mode="r", encoding="utf-8")
    cleanfile = io.open(cleanPath, mode="w", encoding="utf-8")
    b=False # skip the first line
    count =0
    for row in files:
        if b:
            split=row.split("\t")
            key = split[0]
            if movieDict[key]:
                if (split[1] in ['movie', 'tvMovie']):
                    cleanfile.write(row)
                    count +=1
                else:
                    movieDict[key]=False
        else:
            b=True
    files.close()
    cleanfile.close()


    print "There are "+str(count)+" movies considered"
    print "DATA/title.basics.txt cleaned"

In [8]:
##########################################################
# film actors links file : Clean + get actor dictionnary
##########################################################

if not startFromCleanData:
    path = "DATA/title.principals.txt"
    cleanPath = "DATA/title.principals.clean.txt"
    files = io.open(path, mode="r", encoding="utf-8")
    cleanfile = io.open(cleanPath, mode="w", encoding="utf-8")
    roleCheckList = ["actor", "actress", "self"] #check if it is an actor
    nLinks = 0
    i=False # skip first line
    for row in files:
        if i:
            split = row.split("\t") 
            key = split[0]
            if movieDict[key]:
                if (split[3] in roleCheckList or split[4] in roleCheckList or split[5] in roleCheckList):
                    cleanfile.write(row)
                    actorDict[split[2]]=True
                    nLinks  +=1

        else:
            i=True

    files.close()
    cleanfile.close()

    ##REMOVE ERRORS
    actorDict["nm0547707"]=False
    actorDict['nm0547707']=False
    actorDict['nm0809728']=False
    actorDict['nm2442859']=False
    actorDict['nm1996613']=False
    actorDict['nm0600636']=False
    actorDict['nm1824417']=False
    actorDict['nm2440192']=False
    actorDict['nm1754167']=False

    print "There are "+str(nLinks-9)+" actors considered"
    print "DATA/title.principals.txt cleaned"

In [9]:
###################################
# key to actor name file
###################################

if not startFromCleanData:
    path = "DATA/name.basics.txt"
    cleanPath = "DATA/name.basics.clean.txt"
    files = io.open(path, mode="r", encoding="utf-8")
    cleanfile = io.open(cleanPath, mode="w", encoding="utf-8")
    count = 0
    i=False
    for row in files:
        if i:
            split = row.split("\t")
            key = split[0]
            if actorDict[key]:
                cleanfile.write(row)
        else:
            i=True

    files.close()
    cleanfile.close()
    print "DATA/name.basics.txt cleaned"

## Clean Data Pre-Processing

In [10]:
############################################
# Preprocess Movie Dict and get movie years
############################################

movieAgeDict = {}

path = "DATA/title.basics.clean.txt"
files = io.open(path, mode="r", encoding="utf-8")
count =0
for row in files:
    split=row.split("\t")
    key = split[0]
    if movieDict[key]:
        if (split[1] in ['movie', 'tvMovie']) and not (split[5] == "\\N"):
            movieAgeDict[key] = int(split[5])
            count +=1
files.close()

#Clean Movie dict
for i in range(lastMovie):
    movieDict[idToString(i+1,"tt")] = False

for key in movieAgeDict.keys():
    movieDict[key]=True


print "There are "+str(count)+" movies considered"
print "Movie Dictionnary Preprocessed and Movie Age Dictionnary Built"
    

There are 10735 movies considered
Movie Dictionnary Preprocessed and Movie Age Dictionnary Built


In [11]:
##########################################################
# film actors links file : Clean + get actor dictionnary
##########################################################

path = "DATA/title.principals.clean.txt"
files = io.open(path, mode="r", encoding="utf-8")
roleCheckList = ["actor", "actress", "self"] #check if it is an actor
nLinks = 0
for row in files:
    split = row.split("\t") 
    key = split[0]
    if movieDict[key]:
        if (split[3] in roleCheckList or split[4] in roleCheckList or split[5] in roleCheckList):
            actorDict[split[2]]=True
            nLinks  +=1

files.close()

###REMOVE ERRORS
actorDict["nm0547707"]=False
actorDict['nm0547707']=False
actorDict['nm0809728']=False
actorDict['nm2442859']=False
actorDict['nm1996613']=False
actorDict['nm0600636']=False
actorDict['nm1824417']=False
actorDict['nm2440192']=False
actorDict['nm1754167']=False

print "There are "+str(nLinks-9)+" actors considered"

print "Actor Dictionnary Preprocessed"
    

There are 43553 actors considered
Actor Dictionnary Preprocessed


In [12]:
###################################
# Create a ratings dict
###################################
ratingDict = {}
path = "DATA/ratings.txt"
files = io.open(path, mode="r", encoding="utf-8")
count = 0
i=False # skip first line
for row in files:
    if i:
        key = row[:9]
        if movieDict[key]:
            split = row.split("\t") 
            ratingDict[key] = float(split[1])
    else:
        i=True

files.close()

In [13]:
###################################
# Create a movie name dict
###################################
movieNameDict = {}
moviesList = []
path = "DATA/title.akas.clean.txt"
files = io.open(path, mode="r", encoding="utf-8")
count = 0
for row in files:
    split = row.split("\t") 
    if movieDict[split[0]] and not (split[0] in movieNameDict) and (split[0] in ratingDict) and "original" in row   :
        movieNameDict[split[0]] = split[2]
        moviesList.append(split[0])

files.close()

In [14]:
###################################
# Create an actor name dict
###################################
actorNameDict = {}
actorGenderDict = {}
actorsList = []
path = "DATA/name.basics.clean.txt"
files = io.open(path, mode="r", encoding="utf-8")
count = 0
for row in files:
    split = row.split("\t") 
    if actorDict[split[0]] and not (split[0] in actorNameDict):
        actorNameDict[split[0]] = split[1]
        if "actor" in split[4]:
            actorGenderDict[split[0]] = "M"
        else:
            actorGenderDict[split[0]] = "F"
        actorsList.append(split[0])
files.close()

In [20]:
###################################
# Build a movie data frame
###################################
if not fastExecution:
    moviesData = {"iD" : movieNameDict.keys(), "Title": pd.Series(np.zeros(len(moviesList))), "Rating":pd.Series(np.zeros(len(moviesList))), "Year":pd.Series(np.zeros(len(moviesList)))}
    moviesDF = pd.DataFrame(moviesData)
    for i in moviesDF.index:
        iD =moviesDF.loc[i].at["iD"]
        moviesDF.loc[i, "Title"]= movieNameDict[iD]
        moviesDF.loc[i, "Rating"] = ratingDict[iD]
        moviesDF.loc[i, "Year"]= movieAgeDict[iD]
    if savingFigures:
        moviesDF.to_pickle("obj/moviesDF.pkl")
else:
    moviesDF = pd.read_pickle("obj/moviesDF.pkl")
moviesDF.sort_values("Rating", ascending=False).head(10)

Unnamed: 0,Rating,Title,Year,iD
8686,9.1,The Regard of Flight,1983.0,tt0134050
7737,9.0,Notre-Dame de Paris,1999.0,tt0285800
8377,8.9,Ko to tamo peva,1980.0,tt0076276
4887,8.9,12 Angry Men,1957.0,tt0050083
9860,8.9,Schindler's List,1993.0,tt0108052
1157,8.9,The Lord of the Rings: The Return of the King,2003.0,tt0167260
8305,8.8,Saban Oglu Saban,1977.0,tt0253614
1389,8.8,Sobache serdtse,1988.0,tt0096126
2151,8.8,The Art of Amália,2000.0,tt0204839
9079,8.8,The Lord of the Rings: The Fellowship of the Ring,2001.0,tt0120737


In [22]:
###################################
# Build an actor data frame
###################################
if not fastExecution:
    actorsData = {"iD": actorNameDict.keys(), "Name": pd.Series(np.zeros(len(actorsList))),"Gender": pd.Series(np.zeros(len(actorsList)))}
    actorsDF = pd.DataFrame(actorsData)
    for i in actorsDF.index:
        iD = actorsDF.loc[i].at["iD"]
        actorsDF.loc[i, "Name"]= actorNameDict[iD]
        actorsDF.loc[i, "Gender"] = actorGenderDict[iD]
    if savingFigures:
        actorsDF.to_pickle("obj/actorsDF.pkl")
else:
    actorsDF = pd.read_pickle("obj/actorsDF.pkl")
actorsDF.head(10)

Unnamed: 0,Gender,Name,iD
0,F,Bobbie Bresee,nm0107679
1,F,Malgorzata Rozniatowska,nm0747647
2,M,Ahmet Ugurlu,nm0880128
3,F,Laura Nativo,nm1137466
4,F,Jordy Benattar,nm0070237
5,M,Özkan Ugur,nm0880126
6,M,John Foss,nm1458561
7,M,Panayiotis Hartomatzidis,nm0367186
8,M,Simon Abkarian,nm0008787
9,F,Victoria Snow,nm0795281


In [17]:
###################################
# Create a links list
###################################
path = "DATA/title.principals.clean.txt"
files = io.open(path, mode="r", encoding="utf-8")
links = np.empty((nLinks,2),dtype=object)
count = 0
for row in files:
    split = row.split("\t")
    if actorDict[split[2]]:
        links[count,0]= split[0]
        links[count,1]= split[2]
        count+=1

files.close()

In [23]:
###################################
# Create an actor links list
###################################
actorsLinks = []
files = io.open("obj/actorsLinksList.txt", mode="w", encoding="utf-8")
for i in range(count-1):
    j = i+1
    while (j<count) and (links[i,0]==links[j,0]):
        actorsLinks.append([links[i,1],links[j,1],links[i,0]]) #[actor1, actor2, movie]
        files.write(str(links[i,1])+"\t"+str(links[j,1])+"\t"+links[i,0]+"\r\n")
        j+=1
files.close()

## LOAD & CLEAN DATA FUNCTION 

In [25]:
def cleanLoadData():
    mDF = pd.read_pickle("obj/moviesDF.pkl")
    aDF = pd.read_pickle("obj/actorsDF.pkl")
    aLL = []
    files = io.open("obj/actorsLinksList.txt", mode="r", encoding="utf-8")
    for row in files:
        split = row.split("\t")
        aLL.append(split)
    files.close()
    return movieDict,mDF,aDF,aLL
    