In [1]:
import pandas as pd
import numpy as np
import regex as re
from collections import defaultdict
import heapq

### Import and clean up Kaggle data. Repeated Wiki pages are removed first; then every movie that shares both its title and release year with another entry is dropped. The dropped set includes movies whose formatting has been corrupted.

In [3]:
# Load the Kaggle plot table, keep only the first row for each Wikipedia page,
# then discard *every* movie whose (Title, Release Year) pair is not unique.
movie_plots = pd.read_csv("Data/wiki_movie_plots_deduped.csv.zip")
movie_plots = movie_plots.drop_duplicates(subset=['Wiki Page'])
movie_plots = movie_plots.drop_duplicates(subset=['Title', 'Release Year'], keep=False)

### Necessary functions for cleaning up genre.

In [4]:
# Lookup tables used by clean_genre.
# All added genres have no spaces (e.g., science-fiction, historical-fiction, coming-of-age,...).
#
# tags: substring -> main genre. A key is matched with `in` against the lower-cased
# genre phrase, so it fires even when it appears as a sub-word
# (e.g., "romantic comedy" gets "romance" & "comedy").
# NOTE(review): keys such as "hip hops" and "swashbucklers" look odd but are presumably
# what is left once clean_genre strips " movie" / " film" from the raw phrase
# ("hip hop movies", "swashbuckler films") — verify before "correcting" them.
# NOTE(review): because matching is by substring, the short key "art" also matches
# phrases such as "martial arts" — confirm that this is intended.
tags = { # these are main genres to look for even if they appear as a subword
        "romantic":"romance", "comedy":"comedy", "drama":"drama", 
        "horror":"horror", "thriller":"thriller", 
        "fantasy":"fantasy", "science fiction":"science-fiction", "sci-fi":"science-fiction", 
        "epic":"epic", "action":"action", "adventure":"adventure", 
        "crime":"crime", "mystery":"mystery",
        "western":"western", "historical":"historical-fiction", 
        "animated":"animation", "animation":"animation", 
        "family":"family", "coming-of-age":"coming-of-age", "teen":"teen",
        "black-and-white":"black-and-white", "silent":"silent",
        "documentary":"documentary", "biographical":"biographical", "political":"political",
        "musical":"musical", "opera":"opera", "music":"music",
        "tragedy":"tragedy",
        # now these tags are the main genre for some common sub-genres
        "space":"science-fiction", 
        "costume drama":"historical-fiction", "period piece":"historical-fiction",
        "propaganda":"political",
        "slasher":"horror", "zombie":"horror", "psycho-biddy":"thriller",
        "slapstick":"comedy",  # was misspelled "splapstick", which could never match
        "operetta":"opera", "rock":"music", "hip hops":"music", "concert":"music",
        "whodunit":"mystery", "detective":"mystery", 
        "anime":"animation", "nature":"documentary",
        "superhero":"action", "spy":"action", "swashbucklers":"adventure",
        "avant-garde":"experimental", "surrealism":"experimental", "art":"experimental"}
# to_rename: exact (lower-cased) genre phrase -> standardized spelling/phrasing.
to_rename = {"crime fiction":"crime", "clay animation":"animation", "historical documentaries":"documentary",
             "animated cartoon":"animation", "coming of age":"coming-of-age",
             "biopic":"biographical", "biography":"biographical"}
# Compound genres that expand to several plain genres (the compound name itself is not kept).
_COMPOUND_GENRES = {
    "comedy-drama": ("comedy", "drama"),
    "rockumentary": ("music", "rock", "documentary"),
    "tragicomedy": ("tragedy", "comedy"),
}


def clean_genre(arr):
    """ Returns a set of genres corresponding to the original (sub)genres in arr, but with more
    standardized names & with larger genre categories added if necessary.

    Each raw entry has its '"/m/<id>": ' prefix, any " film"-like fragment (a space, one
    character, then "ilm"), any whitespace + "movie" fragment and all double quotes removed,
    and at most one leading space stripped. It is then split on '/' and lower-cased. Each
    resulting genre contributes:
      * the parts of a compound genre (comedy-drama, rockumentary, tragicomedy);
      * only its standardized name, if it is a key of to_rename;
      * otherwise tags[substr] for every key of tags contained in it, plus the genre itself.
    Empty genres contribute nothing.

    PARAMETERS:
    arr: a list of genres, as originally formatted (and split into a list) by the CMU dataset

    RETURNS:
    a set of lower-case genre strings
    """
    ans = set()
    for s in arr:
        # remove ids and clean up formatting of each genre
        # (raw strings: '\s' in a plain literal is an invalid escape sequence)
        new_s = re.sub(r'"/m/.*": ', '', s)
        new_s = re.sub(r' .ilm|\smovie', '', new_s)
        new_s = new_s.replace('"', '')
        if new_s.startswith(' '):
            new_s = new_s[1:]

        # split phrase into list of coarser genres
        for genre in new_s.split('/'):
            genre = genre.lower()
            if genre in _COMPOUND_GENRES:
                ans.update(_COMPOUND_GENRES[genre])
            elif genre in to_rename:
                ans.add(to_rename[genre])
            else:
                # substring match on purpose: "romantic comedy" -> romance + comedy
                ans.update(main for substr, main in tags.items() if substr in genre)
                if genre != '':
                    ans.add(genre)
    return ans


def find_duplicates(x):
    """ Returns True iff movie_plots contains a movie with the same title and release year as x.

    PARAMETERS:
    x: a series which represents a "row" in some kind of movie DataFrame; it must provide
       a `Title` attribute and a 'Release Year' entry
    """
    title_matches = movie_plots.Title == x.Title
    if not title_matches.any():
        # no movie with this title at all, so the year cannot match either
        return False
    years_for_title = movie_plots.loc[title_matches, "Release Year"].values
    return bool(x['Release Year'] in years_for_title)

### Import and clean up CMU data. CMU movies whose title and release year are duplicated are dropped when that title and year also appear in the Kaggle data.

In [5]:
# Read the header-less CMU table and keep only the title, release date and genre columns.
cmu_data = pd.read_csv("Data/cmu_movie_boxoffice.tsv", sep="\t", header=None)
cmu_data = cmu_data.drop(columns=[0, 1, 4, 5, 6, 7])
cmu_data = cmu_data.rename(columns={2: "Title", 3: "Release Year", 8: "Genre"})

# Reduce the release date to its leading year (everything from the first '-' on is cut),
# drop incomplete rows, then store the year as an integer.
cmu_data["Release Year"] = cmu_data["Release Year"].str.replace(r'-.*', '', regex=True)
cmu_data = cmu_data.dropna()
cmu_data["Release Year"] = cmu_data["Release Year"].astype('int64')

# Strip the first and last character of the raw genre string, split it on commas
# and standardize the pieces into a set of genres.
cmu_data["Genre"] = cmu_data["Genre"].apply(lambda raw: clean_genre(raw[1:-1].split(',')))

# Among CMU rows sharing (Title, Release Year) with another CMU row, drop those that
# find_duplicates also finds in movie_plots.
cmu_dupl = cmu_data[cmu_data.duplicated(subset=['Title', 'Release Year'], keep=False)]
duplicates = cmu_dupl[cmu_dupl.apply(find_duplicates, axis=1)]
cmu_data = cmu_data.drop(index=duplicates.index)


In [6]:
# Tally, for every cleaned genre label, how many movies in cmu_data carry it.
imdb_genres = defaultdict(int)
for genre_set in cmu_data.Genre.values:
    for label in genre_set:
        imdb_genres[label] += 1

# Keep the n most frequent genres as [count, genre] pairs, sorted ascending by count
# (ties broken by genre name).
n = 50
minHeap = sorted(heapq.nlargest(n, ([count, label] for label, count in imdb_genres.items())))

# Names of the retained (most popular) genres, least frequent first.
pop_gens = [pair[1] for pair in minHeap]
def has_overlap(row):
    """ Returns True iff at least one genre listed in pop_gens is contained in row["Genre"].

    PARAMETERS:
    row: a row-like object whose "Genre" entry supports membership tests
    """
    return any(popular in row["Genre"] for popular in pop_gens)
# One boolean per movie, in row order: True when the movie has none of the popular genres.
# Kept as a plain list so it can be used as a mask with both .loc and .iloc.
filt = [not has_overlap(movie) for _, movie in cmu_data.iterrows()]
cmu_data.loc[filt]

Unnamed: 0,Title,Release Year,Genre
18,Die Fahne von Kriwoj Rog,1967,{}
32,Emilia Galotti,1958,{}
42,Vinayaka Geleyara Balaga,2011,{}
50,Behind The Player: John 5,2008,{}
93,Vixen!,1968,"{softcore porn, sexploitation}"
...,...,...,...
81581,"Ne daj se, Floki",2000,{}
81681,Aachariyangal,2012,{}
81693,Fiete im Netz,1958,{}
81706,Fierro a fondo,1952,{}


In [7]:
# Tally the genre labels of the movies selected by filt, i.e. those with no popular genre.
small_genres = defaultdict(int)
for label in (g for genre_set in cmu_data.loc[filt].Genre.values for g in genre_set):
    small_genres[label] += 1
small_genres

defaultdict(int,
            {'softcore porn': 6,
             'sexploitation': 13,
             'film noir': 28,
             'pornographic': 131,
             'filipino movies': 52,
             'dogme 95': 3,
             'gay pornography': 52,
             'satire': 6,
             'dance': 5,
             'blaxploitation': 4,
             'educational': 4,
             'religious': 9,
             'social issues': 3,
             'culture & society': 12,
             'revenge': 1,
             'pornography': 4,
             'adult': 54,
             'hardcore pornography': 6,
             'inventions & innovations': 1,
             'suspense': 4,
             'women in prisons': 1,
             'erotica': 24,
             'christmas': 5,
             'remake': 1,
             'pre-code': 6,
             'auto racing': 2,
             'stop motion': 4,
             'mockumentary': 8,
             'mondo': 1,
             'christian': 3,
             'reboot': 1,
             'tolly

In [8]:
# Display the retained [count, genre] pairs (sorted ascending by count).
minHeap

[[757, 'cult'],
 [781, 'political drama'],
 [864, 'history'],
 [889, 'martial arts'],
 [901, 'coming-of-age'],
 [916, 'teen'],
 [968, 'television'],
 [991, 'parody'],
 [1017, 'black comedy'],
 [1026, 'family drama'],
 [1138, 'sports'],
 [1146, 'lgbt'],
 [1209, "children's"],
 [1312, 'psychological thriller'],
 [1314, 'bollywood'],
 [1370, 'chinese movies'],
 [1526, 'film adaptation'],
 [1536, 'political'],
 [1694, 'experimental'],
 [1739, 'period piece'],
 [2083, 'japanese movies'],
 [2106, 'crime thriller'],
 [2161, 'biographical'],
 [2232, 'western'],
 [2533, 'war'],
 [2631, 'romantic comedy'],
 [2742, 'historical-fiction'],
 [2857, 'fantasy'],
 [2884, 'science fiction'],
 [2905, 'science-fiction'],
 [3153, 'mystery'],
 [3246, 'animation'],
 [3341, 'romantic drama'],
 [4106, 'musical'],
 [5038, 'documentary'],
 [5048, 'horror'],
 [5050, 'silent'],
 [5137, 'family'],
 [5201, 'music'],
 [6769, 'indie'],
 [7144, 'world cinema'],
 [7201, 'short'],
 [7773, 'crime'],
 [8445, 'adventure'],


In [9]:
# Inspect 20 randomly chosen movies selected by filt. No random_state is passed,
# so the rows shown differ from run to run.
cmu_data.iloc[filt].sample(20)

Unnamed: 0,Title,Release Year,Genre
48578,The Beloved,1966,{}
53633,Poojapushpam,1969,{}
13796,Charice: One for the Heart,2012,{}
49410,Suburban Secrets,2004,"{adult, pornographic, softcore porn}"
76094,Anveshanam,1972,{}
27215,Savaale Samali,1971,{}
78285,It's Time,2006,{}
65282,Shadow Over The Islands,1952,{}
13978,Slave of Dreams,1995,{christian}
24873,The Life of Nephi,1915,{}


In [10]:
# Display the full genre -> movie-count tally.
imdb_genres

defaultdict(int,
            {'supernatural': 635,
             'adventure': 8445,
             'space western': 13,
             'science fiction': 2884,
             'science-fiction': 2905,
             'western': 2232,
             'action': 9763,
             'horror': 5048,
             'thriller': 8907,
             'mystery': 3153,
             'crime drama': 552,
             'drama': 33148,
             'crime': 7773,
             'biographical': 2161,
             'psychological thriller': 1312,
             'erotic thriller': 207,
             'comedy': 20268,
             'short': 7201,
             'indie': 6769,
             'black-and-white': 8808,
             'silent': 5050,
             'family': 5137,
             'fantasy': 2857,
             'world cinema': 7144,
             'musical': 4106,
             'music': 5201,
             'japanese movies': 2083,
             'romantic comedy': 2631,
             'romantic drama': 3341,
             'romance': 10599,
  

### Merge Kaggle and CMU data.

In [11]:
def helper(x):
    """ Returns x if it is already a set of genres, otherwise a fresh empty set.

    Movies without a CMU match get NaN from the left merge (earlier code used 0 as the
    placeholder); either way the result is a new set() per call, so rows never share
    one mutable set.

    PARAMETERS:
    x: a cell of the merged Genre column
    """
    return x if isinstance(x, set) else set()


# Left merge: every Kaggle movie is kept; CMU genres are attached where (Title, Release Year)
# matches. Both tables have a Genre column, so pandas suffixes them: Genre_x is the Kaggle
# genre text, Genre_y the CMU genre set, which becomes the working "Genre" column.
data = movie_plots.merge(cmu_data, how='left', on=['Title', 'Release Year'])
data = data.rename(columns={'Genre_y': 'Genre'})
# Assign the column back explicitly. The previous `data.Genre.fillna(0, inplace=True)` is a
# chained in-place call, which no longer updates `data` under pandas Copy-on-Write.
data["Genre"] = data["Genre"].apply(helper)

In [12]:
# Inspect 20 random rows of the merged table (no random_state, so rows differ per run).
data.sample(20)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre_x,Wiki Page,Plot,Genre
10070,1983,Revenge of the Ninja,American,Sam Firstenberg,"Sho Kosugi, Arthur Roberts",action,https://en.wikipedia.org/wiki/Revenge_of_the_N...,"In Japan, the home of Cho Osaki is attacked by...","{adventure, drama, action thrillers, indie, ex..."
16948,2016,Race,American,Stephen Hopkins,Stephan James\r\nJason Sudeikis\r\nJeremy Irons,sports drama,https://en.wikipedia.org/wiki/Race_(2016_film),"Jesse Owens, a promising black American runner...",{}
21612,1995,Skin Deep,Canadian,Midi Onodera,"Melanie Nicholls-King, Keram Malicki-Sánchez",drama co-produced with the national film board,https://en.wikipedia.org/wiki/Skin_Deep_(1995_...,Skin Deep tells the story of an award-winning ...,"{drama, lgbt}"
22441,2016,Mr. High Heels,Chinese,Lu Ke,"Du Jiang, Fiona Sit, Yu Xintian, Chen Xuedong,...",comedy / romance,https://en.wikipedia.org/wiki/Mr._High_Heels,"Hang Yuan, a man who's been in love with a fri...",{}
19744,1967,The Viking Queen,British,Don Chaffey,"Don Murray, Carita",historical,https://en.wikipedia.org/wiki/The_Viking_Queen,"According to her father's wishes, Queen Salina...",{}
7728,1963,Pent-House Mouse,American,Chuck Jones,,animated,https://en.wikipedia.org/wiki/Pent-House_Mouse,"Tom is relaxing in a penthouse, while Jerry is...",{}
18815,1950,Tony Draws a Horse,British,John Paddy Carstairs,"Cecil Parker, Anne Crawford",comedy,https://en.wikipedia.org/wiki/Tony_Draws_a_Horse,When their eight-year-old son Tony (Anthony La...,"{comedy, drama}"
20005,1973,The Hireling,British,Alan Bridges,"Robert Shaw, Sarah Miles",drama,https://en.wikipedia.org/wiki/The_Hireling,"Set in and around Bath, Somerset, immediately ...","{romance, drama, romantic drama}"
12489,1995,The Stars Fell on Henrietta,American,James Keach,"Robert Duvall, Aidan Quinn, Brian Dennehy",drama,https://en.wikipedia.org/wiki/The_Stars_Fell_o...,The setting is early America during the oil bo...,"{americana, drama}"
11066,1989,Back to the Future Part II,American,Robert Zemeckis,"Michael J. Fox, Christopher Lloyd, Lea Thompson","science fiction, comedy",https://en.wikipedia.org/wiki/Back_to_the_Futu...,"On October 26, 1985, Dr. Emmett Brown arrives ...","{time travel, family, comedy, adventure, humou..."


### Incorporate genres from Kaggle data

In [13]:
# Go through every movie and add to its Genre set each known genre label (key of
# imdb_genres) that occurs as a substring of the lower-cased Kaggle genre text (Genre_x).
# Movies whose Kaggle genre is 'unknown' are left untouched.
# The columns are iterated in lockstep instead of using data.loc[i, ...], so this does not
# depend on data having a default 0..n-1 index; the sets are updated in place.
for wiki_genres, genres in zip(data["Genre_x"], data["Genre"]):
    if wiki_genres != 'unknown':
        wiki_lower = wiki_genres.lower()  # lower-case once per movie, not once per genre
        genres.update(val for val in imdb_genres if val in wiki_lower)


In [14]:
# The Kaggle genre text has been folded into Genre; remove the now-redundant column.
data = data.drop(columns=['Genre_x'])

In [15]:
# Persist the merged table as a zip-compressed CSV.
# NOTE(review): Genre holds Python sets, which are presumably written as their string
# representation — confirm that whatever reads this file parses them back.
data.to_csv('Data/wiki_plots_with_genres.csv.zip', compression="zip")

### Helper functions.

In [22]:
def search_by_pattern(arr,pattern):
    """ Given an iterable arr of strings, return (in order) the entries in which pattern is found.

    The pattern is handed to re.search, so it is interpreted as a regular expression and
    may match anywhere inside an entry.

    PARAMETERS:
    arr: a list (or other iterable container) of strings
    pattern: a string, used as a regular expression
    """
    return [entry for entry in arr if re.search(pattern, entry)]
#Example: search_by_pattern(imdb_genres,"anim") 

In [140]:
# Example: list every known genre label that matches "short".
search_by_pattern(imdb_genres,"short")

['short']

In [14]:
# Scratch check: type of the Genre entry in the second row of cmu_dupl.
type(cmu_dupl.iloc[1].loc["Genre"])

set

In [7]:
# Scratch check: run find_duplicates on the second row of cmu_dupl.
find_duplicates(cmu_dupl.iloc[1])

False

In [9]:
# Scratch check: type of a single row taken from cmu_dupl with iloc.
type(cmu_dupl.iloc[1])

pandas.core.series.Series

In [None]:
# TODO: should genre be used as a predictor (or an associated feature) in the model?