In [1]:
import pandas as pd
import numpy as np
import regex as re
from collections import defaultdict
import heapq

### Import and clean up Kaggle data. All movies that share a title and release year with another entry are dropped. The dropped set includes movies whose formatting was corrupted (apparently by bad scraping).

In [2]:
# Load the Kaggle Wikipedia-plots data and de-duplicate it in two passes:
#   1. rows sharing a 'Wiki Page' URL  -> keep only the first occurrence
#   2. rows sharing ('Title', 'Release Year') -> drop every copy
#      (these seem to be due to bad scraping)
movie_plots = (
    pd.read_csv("Data/wiki_movie_plots_deduped.csv.zip")
    .drop_duplicates(subset=['Wiki Page'])
    .drop_duplicates(subset=['Title', 'Release Year'], keep=False)
)

In [3]:
# display the de-duplicated Kaggle frame
movie_plots

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...,...
34880,2014,Sivas,Turkish,Director: Kaan Müjdeci,Director: Kaan Müjdeci\r\nCast: Dogan Izci,unknown,https://en.wikipedia.org/wiki/Sivas_(film),The film follows an eleven-year-old boy named ...
34882,2017,Çalgı Çengi İkimiz,Turkish,Selçuk Aydemir,"Ahmet Kural, Murat Cemcir",comedy,https://en.wikipedia.org/wiki/%C3%87alg%C4%B1_...,"Two musicians, Salih and Gürkan, described the..."
34883,2017,Olanlar Oldu,Turkish,Hakan Algül,"Ata Demirer, Tuvana Türkay, Ülkü Duru",comedy,https://en.wikipedia.org/wiki/Olanlar_Oldu,"Zafer, a sailor living with his mother Döndü i..."
34884,2017,Non-Transferable,Turkish,Brendan Bradley,"YouTubers Shanna Malcolm, Shira Lazar, Sara Fl...",romantic comedy,https://en.wikipedia.org/wiki/Non-Transferable...,The film centres around a young woman named Am...


### Necessary functions for cleaning up genre.

In [4]:
# Lookup tables used by get_super_genres: each key is a lower-case substring searched for in a
# genre description, each value is the standardized genre added when the key is found.
# all added genres will have no spaces (e.g., science-fiction, historical-fiction, coming-of-age,...)
# NOTE(review): keys are matched with a plain substring test, so short keys such as "art", "mob"
# or "love" also match inside longer words (e.g. "martial arts") — confirm this is intended.
# NOTE(review): plural-looking keys ("hip hops", "swashbucklers") presumably match CMU genre names
# after clean_imdb_genre strips " film"/" movie" — confirm before "correcting" them.
tags = { # these are main genres to look for even if they appear as a subword
         # (e.g., "romantic comedy" gets "romance" & "comedy")
        "romantic":"romance", "romance":"romance", "love":"romance", "comedy":"comedy", "drama":"drama", 
        "horror":"horror", "thriller":"thriller", 
        "fantasy":"fantasy", "science fiction":"science-fiction", "sci-fi":"science-fiction", 
        "epic":"epic", "action":"action", "adventure":"adventure", 
        "crime":"crime", "mystery":"mystery",
        "western":"western", "historical":"historical-fiction", # a plain "history" genre also exists. unclear if just real history, or mixed w/ historical fiction
        "animated":"animation", "animation":"animation", 
        "family":"family", "coming-of-age":"coming-of-age", "teen":"teen",
        "black-and-white":"black-and-white", "silent":"silent",
        "documentary":"documentary", "biographical":"biographical", "political":"political", "politics":"political",
        "musical":"musical", "opera":"opera", "music":"music",
        "tragedy":"tragedy",
        # from here on, each key is a common sub-genre and its value is the main genre it belongs to
        "space":"science-fiction", 
        "fairy tale":"fantasy", "folklore":"fantasy", "mythology":"fantasy",
        "costume drama":"historical-fiction", "period piece":"historical-fiction",
        "propaganda":"political",
        "slasher":"horror", "zombie":"horror", "psycho-biddy":"thriller", "suspense":"thriller",
        "slapstick":"comedy",
        "operetta":"opera", "rock":"music", "hip hops":"music", "concert":"music",
        "whodunit":"mystery", "detective":"mystery",
        "gangster":"crime", "mob":"crime",
        "anime":"animation", "nature":"documentary",
        "superhero":"action", "spy":"action", "heist":"action", "swashbucklers":"adventure",
        "avant-garde":"experimental", "surrealism":"experimental", "art":"experimental"}
# alternative spellings/phrasings mapped to the standardized genre name
to_rename = {"crime fiction":"crime", "clay animation":"animation", "historical documentaries":"documentary",
             "animated cartoon":"animation", "coming of age":"coming-of-age",
             "biopic":"biographical", "biography":"biographical", "bio-pic":"biographical", "biographic":"biographical"}

def get_super_genres(genre_str):
    """ Returns a set of (super) genres associated to genre_str.

    The result is the union of:
      * fixed expansions for a few blended genre names (e.g. "rom-com" -> romance + comedy),
      * to_rename[key] for every key of to_rename found as a substring of the genre,
      * tags[key] for every key of tags found as a substring of the genre,
      * every whitespace-separated word of the lower-cased genre itself.

    PARAMETERS:
    genre_str: a string describing a genre (it may hold several genres separated by spaces)

    RETURNS:
    a set of lower-case strings; empty if genre_str is empty, blank, or not a string
    """
    # missing values (e.g. a float NaN coming out of pandas) carry no genre information
    if not isinstance(genre_str, str):
        return set()
    genre = genre_str.lower()

    # blended genre names whose components would not be found by the substring tables
    blends = (
        ("comedy-drama", ("comedy", "drama")),
        ("rockumentary", ("music", "rock", "documentary")),
        ("tragicomedy", ("tragedy", "comedy")),
        ("rom-com", ("romance", "comedy")),
        ("dramedy", ("drama", "comedy")),
    )
    ans = set()
    for marker, parts in blends:
        if marker in genre:
            ans.update(parts)

    # standardized spellings first, then main genres / sub-genre -> main genre
    ans.update(new for old, new in to_rename.items() if old in genre)
    ans.update(main for key, main in tags.items() if key in genre)

    # wiki may have many genres separated by spaces; split() with no argument collapses
    # runs of whitespace, so repeated/leading/trailing spaces never add '' as a genre
    ans.update(genre.split())
    return ans

def clean_imdb_genre(arr):
    """ Returns a set of genres corresponding to the original (sub)genres in arr, but with more standardized
    names & with larger genre categories added if necessary.

    PARAMETERS:
    arr: a list of genres, as originally formatted (and split into a list) by the CMU dataset;
         each entry is expected to look like '"/m/<id>": "<genre name>"'

    RETURNS:
    the union of get_super_genres(...) over every '/'-separated part of every entry
    """
    ans = set()
    for s in arr:
        # remove the id prefix; raw strings keep the regex escapes (\s) valid Python literals
        new_s = re.sub(r'"/m/.*": ', '', s)
        # remove the filler words " film"/" Film" and " movie" (done before the quotes are
        # removed, exactly as before, so a name that *starts* with "Film" keeps that word)
        new_s = re.sub(r' .ilm|\smovie', '', new_s)
        new_s = new_s.replace('"', '')

        # split phrase into list of coarser genres; trim each part so that stray spaces
        # around the name or around '/' do not turn into empty-string genres
        for genre in new_s.split('/'):
            ans.update(get_super_genres(genre.strip()))
    return ans

def clean_wiki_genre(wiki_genres):
    """ Returns the set of standardized (super) genres for the genres listed in wiki_genres.

    PARAMETERS:
    wiki_genres: string which represents the genres assigned by wikipedia

    Multiple genres may be separated by slashes (/), by a dash surrounded by spaces
    (e.g. "horror - thriller", as opposed to "sci-fi"), or by commas. Genres separated
    only by spaces are handled by the word split inside get_super_genres.
    """
    # turn every supported separator into a comma, then split once
    comma_separated = wiki_genres.replace("/", ",").replace(" - ", ",")
    ans = set()
    for piece in comma_separated.split(","):
        # tidy case & extra whitespace before the lookup
        ans.update(get_super_genres(piece.strip().lower()))
    return ans

def find_duplicates(x):
    """ Returns True iff some movie in movie_plots has both the same title and the
    same release year as the movie represented by x.

    PARAMETERS:
    x: a series which represents a "row" in some kind of movie DataFrame
       (it is read through x.Title and x['Release Year'])
    """
    same_title = movie_plots.Title == x.Title
    if not same_title.any():
        return False
    years_for_title = movie_plots.loc[same_title, "Release Year"].values
    return bool(x['Release Year'] in years_for_title)

### Import and clean up CMU data. CMU movies with a duplicated title and release year are dropped when they also appear in the Kaggle data.

In [5]:
# Load the header-less CMU table, drop columns 0, 1, 4-7 and name columns 2, 3, 8.
cmu_data = (
    pd.read_csv("Data/cmu_movie_boxoffice.tsv", sep="\t", header=None)
    .drop(columns=[0, 1, 4, 5, 6, 7])
    .rename(columns={2: "Title", 3: "Release Year", 8: "Genre"})
)
# keep only the text before the first '-' of the release date, drop incomplete rows,
# and only then convert the year to an integer
cmu_data["Release Year"] = cmu_data["Release Year"].str.replace(r'-.*', '', regex=True)
cmu_data = cmu_data.dropna()
cmu_data["Release Year"] = cmu_data["Release Year"].astype('int64')

# Genre cleanup in one pass: strip the first and last character of the raw string,
# split it on commas, and standardize the resulting entries.
cmu_data["Genre"] = cmu_data["Genre"].apply(lambda raw: clean_imdb_genre(raw[1:-1].split(',')))

# Among the CMU rows that share a title & release year with another CMU row, drop those
# that also match a movie in movie_plots (find_duplicates is passed directly to apply).
is_repeated = cmu_data.duplicated(subset=['Title', 'Release Year'], keep=False)
cmu_dupl = cmu_data[is_repeated]
duplicates = cmu_dupl[cmu_dupl.apply(find_duplicates, axis=1)]
cmu_data = cmu_data.drop(index=duplicates.index)


In [6]:
# Count how many CMU movies carry each genre.
imdb_genres = defaultdict(int)  # genre -> number of movies whose genre set contains it
for name in (g for genre_set in cmu_data["Genre"] for g in genre_set):
    imdb_genres[name] += 1
imdb_genres


defaultdict(int,
            {'supernatural': 635,
             'western': 2232,
             'thriller': 8946,
             'science': 2920,
             'action': 9836,
             'adventure': 8445,
             'science-fiction': 2905,
             'horror': 5048,
             'space': 17,
             'fiction': 10164,
             'crime': 7820,
             'biographical': 2390,
             'mystery': 3153,
             'drama': 33148,
             'erotic': 340,
             'psychological': 1315,
             'silent': 5050,
             'indie': 6769,
             'comedy': 20270,
             'short': 7201,
             'black-and-white': 8808,
             'cinema': 7572,
             'fantasy': 2872,
             'world': 7163,
             'family': 5137,
             'musical': 4106,
             'music': 5201,
             'japanese': 2083,
             'movies': 3739,
             'ensemble': 447,
             'romance': 10599,
             'comedy-drama': 1678,
    

In [7]:
# The n most frequent genres as [count, genre] pairs, least frequent first.
n = 50
# nlargest selects the n biggest pairs (compared by count, then by name);
# sorted() then puts them in ascending order
minHeap = sorted(heapq.nlargest(n, ([count, name] for name, count in imdb_genres.items())))
minHeap

[[939, 'history'],
 [991, 'parody'],
 [1022, 'black'],
 [1047, 'television'],
 [1139, 'sports'],
 [1146, 'lgbt'],
 [1238, "children's"],
 [1314, 'bollywood'],
 [1315, 'psychological'],
 [1362, 'biography'],
 [1370, 'chinese'],
 [1462, 'of'],
 [1526, 'adaptation'],
 [1536, 'political'],
 [1678, 'comedy-drama'],
 [1739, 'piece'],
 [1741, 'period'],
 [1805, 'experimental'],
 [2083, 'japanese'],
 [2232, 'western'],
 [2257, 'film'],
 [2390, 'biographical'],
 [2540, 'war'],
 [2746, 'historical-fiction'],
 [2872, 'fantasy'],
 [2905, 'science-fiction'],
 [2920, 'science'],
 [3153, 'mystery'],
 [3246, 'animation'],
 [3739, 'movies'],
 [4106, 'musical'],
 [5038, 'documentary'],
 [5048, 'horror'],
 [5050, 'silent'],
 [5137, 'family'],
 [5201, 'music'],
 [5435, 'romantic'],
 [6769, 'indie'],
 [7163, 'world'],
 [7201, 'short'],
 [7572, 'cinema'],
 [7820, 'crime'],
 [8445, 'adventure'],
 [8808, 'black-and-white'],
 [8946, 'thriller'],
 [9836, 'action'],
 [10164, 'fiction'],
 [10599, 'romance'],
 [20

In [8]:
# Show the movies whose genre set contains none of the n most popular genres.
pop_gens = [name for _, name in minHeap]  # genre names only, counts dropped
def has_overlap(row):
    """ Returns True iff row["Genre"] contains at least one of the genres in pop_gens.

    PARAMETERS:
    row: a series (or other mapping) whose "Genre" entry supports the `in` operator
    """
    return any(genre in row["Genre"] for genre in pop_gens)
filt = [not has_overlap(movie) for _, movie in cmu_data.iterrows()]
cmu_data.loc[ filt ]

Unnamed: 0,Title,Release Year,Genre
18,Die Fahne von Kriwoj Rog,1967,{}
32,Emilia Galotti,1958,{}
42,Vinayaka Geleyara Balaga,2011,{}
50,Behind The Player: John 5,2008,{}
93,Vixen!,1968,"{porn, sexploitation, softcore}"
...,...,...,...
81581,"Ne daj se, Floki",2000,{}
81681,Aachariyangal,2012,{}
81693,Fiete im Netz,1958,{}
81706,Fierro a fondo,1952,{}


In [9]:
# Count the genres carried by the movies selected by filt (the "uncommon genre" movies).
small_genres = defaultdict(int)  # leftover genre -> number of such movies
for name in (g for genre_set in cmu_data.loc[filt, "Genre"] for g in genre_set):
    small_genres[name] += 1
small_genres

defaultdict(int,
            {'porn': 6,
             'sexploitation': 13,
             'softcore': 6,
             'pornographic': 131,
             'dogme': 3,
             '95': 3,
             'gay': 52,
             'pornography': 61,
             'satire': 6,
             'dance': 5,
             'blaxploitation': 4,
             'educational': 4,
             'religious': 9,
             'social': 5,
             '&': 13,
             'society': 12,
             'culture': 12,
             'issues': 3,
             'revenge': 1,
             'adult': 54,
             'hardcore': 6,
             'innovations': 1,
             'inventions': 1,
             'women': 1,
             'in': 1,
             'prisons': 1,
             'erotica': 23,
             'christmas': 5,
             'remake': 1,
             'pre-code': 6,
             'auto': 2,
             'racing': 2,
             'stop': 4,
             'motion': 4,
             'mockumentary': 8,
             'mondo': 1,
 

### Merge Kaggle and CMU data.

In [10]:
# Left join: every Kaggle movie is kept, and the CMU genre set is attached when a CMU
# movie has the same title and release year.
data = movie_plots.merge(cmu_data, how = 'left', on = ['Title', 'Release Year'])
# Both inputs have a "Genre" column, so the merge produces Genre_x (Kaggle) and
# Genre_y (CMU); the CMU one becomes the working "Genre" column.
data = data.rename(columns={'Genre_y':'Genre'})
def helper(x):
    """ Returns a genre set for one merged "Genre" value.

    PARAMETERS:
    x: a set of genres, or a placeholder for "no CMU match" (NaN after the merge, or 0)

    RETURNS:
    a copy of x if x is a set, otherwise a new empty set. Returning a copy keeps later
    in-place updates of data's genre sets from also changing the sets stored in cmu_data.
    """
    if isinstance(x, set):
        return set(x)
    return set()
# Assign the column explicitly instead of calling fillna(..., inplace=True) on
# data.Genre: in-place changes through a selected column are not guaranteed to reach
# the frame (and do not under pandas Copy-on-Write).
data["Genre"] = data["Genre"].apply(helper)

In [11]:
# spot-check 20 random merged rows (no random_state is passed, so the rows differ between runs)
data.sample(20)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre_x,Wiki Page,Plot,Genre
18856,1951,One Wild Oat,British,Charles Saunders,"Robertson Hare, Stanley Holloway, Sam Costa",comedy,https://en.wikipedia.org/wiki/One_Wild_Oat,A barrister (Robertson Hare) attempts to disco...,"{comedy, domestic}"
12351,1995,Dream a Little Dream 2,American,James Lemmo,"Corey Feldman, Corey Haim",comedy,https://en.wikipedia.org/wiki/Dream_a_Little_D...,The pair of Bobby Keller (Corey Feldman) and D...,"{indie, comedy, thriller, fantasy, sex, drama}"
22501,2017,Love Contractually,Chinese,Liu Guonan,"Sammi Cheng, Joseph Chang, Lam Suet",comedy / romance,https://en.wikipedia.org/wiki/Love_Contractually,"Bo is the new assistant to Jin, a beautiful CE...",{}
33699,2013,Snowpiercer,South_Korean,Bong Joon-ho,"Chris Evans, Song Kang-ho",unknown,https://en.wikipedia.org/wiki/Snowpiercer,"In 2014, an attempt to counteract global warmi...",{}
26724,1989,Tarka,Kannada,Sunil Kumar Desai,"Shankar Nag, Vanitha Vasu, Devaraj, Avinash",unknown,https://en.wikipedia.org/wiki/Tarka_(movie),"The film opens to Akshay (Shankar Nag), having...",{}
18439,1940,Tilly of Bloomsbury,British,Leslie S. Hiscot,"Sydney Howard, Jean Gillie",comedy,https://en.wikipedia.org/wiki/Tilly_of_Bloomsb...,The rich and wealthy aristocrat socialité bach...,{comedy}
23043,1997,"All's Well, Ends Well 1997",Hong Kong,Alfred Cheung,"Stephen Chow, Raymond Wong Pak-ming",comedy,"https://en.wikipedia.org/wiki/All%27s_Well,_En...",Kung (Stephen Chow) is the spoiled youngest br...,{}
19011,1954,Front Page Story,British,Gordon Parry,"Jack Hawkins, Eva Bartok",drama,https://en.wikipedia.org/wiki/Front_Page_Story,Grant is a hard working Fleet Street newspaper...,{drama}
32171,1972,Female Prisoner #701: Scorpion,Japanese,"Itō, ShunyaShunya Itō","Meiko Kaji, Isao Natsuyagi, Fumio Watanabe",unknown,https://en.wikipedia.org/wiki/Female_Prisoner_...,Nami Matsushima (Meiko Kaji) is set up by her ...,"{crime, prisons, thriller, japanese, action, a..."
141,1916,Hulda from Holland,American,John B. O'Brien,"Mary Pickford, Frank Losee",drama,https://en.wikipedia.org/wiki/Hulda_from_Holland,"Upon the death of her parents, little Hulda fi...",{}


### Incorporate genres from Kaggle data

In [12]:
# Add the Kaggle/Wikipedia genres (column Genre_x) to each movie's genre set.
# The sets stored in data["Genre"] are updated in place, so nothing is assigned back.
# Walking the two columns in parallel does not depend on data's index labels
# (label-based data.loc[i, ...] only works with a default 0..n-1 index).
for wiki_genres, genres in zip(data["Genre_x"], data["Genre"]):
    # skip the 'unknown' placeholder as well as missing (non-string) values
    if isinstance(wiki_genres, str) and wiki_genres != 'unknown':
        genres.update(clean_wiki_genre(wiki_genres))

# NOTE: there WILL be extraneous "useless" genres in the genre list. We should only choose the top n.

In [13]:
# the Kaggle genre strings have been folded into "Genre"; drop the raw column
data = data.drop(columns=['Genre_x'])

In [None]:
# summary of the merged frame
data.info()

In [None]:
# Show the merged movies whose genre set contains none of the n most popular genres.
final_filt = [not has_overlap(movie) for _, movie in data.iterrows()]
data.loc[ final_filt ]

In [None]:
# Count the genres carried by the merged movies selected by final_filt.
final_small_genres = defaultdict(int)  # leftover genre -> number of such movies
for name in (g for genre_set in data.loc[final_filt, "Genre"] for g in genre_set):
    final_small_genres[name] += 1
final_small_genres

In [15]:
# write the merged frame (plots + combined genre sets) to a relative path
# NOTE(review): presumably pandas infers zip compression from the ".zip" suffix — confirm
data.to_csv('wiki_plots_with_genres.csv.zip')

### Helper functions.

In [None]:
def search_by_pattern(arr,pattern):
    """ Returns, in iteration order, the entries of arr in which pattern is found.

    pattern is handed to re.search, so it is interpreted as a regular expression;
    a plain word therefore behaves like a substring search.

    PARAMETERS:
    arr: a list (or other iterable container) of strings, e.g. a dict keyed by genre
    pattern: a string
    """
    return [entry for entry in arr if re.search(pattern, entry)]
#Example: search_by_pattern(imdb_genres,"anim") 

In [None]:
# list the keys of imdb_genres in which "short" is found
search_by_pattern(imdb_genres,"short")

In [None]:
#make genre predictor/associated in the model?