In [805]:
import pandas as pd
import numpy as np
import regex as re
from collections import defaultdict
import heapq

### Import and clean up Kaggle data. Movies that share a title and release year are dropped; the dropped set includes movies whose formatting was corrupted.

In [784]:
# Load the Kaggle Wikipedia movie-plot table.
# NOTE(review): the path is relative to the author's own directory layout -- adjust it for your checkout.
movie_plots = pd.read_csv("../Documents/Erdos Data Science Boot Camp/wiki_movie_plots_deduped.csv")
# Keep only the first row for every Wiki page.
movie_plots = movie_plots.drop_duplicates(subset=['Wiki Page'])
# Remove *every* row whose (Title, Release Year) pair occurs more than once (keep=False),
# so that the pair identifies a movie uniquely when it is used as a merge key.
movie_plots = movie_plots.loc[~movie_plots.duplicated(subset=['Title','Release Year'],keep=False)]

### Necessary functions for cleaning up genre.

In [820]:
# Coarse genre tags: a genre phrase that contains a key (as a substring) also receives the mapped tag.
tags = {
    "romantic": "romance", "comedy": "comedy", "animation": "animation", "horror": "horror",
    "thriller": "thriller", "epic": "epic", "musical": "musical", "fantasy": "fantasy",
    "science fiction": "science fiction", "sci-fi": "science fiction", "action": "action",
    "adventure": "adventure", "drama": "drama", "crime": "crime", "western": "western",
    "historical": "historical fiction", "animated": "animation",
}
# Whole-phrase renames: a phrase listed here is replaced outright and is not scanned for tags.
to_rename = {
    "crime fiction": "crime", "clay animation": "animation",
    "historical documentaries": "documentary", "animated cartoon": "animation",
}
# TODO: political, space, slasher (all horror automatically?), costume drama --> historical fiction

# Raw-string patterns, compiled once instead of on every call.
_GENRE_ID_RE = re.compile(r'"/m/.*": ')      # leading `"/m/<id>": ` key of a raw genre entry
_FILM_WORD_RE = re.compile(r' .ilm|\smovie')  # trailing " film" / " Film" / " movie"


def clean_genre(arr):
    """Normalise raw genre entries into a set of lower-case genre labels.

    Parameters
    ----------
    arr : iterable of str
        Raw entries such as ``'"/m/01jfsb": "Thriller"'``; a leading
        ``"/m/<id>": `` key, double quotes and surrounding whitespace are removed.

    Returns
    -------
    set of str
        For every '/'-separated phrase of every entry:
        * ``'comedy-drama'`` contributes ``'comedy'`` and ``'drama'``;
        * a phrase found in ``to_rename`` contributes only its replacement;
        * any other non-empty phrase contributes itself plus the value of every
          ``tags`` key it contains.
        Empty phrases contribute nothing.
    """
    ans = set()
    for s in arr:
        # Remove the id key first, then the " film"/" movie" suffix, then quotes and padding.
        new_s = _FILM_WORD_RE.sub('', _GENRE_ID_RE.sub('', s))
        new_s = new_s.replace('"', '').strip()

        # Split compound phrases ("action/adventure") into their coarser parts.
        for genre in new_s.split('/'):
            # Strip each part too, so "Action / Adventure" does not yield padded labels.
            genre = genre.strip().lower()
            if not genre:
                continue
            if genre == 'comedy-drama':
                ans.update(('comedy', 'drama'))
            elif genre in to_rename:
                ans.add(to_rename[genre])
            else:
                ans.update(label for key, label in tags.items() if key in genre)
                ans.add(genre)
    return ans
def find_duplicates(x):
    """Tell whether ``movie_plots`` holds a movie with the same title and release year as ``x``.

    ``x`` is a row-like object exposing ``Title`` and ``Release Year``; the lookup
    runs against the module-level ``movie_plots`` frame. Returns a plain ``bool``.
    """
    same_title = movie_plots.Title == x.Title
    same_year = movie_plots["Release Year"] == x["Release Year"]
    return bool((same_title & same_year).any())

### Import and clean up CMU data. CMU movies with a duplicated title and release year are dropped when they also appear in the Kaggle data.

In [821]:
# Read the header-less, tab-separated CMU table and keep only columns 2, 3 and 8.
cmu_data = pd.read_csv("Data/cmu_movie_boxoffice.tsv", sep="\t", header=None)
cmu_data = cmu_data.drop(columns=[0, 1, 4, 5, 6, 7])
cmu_data = cmu_data.rename(columns={2: "Title", 3: "Release Year", 8: "Genre"})

# Cut the release string at its first '-' (presumably leaving the year of a
# YYYY-MM-DD date -- confirm against the raw file), drop incomplete rows, then cast.
cmu_data["Release Year"] = cmu_data["Release Year"].str.replace(r'-.*', '', regex=True)
cmu_data = cmu_data.dropna()
cmu_data["Release Year"] = cmu_data["Release Year"].astype('int64')

# Per genre cell: discard the first and last character, split on commas,
# and normalise the pieces into a set of labels with clean_genre.
cmu_data["Genre"] = cmu_data["Genre"].apply(lambda raw: clean_genre(raw[1:-1].split(',')))

# Of the rows that share a (Title, Release Year) pair inside the CMU table,
# remove the ones that also match a movie in movie_plots.
cmu_dupl = cmu_data[cmu_data.duplicated(subset=['Title', 'Release Year'], keep=False)]
duplicates = cmu_dupl[cmu_dupl.apply(find_duplicates, axis=1)]
cmu_data = cmu_data.drop(index=duplicates.index)



In [822]:
# Genre label -> number of rows in cmu_data whose genre set contains it.
imdb_genres = defaultdict(int)
for movie_genres in cmu_data["Genre"]:
    for genre in movie_genres:
        imdb_genres[genre] += 1

# The n most frequent genres as [count, genre] pairs, sorted in ascending order.
# heapq.nlargest replaces the hand-rolled bounded heap and selects the same pairs.
n = 50
minHeap = sorted(heapq.nlargest(n, ([count, genre] for genre, count in imdb_genres.items())))

In [828]:
# Display the genre -> count mapping.
imdb_genres

defaultdict(int,
            {'adventure': 8432,
             'western': 2232,
             'supernatural': 635,
             'action': 9557,
             'thriller': 8904,
             'space western': 13,
             'science fiction': 2904,
             'horror': 5003,
             'drama': 33148,
             'mystery': 3053,
             'crime drama': 552,
             'crime': 7773,
             'biographical': 1099,
             'psychological thriller': 1312,
             'erotic thriller': 207,
             'indie': 6769,
             'silent': 5050,
             'short': 7201,
             'black-and-white': 8808,
             'comedy': 20268,
             'world cinema': 7144,
             'fantasy': 2857,
             'family': 4419,
             'musical': 4106,
             'japanese movies': 2083,
             'romantic comedy': 2631,
             'romantic drama': 3341,
             'ensemble': 447,
             'romance': 10599,
             'film adaptation': 1526,


### Merge Kaggle and CMU data.

In [825]:
# Left-join the CMU genre sets onto the Kaggle rows. Both inputs have a "Genre"
# column, so the merge suffixes them: Genre_x (Kaggle) and Genre_y (CMU).
data = movie_plots.merge(cmu_data, how = 'left', on = ['Title', 'Release Year'])
data = data.rename(columns={'Genre_y':'Genre'})
def helper(x):
    """Return the genre set for one cell of the merged ``Genre`` column.

    A cell that already holds a set is returned as a fresh copy, so later
    in-place additions do not also change the set stored in ``cmu_data``.
    Anything else -- the missing value left by the left join for unmatched
    movies, or a ``0`` placeholder -- becomes a new empty set.
    """
    if isinstance(x, (set, frozenset)):
        return set(x)
    return set()
# Assign the column explicitly: a chained `data.Genre.fillna(..., inplace=True)`
# may act on a temporary and leave `data` unchanged.
data["Genre"] = data["Genre"].apply(helper)

In [813]:
# Fixed seed so the preview shows the same rows on every run.
data.sample(20, random_state=0)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre_x,Wiki Page,Plot,Genre
25879,2006,Chup Chup Ke,Bollywood,Priyadarshan,"Kareena Kapoor, Shahid Kapoor, Neha Dhupia, Ra...","comedy, romance",https://en.wikipedia.org/wiki/Chup_Chup_Ke,Jeetu (Shahid Kapoor) is constantly getting hi...,"{romantic comedy, world cinema, comedy of erro..."
31978,1950,Scandal,Japanese,Akira Kurosawa,Toshirō Mifune,drama,https://en.wikipedia.org/wiki/Scandal_(1950_film),"Ichiro Aoye (Toshirō Mifune), an artist, meets...","{world cinema, drama, japanese movies, comedy}"
17208,2017,Jeepers Creepers 3,American,Victor Salva,Victor Salva (director/screeplay); Jonathan Br...,horror,https://en.wikipedia.org/wiki/Jeepers_Creepers_3,The film opens with a shuriken flying through ...,{}
23903,1948,Ghar Ki Izzat,Bollywood,Ram Daryani,"Dilip Kumar, Mumtaz Shanti, Jeevan, Gope, Mano...",social family drama,https://en.wikipedia.org/wiki/Ghar_Ki_Izzat_(1...,"Chandra (Dilip Kumar, a young lawyer, lives in...",{action}
20475,1997,Metroland,British,Philip Saville,"Christian Bale, Emily Watson",comedy,https://en.wikipedia.org/wiki/Metroland_(1997_...,"In 1977, Chris (Christian Bale) and Marion (Em...","{indie, drama, marriage drama, romantic drama,..."
26795,2003,Swathi Muthu,Kannada,D. Rajendra Babu,"Sudeep, Meena",unknown,https://en.wikipedia.org/wiki/Swathi_Muthu,"Shivaiah (Sudeep), an autistic, lives in a vil...",{}
25504,1998,Zakhm,Bollywood,Mahesh Bhatt,"Ajay Devgan, Sonali Bendre, Pooja Bhatt, Akkin...",nadeem-shravan,https://en.wikipedia.org/wiki/Zakhm,Music director Ajay (Ajay Devgn) argues with h...,"{bollywood, world cinema, drama}"
4883,1949,Hellfire,American,R.G. Springsteen,"Wild Bill Elliott, Marie Windsor, Forrest Tucker",western,https://en.wikipedia.org/wiki/Hellfire_(1949_f...,Drifting gambler Zeb Smith promises a dying pr...,"{adventure, action, western}"
2139,1936,Thirteen Hours by Air,American,Mitchell Leisen,"Fred MacMurray, Joan Bennett, ZaSu Pitts",drama,https://en.wikipedia.org/wiki/Thirteen_Hours_b...,Airline pilot Jack Gordon (Fred MacMurray) on ...,"{action, drama, disaster}"
30963,1962,Rakta Sambandham,Telugu,V. Madhusudan Rao,"N. T. Rama Rao, Savitri, Kanta Rao, Devika",drama,https://en.wikipedia.org/wiki/Rakta_Sambandham...,Raju (N. T. Rama Rao) & Radha (Savitri) are si...,{}


### Incorporate genres from Kaggle data

In [839]:
#go through and add genre_x genres to genre only if genre_x is an imdb genre
# Iterating the two columns together yields the set objects stored in the frame,
# so updating them in place updates the "Genre" column.
for wiki_genres, genres in zip(data["Genre_x"], data["Genre"]):
    # Skip the 'unknown' placeholder and any non-string (e.g. missing) value.
    if not isinstance(wiki_genres, str) or wiki_genres == 'unknown':
        continue
    wiki_lower = wiki_genres.lower()  # lower-case once per row, not once per candidate genre
    genres.update(val for val in imdb_genres if val in wiki_lower)


In [842]:
# errors='ignore' keeps this cell re-runnable after the column has already been removed.
data = data.drop(columns=['Genre_x'], errors='ignore')

In [844]:
# Write the merged table to disk in CSV format.
# NOTE(review): the file name has no extension, and with to_csv's default settings
# the row index is written as an extra first column -- confirm downstream readers expect both.
data.to_csv('wiki_plots_with_genres')

### Helper functions.

In [723]:
def search_by_pattern(arr,pattern):
    """Collect, in iteration order, the items of ``arr`` in which ``pattern`` is found.

    ``arr`` is any iterable of strings (iterating a dict yields its keys) and
    ``pattern`` is a regular expression searched anywhere in each item.
    Returns a list of the matching items.
    """
    return [val for val in arr if re.search(pattern, val)]
#Example: search_by_pattern(imdb_genres,"anim") 

In [791]:
# List every genre label whose name contains "family".
search_by_pattern(imdb_genres,"family")

['family',
 'family drama',
 'family-oriented adventure',
 'family & personal relationships']

In [None]:
#make genre predictor/associated in the model?