In [1]:
import ast 
import csv
import numpy as np
from collections import Counter

In [2]:
# First lets parse the data
reader = csv.reader(open("movie_recommendations.csv", "rb"), delimiter=",")
data = list(reader)
print data

[['Movie Title ', 'Genre of Movie ', 'My Rating on IMDb', 'IMDb Rating ', 'Metacritic Score ', 'Rotten Tomato Freshness'], ['', '', '', '', '', ''], ['1) Confidence (2003)', 'Crime/Thriller/Heist', '7/10', '6.3/10', '59', '69%'], ['2) Monty Python and the Holy Grail (1975)', 'Comedy', '8/10', '8.3/10', '93', '97%'], ['3) God Bless America (2011) ', 'Comedy', '8/10', '7.2/10', '56', '67%'], ['4) Pitch Perfect (2012)', 'Comedy/Music', '8/10', '7.2/10', '66', '80%'], ['5) Seven Psycopaths (2012)', 'Comedy/Crime', '9/10', '7.2/10', '66', '82%'], ['6) How To Rob a Bank (2007)', 'Comedy/Crime/Heist', '7/10', '6.1/10', '27', '17%'], ['7) Con Air (1997)', 'Action', '8/10', '6.8/10', '52', '55%'], ['8) Hansel and Gretel: Witch Hunters (2013)', 'Action/Fantasy/Horror', '7/10', '6.1/10', '21', '14%'], ['9) Dredd (2012)', 'Action/Sci-Fi', '8/10', '7.1/10', '59', '78%'], ['10) Invincible (2006)', 'Biopic/Drama/Sport', '7/10', '7.1/10', '63', '72%'], ['11) Flight (2012)', 'Drama ', '7/10', '7.3/10',

In [3]:
# The first row has header info and the second row is empty, so we can ignore them.
# Note: the data is stored as strings, so we need to process it some more
text_data = np.array(data[2:])

movie_titles = text_data[:,0]
raw_movie_genres = text_data[:,1]
raw_omkar_ratings = text_data[:,2]
raw_imdb_ratings = text_data[:,3]
# -SOON->
# raw_meta_critic_ratings = result[:,4]
# raw_rotten_tomato_ratings = result[:,5]

In [4]:
# Now lets normalize these ratings so they are between 0 and 1
from __future__ import division # so that python will evaluate 3/10 as a floating pt operation instead of an integer op

def string_to_numpy(string_arr):
    tmp_list = []
    for string_val in string_arr:
        if string_val is 'N/A':
            tmp_list.append(0)
        else:
            tmp_list.append(eval(string_val))
    return np.asarray(tmp_list).astype("float")

omkar_ratings = string_to_numpy(raw_omkar_ratings)
imdb_ratings = string_to_numpy(raw_imdb_ratings)

In [5]:
# Now lets look at how well Omkar's rating correlates with IMDB across all genres
np.corrcoef(omkar_ratings, imdb_ratings)

array([[1.        , 0.64049621],
       [0.64049621, 1.        ]])

In [6]:
# Num unique genres
all_genres = []
for raw_genres in raw_movie_genres:
    genres = raw_genres.split('/')
    for genre in genres:
        word = genre.lower().strip()
        # spelling mistakes
        if word == 'crme':
            word = 'crime'
        elif word == 'myster':
            word = 'mystery'
        all_genres.append(word)

unique_genres = sorted(set(all_genres))
counts = Counter(all_genres)
print unique_genres
print counts

['action', 'adventure', 'animation', 'biography', 'biopic', 'blaxploitation', 'body horror', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'heist', 'horror', 'music', 'mystery', 'rom-com', 'romance', 'sci-fi', 'sport', 'thriller']
Counter({'action': 82, 'comedy': 76, 'drama': 74, 'adventure': 43, 'thriller': 34, 'sci-fi': 28, 'crime': 27, 'mystery': 19, 'horror': 15, 'rom-com': 12, 'fantasy': 9, 'animation': 5, 'sport': 4, 'heist': 4, 'biography': 3, 'romance': 3, 'music': 3, 'documentary': 3, 'biopic': 2, 'blaxploitation': 1, 'body horror': 1, 'film-noir': 1})


In [7]:
max_correlation = 0
max_corr_genre = 'N/A'

for genre in unique_genres:
    use = []
    for raw_genres in raw_movie_genres:
        use.append(genre in raw_genres.lower())
    if sum(use) < 3:
        print '> Genre "{}" has too few examples ({})'.format(genre, counts[genre])
        continue
    correlation = np.corrcoef(omkar_ratings[use], imdb_ratings[use])[0, 1]
    print 'Genre: {}, Num. data pts: {}, Correlation: {}'.format(genre, counts[genre], correlation)
    
    if correlation > max_correlation:
        max_correlation = correlation
        max_corr_genre = genre



Genre: action, Num. data pts: 82, Correlation: 0.607767210592
Genre: adventure, Num. data pts: 43, Correlation: 0.787136204089
Genre: animation, Num. data pts: 5, Correlation: 0.727451044419
Genre: biography, Num. data pts: 3, Correlation: 0.998221166691
> Genre "biopic" has too few examples (2)
> Genre "blaxploitation" has too few examples (1)
> Genre "body horror" has too few examples (1)
Genre: comedy, Num. data pts: 76, Correlation: 0.699150172174
Genre: crime, Num. data pts: 27, Correlation: 0.668399201081
Genre: documentary, Num. data pts: 3, Correlation: 1.40998465126e-15
Genre: drama, Num. data pts: 74, Correlation: 0.651078456144
Genre: fantasy, Num. data pts: 9, Correlation: 0.68665053402
> Genre "film-noir" has too few examples (1)
Genre: heist, Num. data pts: 4, Correlation: 0.971665620578
Genre: horror, Num. data pts: 15, Correlation: 0.716856681088
Genre: music, Num. data pts: 3, Correlation: 0.789865330519
Genre: mystery, Num. data pts: 19, Correlation: 0.580189889402
Ge

In [8]:
print "Max. correlated genre: {}, ({})".format(max_corr_genre, max_correlation)

Max. correlated genre: biography, (0.998221166691)
