# Gender Disparity in IMDB Ratings

Do you remember when they announced Ghostbusters was being remade with female leads? People seemed pretty upset about it. Almost immediately, after release, the films rating tanked. Was it a bad movie? Maybe. But a quick look at IMDB's ratings (see here: https://www.imdb.com/title/tt1289401/ratings?demo=imdb_users) shows that although 17% of raters gave it 1 star, 11% of users gave it ten stars. This type of disparity is pretty uncommon - ratings don't usually follow this barbell pattern. Digging deeper I realized that about 24% of women gave 10 stars but about 20% of all men gave 1 star. Because there were three times more men rating the film than women, ratings from men had a much more significant impact. This made me wonder if there are other films where there's a rating disparity between men and women. 

In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from scipy import stats

In [22]:
#Download the data
movie_graph_data = pd.read_csv('https://raw.githubusercontent.com/sampurkiss/movie_ratings_imdb/master/movie_ratings.csv')
#Add additional columns to database    
movie_graph_data['gender_ratings'] = movie_graph_data['no_of_male_ratings']+movie_graph_data['no_of_female_ratings']
movie_graph_data['ratings_differential'] = movie_graph_data['males']- movie_graph_data['females']


In [120]:
###############################################
#Get graph showing movie ratings by proportion of respective gender
#x-values are mens rating, y-values are womans ratings
###############################################

hover_text =[]
for i in range(0,len(movie_graph_data)):
    hover_text.append(('Movie: {movie}<br>'+
                       'Release date: {date}<br>'+
                       'Genre: {genre}').format(
                               movie = movie_graph_data['name'].iloc[i],
                               date = movie_graph_data['year'].iloc[i],
                               genre=movie_graph_data['genre'].iloc[i]))


traces =[]
for i in range(1,11):
    trace0 =go.Scatter(x = movie_graph_data['male_rating_'+str(i)]/movie_graph_data['no_of_male_ratings'],
                   y = movie_graph_data['female_rating_'+str(i)]/movie_graph_data['no_of_female_ratings'],
                   mode = 'markers',
                   text = hover_text,
                   name = str(i)+' stars')
    traces.append(trace0)

dissecting_line = go.Scatter(x =[0,2],
                            y = [0,2],
                            mode = 'lines',
                            name = 'Intersecting Line',
                            marker = dict(color = 'rgba(106,114,119,0.5)'))
traces.append(dissecting_line)

layout = go.Layout(title = 'Movie Ratings by Gender Proportion',
                   hovermode = 'closest',
                   showlegend = True,
                   xaxis = dict(title = 'Proportion of All Male Ratings',
                                range=[0, 1],
                               tickformat='.0%'),
                   yaxis = dict(title = 'Proportion of All Female Ratings',
                                range=[0, 1],
                               tickformat='.0%'))
fig = go.Figure(data = traces, layout = layout)

py.iplot(fig, filename='movie-ratings-by-gender')


In [86]:
###########################################################
#Distribution of women rating movies vs. men
###########################################################

x0 = movie_graph_data['no_of_female_ratings']/movie_graph_data['gender_ratings']
data = go.Histogram(x = x0)

layout = go.Layout(title = 'Proportion of women giving ratings',
                   xaxis = dict(title = 'Proportion of Female Raters',
                                range = [0,1],
                               tickformat='.0%'),
                  yaxis=dict(title='Number of Movies'))

fig = go.Figure(data = [data], layout = layout)

py.iplot(fig, filename='proportion-of-female-raters')

In [119]:
#Get ranking differentials
differentials = movie_graph_data
differentials['male_ranking']= differentials['males'].rank()
differentials['female_ranking']= differentials['females'].rank()
differentials = differentials.query("male_ranking<250|female_ranking<250")

trace = go.Scatter(y= differentials['ranking_differential'],
                  x = differentials['no_of_female_ratings']/differentials['no_of_ratings'],
                  text = differentials['name'],
                  mode='markers',
                  name ='')

layout = go.Layout(title = 'Ranking differential by proportion of films rated by women',
                   hovermode = 'closest',
                   showlegend = False,
                  xaxis = dict(tickformat='.0%',
                              title = 'Proportion of Female Ratings'),
                  yaxis=dict(tickformat='0,000',
                            title='Ranking differential'))

fig = go.Figure(data = [trace], layout = layout)

py.iplot(fig, filename = 'rating-differential')

In [64]:
###########################################################
#Rating differentials between women and men
###########################################################


ratings_diff = movie_graph_data[['no_of_female_ratings','gender_ratings','ratings_differential']]
ratings_diff['proportion'] = (ratings_diff['no_of_female_ratings']/ratings_diff['gender_ratings']/5).round(2)*5
ratings_diff = ratings_diff.groupby(['proportion'])[['proportion','ratings_differential']].agg(['mean','count'])
ratings_diff['films'] = ['Number of films: ' + str(ratings_diff['proportion']['count'].iloc[i]) for i in range(0,len(ratings_diff))]

trace = go.Scatter(x = ratings_diff['proportion']['mean'],
                   y = ratings_diff['ratings_differential']['mean'],
                   mode = 'lines',
                   text = ratings_diff['films'],
                   name = '')

layout = go.Layout(title = 'Ratings differential by proportion of female raters',
                   showlegend = False,
                    xaxis = dict(title = 'Proportion of Female Raters',
                                 range=[0, .75],
                                tickformat ='.0%'),
                       yaxis = dict(title = 'Ratings differential',
                    range=[-2, 2]))
                    
fig= go.Figure(data = [trace], layout = layout)

py.iplot(fig, filename = 'ratings-differentials-female')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [103]:
###########################################################
#Ratings histogram
###########################################################

ratings = movie_graph_data[['male_rating_10', 'male_rating_9', 'male_rating_8', 'male_rating_7',
       'male_rating_6', 'male_rating_5', 'male_rating_4', 'male_rating_3',
       'male_rating_2', 'male_rating_1', 'female_rating_10', 'female_rating_9',
       'female_rating_8', 'female_rating_7', 'female_rating_6',
       'female_rating_5', 'female_rating_4', 'female_rating_3',
       'female_rating_2', 'female_rating_1']].aggregate(sum)
ratings = pd.DataFrame(ratings)
ratings.reset_index(inplace=True)
ratings = ratings.rename(columns={'index':'type',0:'number'})

rating_range = []
for i in range(10, 0, -1):
    rating_range.append(i)

trace = go.Bar(x = rating_range,
                     y = ratings['number'].iloc[0:10]/sum(ratings['number'].iloc[0:10]),
                     name = "Mens")
trace2 = go.Bar(x = rating_range,
                     y = ratings['number'].iloc[10:20]/sum(ratings['number'].iloc[10:20]),
                     name = "Womens")
    
layout = go.Layout(title='Ratings distribution by gender',
                  yaxis=dict(title='Proportion of Respective Gender',
                            tickformat='.0%'),
                  xaxis=dict(title='Star Rating Out of 10')) 
    
fig = go.Figure(data = [trace, trace2], layout=layout)
py.iplot(fig, filename = 'rating-distribution')



In [85]:
###########################################################
#Box office returns for female rated movies
###########################################################
box_office = movie_graph_data
#Get average box office return by rating
box_office['proportion_female'] = (box_office['no_of_female_ratings']/\
                                         box_office['gender_ratings']/5).round(2)*5

#Note: need to filter out films without any box office information
box_office=(box_office[box_office['gross_worldwide']>0][['gross_worldwide','proportion_female']]
            .groupby(by=['proportion_female'])
            .agg(['mean','count'])
            .reset_index())


trace = go.Scatter(x = box_office['proportion_female'],
                   y = box_office['gross_worldwide']['mean'],
                   mode = 'lines',
                   text = ['Number of films: '+str(num) for num in box_office['gross_worldwide']['count']],
                   name = '')


layout = go.Layout(title = 'Average box office amount by proportion of female raters',
                   showlegend = False,
                   hovermode = 'closest',
                    xaxis = dict(title = 'Proportion of Female Raters',
                                 range=[0, .75],
                                tickformat ='.0%'),
                       yaxis = dict(title = 'Box Office Amount'))
                    
fig= go.Figure(data = [trace], layout = layout)

py.iplot(fig, filename = 'box-office-revenue-female')

Unnamed: 0.1,Unnamed: 0,id,name,genre,year,gross_usa,gross_worldwide,imdb_users,no_of_ratings,aged_under_18,...,female_rating_4,female_rating_3,female_rating_2,female_rating_1,gender_ratings,ratings_differential,proportion_female,male_ranking,female_ranking,ranking_differential
84,0,tt0082198,Conan the Barbarian,"Adventure,Fantasy",1982.0,39565475.0,2.928600e+07,6.9,126026.0,6.7,...,476.0,269.0,190.0,289.0,102776.0,0.6,0.05,623.5,207.5,416.0
109,0,tt0089530,Mad Max Beyond Thunderdome,"Action,Adventure,Sci-Fi",1985.0,36200000.0,,6.3,115804.0,6.6,...,423.0,224.0,117.0,156.0,94414.0,-0.2,0.10,222.0,256.0,-34.0
110,0,tt0089880,Rambo: First Blood Part II,"Action,Adventure,Thriller",1985.0,150415432.0,3.004004e+08,6.4,134939.0,6.8,...,335.0,155.0,124.0,190.0,109605.0,0.3,0.05,382.5,167.0,215.5
139,0,tt0095956,Rambo III,"Action,Adventure,Thriller",1988.0,53715611.0,1.890156e+08,5.8,106345.0,6.0,...,341.0,190.0,114.0,219.0,86274.0,0.2,0.05,99.0,38.5,60.5
141,0,tt0096320,Twins,"Comedy,Crime",1988.0,111936400.0,1.047000e+08,6.0,100907.0,6.3,...,879.0,408.0,223.0,192.0,79837.0,0.1,0.10,149.0,80.0,69.0
155,0,tt0099938,Kindergarten Cop,"Comedy,Crime",1990.0,91457688.0,2.019577e+08,6.1,122534.0,6.7,...,934.0,375.0,264.0,200.0,98445.0,-0.1,0.15,180.0,167.0,13.0
158,0,tt0100403,Predator 2,"Action,Horror,Sci-Fi",1990.0,30669413.0,5.712032e+07,6.3,129969.0,6.0,...,396.0,211.0,145.0,146.0,103895.0,0.2,0.05,269.5,131.5,138.0
159,0,tt0100507,Rocky V,"Drama,Sport",1990.0,40946358.0,1.199464e+08,5.2,111780.0,5.8,...,573.0,268.0,161.0,194.0,88693.0,-0.4,0.05,24.5,38.5,-14.0
170,0,tt0103644,Alien³,"Action,Horror,Sci-Fi",1992.0,55473600.0,1.597736e+08,6.4,249120.0,5.9,...,1325.0,1443.0,1408.0,1798.0,198718.0,0.1,0.10,324.0,207.5,116.5
173,0,tt0103855,The Bodyguard,"Drama,Music,Romance",1992.0,121945720.0,4.109000e+08,6.2,105664.0,6.7,...,1091.0,533.0,281.0,280.0,84893.0,-0.5,0.25,180.0,373.0,-193.0
