# Importing the packages and data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Column dtypes enforced while reading the workbook
# ('release_year' is deliberately kept as text, the money columns as floats).
data_types = {'release_year': str,
              'gross_earnings': float,
              'budget': float}

# NOTE(review): absolute, machine-specific path - the notebook only runs on a
# machine that has the workbook at exactly this location.
file = '/Users/pedro_andrade/Desktop/Intro to Python/movies_unlocked.xlsx'

# First sheet of the workbook, first row used as the header
df = pd.read_excel(io = file,
                   sheet_name = 0,
                   dtype = data_types,
                   header = 0)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()

df.head()

In [None]:
# Applying the heuristic rule on the dataset

# Percentage of observations (rows) that hold at least one missing value.
# It is derived from the data instead of hard-coded counts so the figure
# stays correct if the workbook changes; these are exactly the rows that a
# plain dropna() removes.
heuristic_rule = df.isnull().any(axis = 1).mean() * 100
rounded_rule = round(heuristic_rule, ndigits = 2)

print('-'*40)

# Missing values per column
print(df.isnull().sum(axis = 0))

print('-' * 40, '\n')


print(f"""The heuristic rules applies in this case because the missing
values represent only {rounded_rule}% of the observations!""")


In [None]:
# Remove every observation (row) that contains at least one missing value
df_dropped = df.dropna(axis = 0, how = 'any')

# Sanity check: count of missing values left in each column
print(df_dropped.isna().sum())

# Descriptive Statistics of all movies

In [None]:
# Summary statistics of the numeric columns, shown with two decimals
numeric_summary = df_dropped.describe(include = 'number')
numeric_summary.round(2)

# Distribution of Movies

In [None]:
# Average User Rating Distribution

fig, ax = plt.subplots(figsize = (8, 8))

# Histogram (Freedman-Diaconis bin rule) with a KDE overlay, drawn on `ax`
sns.histplot(data = df_dropped,
             x = 'average_user_rating',
             bins = 'fd',
             kde = True,
             color = 'black',
             ax = ax)

# title and labels
ax.set_title("""Distribution of movies by 
Average User Rating""")
ax.set_xlabel("Average User Rating")
ax.set_ylabel("# Movies")

# adding mean and median (vertical lines); each line carries its own label
# so the legend entry is tied to the right artist
ax.axvline(x = df_dropped['average_user_rating'].mean(),
           color = 'red',
           label = 'mean')
ax.axvline(x = df_dropped['average_user_rating'].median(),
           color = 'blue',
           label = 'median')

# legend(labels=[...]) without handles pairs the labels with the first
# artists on the axes (the histogram bars), not with the two lines above;
# a bare legend() only picks up the explicitly labelled artists.
ax.legend()

plt.show()

In [None]:
# IMDB Score Distribution

fig, ax = plt.subplots(figsize = (8, 8))

# Histogram (Freedman-Diaconis bin rule) with a KDE overlay, drawn on `ax`
sns.histplot(data = df_dropped,
             x = 'imdb_score',
             bins = 'fd',
             kde = True,
             color = 'green',
             ax = ax)

# title and labels
ax.set_title("Distribution of movies by IMDB Score\n")
ax.set_xlabel("IMDB Score")
ax.set_ylabel("# Movies")

# adding mean and median (vertical lines); each line carries its own label
# so the legend entry is tied to the right artist
ax.axvline(x = df_dropped['imdb_score'].mean(),
           color = 'red',
           label = 'mean')
ax.axvline(x = df_dropped['imdb_score'].median(),
           color = 'blue',
           label = 'median')

# legend(labels=[...]) without handles pairs the labels with the first
# artists on the axes (the histogram bars), not with the two lines above;
# a bare legend() only picks up the explicitly labelled artists.
ax.legend()

plt.show()


# Correlation Matrix

In [None]:
# Pearson correlation matrix of the cleaned dataset.
# Only numeric/boolean columns are correlated: the frame also holds text
# columns ('release_year' is read as str, 'genre' is text), and
# DataFrame.corr() raises on those in pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
numeric_columns = df_dropped.select_dtypes(include = ['number', 'bool'])
data_corr = numeric_columns.corr(method = 'pearson').round(decimals = 2)

fig, ax = plt.subplots(figsize = (8, 8))

# Annotated heatmap drawn on the axes created above
sns.heatmap(data = data_corr,
            cmap = 'Blues',
            square = True,
            annot = True,
            linecolor = 'black',
            linewidths = 0.5,
            ax = ax)

ax.set_title(label = 'Correlation of the Movies dataset\n')

plt.show()

In [None]:
# NOTE(review): interactive documentation lookup for seaborn's lmplot; the
# call's result is not assigned or used, so this looks like a development
# leftover that can be removed from the final notebook.
help(sns.lmplot)

In [None]:
# IMDB score (x) against average user rating (y): scatter plus fitted
# regression line on a wide (aspect 3) single-facet figure
sns.lmplot(data    = df_dropped,
           x       = 'imdb_score',
           y       = 'average_user_rating',
           scatter = True,
           fit_reg = True,
           aspect  = 3)

# Decorate the axes that lmplot has just made current
rating_axes = plt.gca()
rating_axes.set_title('IMDB and Average User Rating\n')
rating_axes.set_xlabel('IMDB Score')
rating_axes.set_ylabel('Average User Rating')
rating_axes.set_xlim(left = 0, right = 10)

plt.tight_layout(pad = 1)

# Render the figure
plt.show(block = True)


In [None]:
# Number of user votes (x) against gross earnings (y): scatter plus fitted
# regression line on a wide (aspect 3) single-facet figure
sns.lmplot(data    = df_dropped,
           x       = 'num_user_votes',
           y       = 'gross_earnings',
           scatter = True,
           fit_reg = True,
           aspect  = 3)

# Decorate the axes that lmplot has just made current
earnings_axes = plt.gca()
earnings_axes.set_title('Gross Earnings and Number of User Votes\n')
earnings_axes.set_xlabel('Number of User Votes')
earnings_axes.set_ylabel('Gross Earnings')

plt.tight_layout(pad = 1)

# Render the figure
plt.show(block = True)


# Descriptive Statistics of Sports Movies

In [None]:
# Keep only the titles whose 'genre' text contains "Sport".
# NOTE: the original analysis reported 10 matching movies - re-check this
# count whenever the underlying data changes.
is_sport_genre = df_dropped['genre'].str.contains('Sport')
df_sports = df_dropped[is_sport_genre]
df_sports

In [None]:
# Summary statistics of the Sports subset, shown with two decimals
sports_summary = df_sports.describe()
sports_summary.round(2)

# Correlation Matrix - Sports

In [None]:
# Pearson correlation matrix of the Sports subset.
# Only numeric/boolean columns are correlated: the frame also holds text
# columns ('release_year' is read as str, 'genre' is text), and
# DataFrame.corr() raises on those in pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
sports_numeric_columns = df_sports.select_dtypes(include = ['number', 'bool'])
corr_sports = sports_numeric_columns.corr(method = 'pearson').round(decimals = 2)

fig, ax = plt.subplots(figsize = (8, 8))

# Annotated heatmap drawn on the axes created above
sns.heatmap(data = corr_sports,
            cmap = 'coolwarm',
            square = True,
            annot = True,
            linecolor = 'red',
            linewidths = 0.5,
            ax = ax)

ax.set_title(label = 'Correlation of the Sports Movie\n')


plt.show()

In [None]:
# 4. Qualitative aspects: values? true stories? impact in real life?

In [None]:
# NOTE(review): second interactive documentation lookup for seaborn's lmplot;
# the call's result is not assigned or used, so this looks like a development
# leftover that can be removed from the final notebook.
help(sns.lmplot)

# Analysis of Sports Movies Genre in comparison with Netflix movie Database

# Introduction

Everyone loves a good story. Who does not like to listen to a friend talk about their life or an accomplishment? Especially in the USA, many of those accomplishments are related to sports, whether in high school, in college, or at the professional level. The movie industry is certainly taking advantage of this by producing amazing movies with a sports background that are capturing the audience's attention, according to the Netflix database presented in this notebook. Are you ready to go deeper into this world?

# Insights

After a thorough analysis of the movie database and the Sport genre, three points caught my attention:
1. The Sport genre generates more Gross Earnings, in proportion to its Budget, than the movie database as a whole.
2. Sports movies have higher ratings (IMDB Score and User Ratings).
3. Real-life stories - sports movies based on biographies - have higher rating scores than other genres.

In [None]:
# Insight 1 - Sports movies generate more earnings per budget dollar.

# Sports subset: plain (unrounded) averages
avg_budget_sports = df_sports['budget'].mean()
avg_gross_earn_sports = df_sports['gross_earnings'].mean()

# Whole cleaned dataset: averages rounded to two decimals
avg_budget_movies = round(df_dropped['budget'].mean(), 2)
avg_gross_earn_movies = round(df_dropped['gross_earnings'].mean(), 2)

# Average gross earnings per unit of average budget, for each group
gross_budget_ratio_sports = round(avg_gross_earn_sports / avg_budget_sports, 2)
gross_budget_ratio_movies = round(avg_gross_earn_movies / avg_budget_movies, 2)

separator = '-' * 40
print(separator, '\n')

print(f"The ratio between Gross Earnings and Budget for sports movies: {gross_budget_ratio_sports}\n"
      f"The ratio between Gross Earnings and Budget for all movies: {gross_budget_ratio_movies}\n")



# Insight #1 - Explanation

It is interesting to notice that sports movies have smaller budgets than the average budget of all movies on Netflix. But that does not mean that sports movies do not make money in the end. In fact, it is quite the opposite. The average gross earnings of sports movies exceed their average budget by 66%! That means that for every dollar spent on production, a sports movie generates \$1.66 in total revenue. Movies overall generate only \$1.04 per dollar spent, i.e. only 4% more than their initial budget.

We can definitely say that sports movies are much more cost-efficient, generating more money per dollar spent than all the other movies. The sports genre is an interesting option to invest in when deciding to produce a new movie!

In [None]:
# Insight 2 - Sports movies receive higher IMDB scores and user ratings.

# Sports subset averages (kept unrounded; rounded only when printed)
avg_imdb_sports = df_sports['imdb_score'].mean()
avg_avg_user_sports = df_sports['average_user_rating'].mean()

# Whole cleaned dataset averages, rounded to two decimals
avg_imdb_movies = round(df_dropped['imdb_score'].mean(), ndigits = 2)
avg_user_movies = round(df_dropped['average_user_rating'].mean(), ndigits = 2)

print('-' * 40, '\n')

# Both figures are shown with two decimals so the sports and all-movies
# values are directly comparable in the printed report.
print(f"""The average IMDB Score for sports movies: {round(avg_imdb_sports, ndigits = 2)}
The average IMDB Score for all movies: {avg_imdb_movies}
\n""")

print('-' * 40, '\n')

print(f"""The average User Rating Score for sports movies: {round(avg_avg_user_sports, ndigits = 2)}
The average User Rating Score for all movies: {avg_user_movies}
\n""")


# Insight #2 and #3 - Explanation

Sports movies are a trend, and people are talking about them. They are becoming more and more popular and are part of American society and culture. As Bonnet (2017) put it, "Nowadays, the belief in sports’ social role is still lasting: it is thought to bolster the teaching of and respect for values which are essential to Americanness, such as competitiveness, equal opportunities, or social mobility."

Firestein (2007) defined: "Sports are part of the very fabric of American life, discourse, and lexicon [...]. The centrality of sports in American life is amply reflected in contemporary American cinema, (with) films featuring virtually every major sport, from football, basketball, baseball, and hockey, to boxing, horse racing, and even surfing."

These observations help explain why sports movies have higher scores, both on the IMDB website and in User Ratings. When we look at the numbers, sports movies average an IMDB score of 6.94 (out of 10), while the average of all movies is only 6.45. Users also rate sports movies better on average: 7.07, whereas the average of all movies is 6.41.

Not only that: the numbers also confirm the preference for real-life stories, as the sports movies based on biographies have higher IMDB Scores and User Ratings than the other sub-genres of sports and than the average of all movies! Take, for example, the movie "42", a Biography/Sport movie released in the US in 2013. The movie is about baseball player Jackie Robinson, who was the first black athlete to play in Major League Baseball in the modern era (he debuted in 1947). This by itself brings up a lot of social themes for debate, especially in a context where we are still battling racial issues in the 21st century, more than 70 years later.

This movie has a rating of 7.5 at both IMDB and User Ratings, compared to 6.94 (Sports IMDB average) and 7.07 (Sports User Rating average).




# References:
    
Firestein, D. J. “Fields of Dreams: American Sports Movies.” E journal USA, 12 (6), 2007, 9.

Bonnet, V. "Sport in Films: Symbolism versus Verismo. A France-United States Comparative Analysis", InMedia [En ligne], 6 | 2017, mis en ligne le 18 décembre 2017, consulté le 01 juin 2022. URL : http://journals.openedition.org/inmedia/883 ; DOI : https://doi.org/10.4000/inmedia.883

https://en.wikipedia.org/wiki/42_(film)