In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

From the data of highest grossing films, data analysis is performed to discern the effect of multiple factors on the worldwide gross of films. 

Please comment with suggestions for improvements in this project, if required.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Now, we'll create a DataFrame using our data file.

In [None]:
# Location of the blockbusters dataset inside the Kaggle input directory.
blockbusters_csv = "/kaggle/input/top-10-highest-grossing-films-19752018/blockbusters.csv"

# Load the raw table and preview the first rows.
data = pd.read_csv(blockbusters_csv)
data.head()

1. Which main_genre has collectively made the highest amount of money? Which genre made the highest average amount of money?

First, we'll change the worldwide gross from object type to float type

In [None]:
# Turn currency strings such as "$1,234,567.00" into floats.
# regex=False makes both replacements literal: '$' is a regex end-of-string
# anchor, and the default value of `regex` differs between pandas versions,
# so relying on the default is not portable.
# astype(str) keeps the cell re-runnable: once the column is already float,
# the .str accessor would otherwise raise on a second execution.
data['worldwide_gross'] = (
    data['worldwide_gross']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
data.head()

Now, let's fix the dataframe by filling the missing values.

In [None]:
# Number of missing entries in each column.
data.isnull().sum()

There are 29 & 141 missing values in Genre_2 and Genre_3 respectively. Maybe the films had only one or no sub genres. Let's fix that.

In [None]:
# Fill only the optional sub-genre columns. A blanket fillna would also write
# the text 'No Sub Genre' into any other column that happens to contain a gap
# (e.g. a numeric or rating column), silently corrupting it.
data = data.fillna({'Genre_2': 'No Sub Genre', 'Genre_3': 'No Sub Genre'})

In [None]:
# Preview the first five rows after filling the missing sub genres.
data.head(5)

Now, we'll be grouping the Main_Genre and finding the sum of worldwide gross.

In [None]:
# Total worldwide gross per main genre.
# The column is selected BEFORE aggregating so that only 'worldwide_gross' is
# summed; summing the whole frame also aggregates every text column, which is
# wasteful and version-dependent in pandas.
# The rows stay sorted by genre name (descending) because a later cell pairs
# these totals with the per-genre film counts.
highest_grossing = (
    data.groupby('Main_Genre')['worldwide_gross']
    .sum()
    .reset_index()
    .sort_values(by='Main_Genre', ascending=False)
)
display(highest_grossing)
print("Highest Grossing Main Genre is: ")
# Boolean filter (rather than idxmax) so that tied genres are all shown.
display(highest_grossing[highest_grossing.worldwide_gross == highest_grossing.worldwide_gross.max()])

Fantasy collectively made the highest amount of money. But how many films does each genre contain?

In [None]:
# Number of titles in each main genre, ordered by genre name (descending).
count_title = (
    data.groupby('Main_Genre')['title']
    .count()
    .sort_index(ascending=False)
)
count_title

Creating numpy array in order to calculate the average gross in each genre.

In [None]:
# Average gross per film = total gross of the genre / number of films in it.
# The totals are re-ordered by genre LABEL to match count_title before the
# division. Dividing two arrays purely by position would silently pair the
# wrong genres if either table were ever sorted differently.
gross_by_genre = (
    highest_grossing
    .set_index('Main_Genre')['worldwide_gross']
    .reindex(count_title.index)
)
np_highest_grossing = gross_by_genre.to_numpy()
np_count_title = count_title.to_numpy()
np_index = count_title.index.to_numpy()
np_average_amount_per_genre = np_highest_grossing / np_count_title

Average Amount Grossed for each Main Genre

In [None]:
# Build the summary table directly from the two typed arrays.
# Stacking genre names and averages into one numpy array forces a common
# object dtype, so the averages would no longer be a float column.
result = pd.DataFrame({
    'Genre': np_index,
    'Average Amount Grossed': np_average_amount_per_genre,
})
result

In [None]:
# Row(s) of the genre with the largest average gross per film.
top_average = result['Average Amount Grossed'].max()
result.loc[result['Average Amount Grossed'] == top_average]

Highest Average Amount was grossed by Fantasy. Let's visualize our findings for a better understanding. First, we'll create a plot of Film Genres with the sum of their worldwide gross.

In [None]:
sns.set_style("darkgrid")
sns.set_palette("PRGn")
# catplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind and the
# requested size is ignored. The 20x7 size is expressed via height/aspect.
gross_sum_grid = sns.catplot(
    x="Main_Genre",
    y="worldwide_gross",
    data=data,
    kind="bar",
    estimator=sum,
    ci=None,
    height=7,
    aspect=20 / 7,
)
gross_sum_grid.ax.tick_params(axis='x', labelrotation=90)
gross_sum_grid.ax.set_title('Film Genres with their worldwide gross sum')
gross_sum_grid.set_axis_labels('Main Genre', 'Worldwide Gross Sum');

Fantasy has the highest-grossing films. Next, we'll find the count of films in each genre.

In [None]:
sns.set_style("darkgrid")
# catplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind and the
# requested size is ignored. The 20x7 size is expressed via height/aspect.
count_grid = sns.catplot(
    x="Main_Genre",
    data=data,
    kind="count",
    height=7,
    aspect=20 / 7,
)
count_grid.ax.tick_params(axis='x', labelrotation=90)
count_grid.ax.set_title('Count of Films in each Genre')
count_grid.set_axis_labels('Main Genre', 'No. of Films');

The numbers of films in Sci-Fi, Romance, Fantasy and Comedy are roughly similar. Thriller has a higher number of films. Now let's see their average worldwide gross.

In [None]:
sns.set_style("darkgrid")
# catplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind and the
# requested size is ignored. The 20x7 size is expressed via height/aspect.
average_grid = sns.catplot(
    x='Genre',
    y='Average Amount Grossed',
    data=result,
    kind='bar',
    height=7,
    aspect=20 / 7,
)
average_grid.ax.tick_params(axis='x', labelrotation=90)
average_grid.ax.set_title('Average Amount grossed per film')
average_grid.set_axis_labels('Main Genre', 'Average Amount Grossed per film');

We see that Fantasy films are the top earners: even though the genre contains a large number of films, its average worldwide gross per film remains the highest. This suggests that Fantasy films are among the most watched.

2. Do the IMDB Ratings of the film affect its worldwide gross?

In [None]:
# Keep only the columns needed for the rating analysis, ordered by IMDB rating.
film_columns = ['Main_Genre', 'title', 'imdb_rating', 'worldwide_gross']
film = data[film_columns].sort_values('imdb_rating')
film

In [None]:
# Line plot of worldwide gross against IMDB rating on an explicit 10x3 figure.
rating_fig = plt.figure(figsize=(10, 3))
sns.set_style("darkgrid")
rating_axes = rating_fig.add_subplot(1, 1, 1)
g = sns.lineplot(data=film, x='imdb_rating', y='worldwide_gross', ci=None, ax=rating_axes)
g.set_title("IMDB Ratings vs Worldwide Gross")
g.set(xlabel="IMDB Rating", ylabel="Worldwide Gross")

From this graph, we don't see a well-defined relationship between IMDB ratings and worldwide gross, as the gross varies greatly. Some films with a rating of less than 5 grossed more than films with a rating between 5 and 8. Let's look at a joint plot to get some clarity.

In [None]:
sns.set(style="ticks")
# jointplot is a figure-level function: it builds its own (square) figure, so
# a preceding plt.figure(figsize=...) only leaves an empty figure behind.
# The figure size is controlled through `height` instead.
rating_joint = sns.jointplot(
    x=film.imdb_rating,
    y=film.worldwide_gross,
    kind='reg',
    color='#4CB391',
    height=7,
)
rating_joint.set_axis_labels("IMDB Ratings", "Worldwide Gross");

We see a positive regression line, which means that as the IMDB rating of a film increases, its worldwide gross also tends to increase.

3. Which rating has the highest average gross?

In [None]:
# Worldwide gross per (main genre, rating) cell using pivot_table's default
# aggregation; empty cells become 0 and the 'All' margins are appended.
data_pivot_table = pd.pivot_table(
    data,
    values='worldwide_gross',
    index='Main_Genre',
    columns='rating',
    fill_value=0,
    margins=True,
)
data_pivot_table

In [None]:
# The 'All' margin row of the pivot table: one value per rating column.
max_avg_gross = data_pivot_table.loc['All', :]
# Keep only the entry (or entries) equal to the largest value in that row.
max_avg_gross.loc[max_avg_gross.eq(max_avg_gross.max())]

In [None]:
# Bar chart of worldwide gross per rating category on an explicit 5x5 figure.
rating_bar_fig = plt.figure(figsize=(5, 5))
sns.set_style("darkgrid")
sns.set_palette("PRGn")
rating_bar_ax = sns.barplot(
    data=data,
    x='rating',
    y='worldwide_gross',
    ci=None,
    ax=rating_bar_fig.add_subplot(1, 1, 1),
)
rating_bar_ax.set_xlabel('Rating')
rating_bar_ax.set_ylabel('Worldwide Gross')
rating_bar_ax.set_title('Rating vs. Worldwide Gross')

Hence, we see that, on average, films rated PG-13 tend to make more money than films with other ratings.