In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 

In [None]:
# List the files available in the zippedData/ directory (IPython shell escape).
!ls zippedData/

In [None]:
# What is the correlation between production budget
# and worldwide gross?

In [None]:
# Load the movie budgets CSV into TN_data. It supplies the production_budget and
# worldwide_gross columns that are cleaned and compared in the cells below.
TN_data = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')

In [None]:
# Display the frame to eyeball the raw rows.
TN_data

In [None]:
# Column names, dtypes and non-null counts before any cleaning.
TN_data.info()

In [None]:
# Summary statistics (describe() defaults to the numeric columns).
TN_data.describe()

In [None]:
# production_budget and worldwide_gross are read as strings with a dollar sign and
# thousands separators. Strip that formatting and store both as 64-bit integers so they
# can be plotted and compared numerically.
def currency_to_int(column):
    """Convert a Series of dollar-formatted strings to int64.

    Parameters
    ----------
    column : pandas.Series
        Strings containing '$' and ',' characters, or an already-numeric Series.

    Returns
    -------
    pandas.Series
        The same values with dtype int64.

    Notes
    -----
    Both replacements are literal (regex=False). The default of ``regex`` differs
    between pandas versions, and as a regular expression '$' is the end-of-string
    anchor rather than a dollar sign.
    A column that is already numeric is only cast, so this cell can be re-run safely.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype('int64')
    return (
        column.str.replace('$', '', regex=False)
              .str.replace(',', '', regex=False)
              .astype('int64')
    )


for money_column in ('production_budget', 'worldwide_gross'):
    TN_data[money_column] = currency_to_int(TN_data[money_column])

In [None]:
# Confirm that production_budget and worldwide_gross are now integer columns.
TN_data.info()

In [None]:
# Worldwide gross values as a plain Python list.
# NOTE(review): the scatter plot below reads the DataFrame columns directly, and x is
# reassigned in the genre chart cell, so this list appears to be unused.
x = list(TN_data['worldwide_gross'].values)

In [None]:
# Production budget values as a plain Python list (same caveat as x: appears unused).
y = list(TN_data['production_budget'].values)

In [None]:
# NOTE(review): no figure has been created yet in the cells shown, so this call
# presumably displays nothing.
plt.show()

In [None]:
# Scatter plot of production budget (x axis) against worldwide gross (y axis),
# one point per movie.
# Both columns hold raw dollar amounts; divide by 1e9 so the tick values really are in
# billions, as the axis labels state.
DOLLARS_PER_BILLION = 1e9
budget_billions = TN_data['production_budget'] / DOLLARS_PER_BILLION
gross_billions = TN_data['worldwide_gross'] / DOLLARS_PER_BILLION

fig, ax = plt.subplots(figsize=(20, 10))
# The budget goes on x and the gross on y so the data matches the axis labels below.
ax.scatter(budget_billions, gross_billions, color='blue',
           alpha=.6, edgecolor='black', linewidth=.5, s=100)

ax.set_title('Production Budget/Worldwide Gross Correlation', fontsize=35)
ax.set_xlabel('Production Budget (Billions $)', fontsize=30)
ax.set_ylabel('Worldwide Gross (Billions $)', fontsize=30)
ax.tick_params(axis='both', labelsize=20)

# Opaque white background so the saved PNG is readable on any page colour.
fig.patch.set_facecolor('white')
fig.patch.set_alpha(1)

fig.savefig('production_budget_worldwide_gross_correlation.png',
            facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Question #3: What is the most popular genre? (genre vs. popularity)

In [None]:
# Load the TMDB movies file, which provides the genre_ids and popularity columns
# used for the genre ranking below.
TMDB_data = pd.read_csv("zippedData/tmdb.movies.csv.gz")

In [None]:
# Display the frame to see which columns are available.
TMDB_data

In [None]:
# Column names, dtypes and non-null counts.
TMDB_data.info()

In [None]:
# Summary statistics (describe() defaults to the numeric columns).
TMDB_data.describe()

In [None]:
# genre_ids is read from the CSV as text: a bracketed, comma-separated list of numeric ids.
# Pull the ids out into a list of strings per movie so the column can be exploded into
# one row per genre.
# findall returns an empty list for "[]", so a movie with no genres becomes a missing value
# after explode (and drops out of the groupby) instead of forming a bogus "" genre.
# An explicit pattern also avoids relying on str.replace's version-dependent `regex` default.
TMDB_data["genre_ids"] = TMDB_data["genre_ids"].str.findall(r"\d+")

In [None]:
# One row per (movie, genre id) pair: each element of a movie's genre list gets its own row.
TMDB_genre = TMDB_data.explode("genre_ids")

In [None]:
# Average popularity per genre id, keeping the ten genres with the highest average.
mean_popularity_by_genre = TMDB_genre.groupby('genre_ids')['popularity'].mean()
genres = mean_popularity_by_genre.sort_values(ascending=False).head(10)

In [None]:
# Display names for the ten genres ranked above, looked up from the genre codes on the
# TMDB website and listed in the same order as the ranking.
lst_of_genres = [
    'Adventure',
    'Action',
    'Fantasy',
    'Crime',
    'War',
    'Science Fiction',
    'Thriller',
    'Mystery',
    'Family',
    'Animation',
]

In [None]:
# Replace each TMDB genre id in the index with its display name.
# The names are looked up per id rather than assigned by position, so the labels stay
# correct if the ranking changes or fewer than ten genres are present.
# Ids without an entry (and labels that are already names, on a re-run) are left as they are.
TMDB_GENRE_NAMES = {
    '28': 'Action',
    '12': 'Adventure',
    '16': 'Animation',
    '35': 'Comedy',
    '80': 'Crime',
    '99': 'Documentary',
    '18': 'Drama',
    '10751': 'Family',
    '14': 'Fantasy',
    '36': 'History',
    '27': 'Horror',
    '10402': 'Music',
    '9648': 'Mystery',
    '10749': 'Romance',
    '878': 'Science Fiction',
    '10770': 'TV Movie',
    '53': 'Thriller',
    '10752': 'War',
    '37': 'Western',
}
genres.index = [TMDB_GENRE_NAMES.get(str(genre_id), genre_id) for genre_id in genres.index]
genres.index

In [None]:
# Bar chart of the average popularity of the ten most popular genres.
# The chart gets its own figure and axes: styling and saving must act on this figure,
# not on the `fig` left over from the scatter-plot cell.
genre_fig, genre_ax = plt.subplots(figsize=(10, 6))

# Series.plot.bar takes the bar labels from the index and the heights from the values;
# fontsize is passed here so it applies to the tick labels pandas creates.
genres.plot.bar(ax=genre_ax, rot=70, fontsize=15)
genre_ax.set_title("Popularity by Genre (over 8 years)", fontsize=25)
genre_ax.set_ylabel("AVG Popularity Rating", fontsize=20)

# Opaque white background so the saved PNG is readable on any page colour.
genre_fig.patch.set_facecolor('white')
genre_fig.patch.set_alpha(1)

genre_fig.savefig('popularity_by_genre.png', facecolor=genre_fig.get_facecolor(),
                  edgecolor='none', bbox_inches="tight")

In [None]:
# What is the correlation between popularity and release date? 

In [None]:
# Re-read the TMDB file with release_date parsed as datetimes so the release month can be
# extracted. This replaces the earlier TMDB_data, including its split genre_ids column.
TMDB_data = pd.read_csv("zippedData/tmdb.movies.csv.gz", parse_dates = ['release_date'])

In [None]:
# Display the reloaded frame.
TMDB_data

In [None]:
# Check that release_date values were parsed into timestamps rather than left as strings.
type(TMDB_data['release_date'][0])

In [None]:
# Show the rows ordered from highest to lowest vote count.
# The sorted frame is only displayed; TMDB_data itself is not reordered.
TMDB_data.sort_values('vote_count', ascending = False)

In [None]:
# Column names, dtypes and non-null counts after the reload.
TMDB_data.info()

In [None]:
# Add a month column holding the month number of release_date, so popularity can be
# averaged per release month.
TMDB_data['month'] = TMDB_data['release_date'].dt.month

In [None]:
# Average popularity for each release month, ranked from the most to the least popular month.
monthly_mean_popularity = TMDB_data.groupby('month')['popularity'].mean()
popularity_by_month = monthly_mean_popularity.sort_values(ascending=False)

In [None]:
# Turn the month index into a regular column, giving a frame with month and popularity columns.
popularity_by_month = popularity_by_month.reset_index()

In [None]:
# Months ranked from highest to lowest average popularity.
popularity_by_month

In [None]:
# Re-sort into calendar order so the rows line up with the month labels used for the chart.
popularity_by_month = popularity_by_month.sort_values("month")

In [None]:
# Month names shown on the x axis in place of the month numbers, in calendar order.
month_labels = [
    'Jan', 'Feb', 'Mar', 'Apr',
    'May', 'June', 'July', 'Aug',
    'Sept', 'Oct', 'Nov', 'Dec',
]

In [None]:
# Bar chart of average popularity by release month.
plot_one_fig, plot_one_ax = plt.subplots(figsize=(10, 6))

# One green bar per month, positioned at the month number.
month_numbers = popularity_by_month['month']
plot_one_ax.bar(month_numbers, popularity_by_month['popularity'], color='green')

# Title and axis labels.
plot_one_ax.set_title('Best Month to Release a Movie', fontsize=28)
plot_one_ax.set_xlabel('Month Released', fontsize=25)
plot_one_ax.set_ylabel('Avg. Popularity', fontsize=25)

# Show month names instead of month numbers, and enlarge the tick text on both axes.
plot_one_ax.set_xticks(month_numbers)
plot_one_ax.set_xticklabels(month_labels, fontsize=17)
plot_one_ax.tick_params(axis='y', labelsize=17)

# Opaque white background for the saved image.
plot_one_fig.patch.set_facecolor('white')
plot_one_fig.patch.set_alpha(1)

plot_one_fig.savefig('popularity_by_month.png',
                     facecolor=plot_one_fig.get_facecolor(), edgecolor='none')

