In [6]:
import os
from math import pi
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
%matplotlib inline
#set directory to ZippedData
#os.chdir(r"zippedData")

In [None]:
#load files
movie_budget=pd.read_csv("tn.movie_budgets.csv.gz") #budget and gross revenue of movies
spy_monthly=pd.read_csv("SPY.csv") #stock market monthly returns
GDP=pd.read_csv('GDP_DEF.csv') #GDP deflator to adjust numbers by inflation

In [None]:
#dealing with dates and trim off 2019
movie_budget.rename(columns={"movie":"title"}, inplace=True)
movie_budget['release_date']=pd.to_datetime(movie_budget['release_date'])
movie_budget['view_date']=movie_budget.release_date+pd.offsets.MonthBegin(1)
movie_budget['year']=movie_budget['view_date'].dt.year
movie_budget=movie_budget[movie_budget['year']<=2018]

In [None]:
#spy_monthly contains dates only beyond 2000, by default the merged file has a time frame between 2000 to 2018
spy_monthly['view_date']=pd.to_datetime(spy_monthly['Date'])
spy_monthly.sort_values(["view_date"],inplace=True)
spy_monthly['percent_return_month']=(spy_monthly['Adj Close'].shift(-1)-spy_monthly['Adj Close'])/spy_monthly['Adj Close']*100
spy_monthly['percent_return_rolling']=spy_monthly.percent_return_month.rolling(2).mean()
spy_monthly['market_condition']=pd.cut(spy_monthly['percent_return_rolling'], bins=3, labels=['Low','Medium','High'])
movie_budget=movie_budget.merge(spy_monthly[['view_date','percent_return_month','percent_return_rolling',"market_condition"]], on="view_date")
movie_budget=movie_budget.merge(GDP, on='year')

In [None]:
to_replace_list=["domestic_gross","worldwide_gross","production_budget"]
for i in to_replace_list:
    movie_budget[i]=movie_budget[i].str.replace(r"$","")
    movie_budget[i]=movie_budget[i].str.replace(r",","")

In [None]:
for i in to_replace_list:
    movie_budget[i]=movie_budget[i].astype("int64")/movie_budget['nondurable_goods_expenditure']*100

In [None]:
movie_budget['domestic_profit']=(movie_budget['domestic_gross']-movie_budget['production_budget'])
movie_budget['foreign_profit']=(movie_budget['worldwide_gross']-movie_budget['domestic_gross'])
movie_budget['profit_margin']=(movie_budget['worldwide_gross']-movie_budget['production_budget'])/(movie_budget['production_budget'])*100
movie_budget['domestic_margin']=(movie_budget['domestic_profit'])/(movie_budget['production_budget'])*100
movie_budget['foreign_margin']=(movie_budget['foreign_profit'])/(movie_budget['production_budget'])*100

In [None]:
movie_budget['production_budget_rank']=movie_budget.groupby(['year'])['production_budget'].rank(pct=True)
movie_budget['budget_size']=pd.cut(movie_budget['production_budget_rank'], bins=2, labels=["Independent","Mass Production"])
movie_budget['count']=movie_budget.groupby(['view_date'])['title'].transform("count")
movie_budget['competition_size']=pd.cut(movie_budget['count'], bins=4, \
                                   labels=['Low','Moderately Low','Moderately High','High'])

In [None]:
movie_budget.hist('count',by="budget_size")
movie_budget.to_pickle(r"..//merged_file.pickle")

In [7]:
movie_budget=pickle.load(open("merged_file.pickle","rb"))

FileNotFoundError: [Errno 2] No such file or directory: 'merged_file.pickle'

In [None]:
plt.clf()
plt.figure(figsize=(18, 12))
sns.boxplot('competition_size','profit_margin', data=movie_budget, hue="budget_size", showfliers=False)
plt.xlabel("Competition Level", size=20)
plt.ylabel("Profitability", size=20)
plt.legend(fontsize="xx-large")
plt.title("Competition and Profitability for Independent vs Mass Production Movies", size=30)
plt.savefig("..\\competition and profitability.png")

In [None]:
plt.clf()
plt.figure(figsize=(18, 12))
sns.boxplot('market_condition','domestic_margin', data=movie_budget, hue="budget_size", showfliers=False)
plt.xlabel("Market Condition", size=20)
plt.ylabel("Profitability", size=20)
plt.legend(fontsize="xx-large")
plt.title("Market Condition and Profitability for Independent vs Mass Production Movies", size=30)
plt.savefig("..\\Market and profitability.png")

In [None]:
movie_budget['month']=movie_budget.view_date.dt.strftime("%B")
movie_budget['month_num']=movie_budget.view_date.dt.month
movie_budget_agg=movie_budget.groupby("month").\
        agg({"title":"count","profit_margin":"mean","month_num":"min"}).reset_index()
movie_budget_agg=movie_budget_agg.sort_values("month_num")

In [None]:
movie_budget_agg

In [None]:
plt.clf()
plt.figure(figsize=(18,12))
bottom = 8
max_height = 4
N=12
theta = [n / float(N) * 2 * pi for n in range(N)]
width = (2*pi) / N

radii=movie_budget_agg["title"].to_list()
radiii=movie_budget_agg["profit_margin"].to_list()
ax = plt.subplot(111, polar=True)

bars = ax.bar(theta, radii, width=width, bottom=bottom, label='# of Movie Release')
ax.plot(theta, radiii, linewidth=2, linestyle='solid',color="orange", label='Average Movie Profitability')
ax.plot([theta[-1],theta[0]], [radiii[-1],radiii[0]], linewidth=2, linestyle='solid', color="orange")
plt.xticks(theta, movie_budget_agg["month"], color='grey', size=20)
plt.fill(theta, radiii, 'b', alpha=0.1)

# Use custom colors and opacity
for r, bar in zip(radii, bars):
    bar.set_facecolor(plt.cm.jet(r / 10.))
    bar.set_alpha(0.7)

ax.legend(fontsize="xx-large")
plt.title("Number of Movie Release vs. Profitability", size=30)
plt.savefig("..//release_profitability_month.png")