# Exploratory Data Analysis (EDA) + some additional cleaning & feature engineering

## Imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import scipy.linalg
import seaborn as sns
# Widen pandas' display limits so large frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 100),
    ('display.width', 100),
):
    pd.set_option(option_name, option_value)

In [None]:
# Print floats in fixed-point form (no scientific notation).
# The fully qualified option key is used instead of the abbreviated
# 'float_format' pattern, which only works while it matches exactly one option.
pd.set_option('display.float_format', '{:f}'.format)

In [None]:
# Query the current working directory. The return value is not stored, and the
# trailing semicolon keeps the notebook from echoing it, so this cell has no
# visible effect as written.
os.getcwd();

In [None]:
# Load the cleaned film data (v1 CSV) from its hard-coded local path.
df = pd.read_csv(
    filepath_or_buffer='/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/clean_data_v1.csv',
)

In [None]:
# Remove the superfluous index column that came along with the CSV.
df = df.drop(columns=["Unnamed: 0"])

In [None]:
# Convert release_date from object dtype to datetime.
df['release_date'] = df['release_date'].pipe(pd.to_datetime)

In [None]:
# How many movies were profitable?
# Answer: 2525 out of 3600 were profitable, so the data is somewhat imbalanced.
df.loc[df['made_money'].eq(1)].count();

## Check for correlated variables (collinearity) & correlations with profit/made_money (predictors of profitability)

#### Some collinearity here: I'll address this with regularization when modeling, or by using just one of the correlated features from each "group" of correlated features

In [None]:
# Pairwise Pearson correlations between the numeric (and boolean) columns.
# Non-numeric columns (this frame carries at least a datetime release_date and
# what appear to be text columns such as title/genre) are excluded explicitly:
# pandas 2.x no longer drops them silently in DataFrame.corr and raises instead.
df.select_dtypes(include=['number', 'bool']).corr(method='pearson')

## Plot numerical cols (x-axes) against profits (y-axis, in $)

#### Looks like weak correlations in general, but not non-existent. Hopefully enough for a useful model later...

In [None]:
# Numeric columns to plot against profit: 30 names, one per panel of the
# 6 x 5 grid created below.
num_cols = [
    'budget', 'revenue', 'runtime', 'profit', 'made_money', 'releases',
    'cast_rev', 'cast_prof', 'cast_films', 'cast_prof_films',
    'dir_rev', 'dir_prof', 'dir_films', 'dir_prof_films',
    'writ_rev', 'writ_prof', 'writ_films', 'writ_prof_films',
    'compet_cast_rev', 'compet_cast_prof', 'compet_cast_films', 'compet_cast_prof_films',
    'compet_dir_rev', 'compet_dir_prof', 'compet_dir_films', 'compet_dir_prof_films',
    'compet_writ_rev', 'compet_writ_prof', 'compet_writ_films', 'compet_writ_prof_films',
]

fig, axs = plt.subplots(nrows=6, ncols=5, figsize=(20, 25))

# One scatter panel per column: column values on x, profit on y.
for ax, col in zip(axs.flat, num_cols):
    ax.scatter(df[col], df['profit'])
    ax.set_xlabel(col)

# Can save the resulting image / zoom in for visibility.
fig.tight_layout()

## Percentage of profitable movies by month

In [None]:
# Release month as a zero-padded two-digit string ('%m'); it is used as the
# grouping key below and only cast to int in a later cell.
df['month'] = df['release_date'].dt.strftime('%m')

In [None]:
# Keep only the profitable films (made_money == 1).
df_prof = df.loc[df['made_money'].eq(1)]

In [None]:
# Per-month non-null counts for every column, profitable films only.
grouped = df_prof.groupby('month').agg('count')

In [None]:
# Per-month counts over all films; the 'id' count serves as the denominator.
grouped_all = df.groupby('month').count()

# Expose the month index as a regular column so it can be used as the plot's x.
grouped['month'] = grouped.index
grouped['count'] = grouped_all['id']
# Share of each month's films that were profitable.
grouped['pct_prof'] = grouped['made_money'].div(grouped['count'])

#### Based on this plot & background research, it looks like there are -- roughly speaking -- four main seasons, for which I'll make dummy vars below: Jan-Apr (so-so movies), May-Jul (summer blockbusters), Aug-Oct (more so-so), and Nov-Dec (major Thanksgiving/Xmas movies)

In [None]:
# Bar chart: share of profitable films per release month.
grouped.plot(kind='bar', x='month', y='pct_prof')
plt.show()

## Percentage of profitable movies by genre

In [None]:
# Per-genre non-null counts for every column, profitable films only.
grouped2 = df_prof.groupby('genre').agg('count')

In [None]:
# Per-genre counts over all films; the 'id' count serves as the denominator.
grouped_all2 = df.groupby('genre').count()

# Expose the genre index as a regular column so it can be used as the plot's x.
grouped2['genre'] = grouped2.index
grouped2['count'] = grouped_all2['id']
# Share of each genre's films that were profitable.
grouped2['pct_prof'] = grouped2['made_money'].div(grouped2['count'])

#### Since it appears genre matters to profitability, will make dummy vars for genre

In [None]:
# Bar chart: share of profitable films per genre.
grouped2.plot(kind='bar', x='genre', y='pct_prof')
plt.show()

## What decades are represented in the data?

In [None]:
# Release year as a four-digit string ('%Y'); cast to int in the next cell.
df['year'] = df['release_date'].dt.strftime('%Y')

In [None]:
# month and year were produced as strings by strftime; make both integers.
df = df.astype(dict.fromkeys(("month", "year"), int))

In [None]:
# Round each year down to the start of its decade (e.g. 1997 -> 1990).
df['decade'] = df['year'].floordiv(10).mul(10)

In [None]:
# Per-decade non-null counts for every column.
grouped3 = df.groupby('decade').agg('count')

In [None]:
# Copy the decade index into a regular column so the bar plot can use it as x.
grouped3['decade'] = grouped3.index

In [None]:
# Bar chart: number of films (non-null titles) per decade.
grouped3.plot(kind='bar', x='decade', y='title')
plt.show()

In [None]:
# Make sure the decade column is stored as an integer.
df = df.astype(dict(decade=int))

## Relationship between director/writer success and profitability

In [None]:
# Raw arrays for the 3-D plots: x = writer past profits, y = director past
# profits, z = profit of the film itself.
x, y, z = (df[name].values for name in ('writ_prof', 'dir_prof', 'profit'))

In [None]:
# 3-D scatter of writer/director past profits against the film's own profit.
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(x, y, z)

# Label all three axes in bold.
for set_label, text in (
    (ax.set_xlabel, 'Writ past profits'),
    (ax.set_ylabel, 'Dir past profits'),
    (ax.set_zlabel, 'Current film profit'),
):
    set_label(text, fontweight='bold')

plt.show()

#### Trying to fit a best-fit plane through these points, following the approach in: https://gist.github.com/amroamroamro/1db8d69b4b65e8bc66a6

In [None]:
# One row per film, columns (x, y, z), as expected by the surface fit below.
data = np.stack([x, y, z], axis=1)

In [None]:
# Least-squares fit of a surface z = f(x, y) to the rows of `data`
# (columns 0, 1, 2 = x, y, z), followed by a 3-D plot of the fitted surface
# together with the raw points.

# Regular grid covering the domain of the data. The bounds are read from the
# data instead of being hard-coded, and a fixed, small number of points per
# axis is used: the surface is smooth, so a coarse grid draws the same shape
# while keeping plot_surface fast regardless of how wide the value range is.
grid_points = 25
X, Y = np.meshgrid(
    np.linspace(data[:, 0].min(), data[:, 0].max(), grid_points),
    np.linspace(data[:, 1].min(), data[:, 1].max(), grid_points),
)
XX = X.flatten()
YY = Y.flatten()

order = 1    # 1: linear, 2: quadratic
if order == 1:
    # best-fit linear plane: z = C[0]*x + C[1]*y + C[2]
    A = np.c_[data[:, 0], data[:, 1], np.ones(data.shape[0])]
    C, _, _, _ = scipy.linalg.lstsq(A, data[:, 2])    # coefficients

    # evaluate it on grid
    Z = C[0]*X + C[1]*Y + C[2]

elif order == 2:
    # best-fit quadratic surface with terms [1, x, y, x*y, x**2, y**2]
    A = np.c_[np.ones(data.shape[0]), data[:, :2], np.prod(data[:, :2], axis=1), data[:, :2]**2]
    C, _, _, _ = scipy.linalg.lstsq(A, data[:, 2])

    # evaluate it on a grid (same term order as the design matrix above)
    Z = np.dot(np.c_[np.ones(XX.shape), XX, YY, XX*YY, XX**2, YY**2], C).reshape(X.shape)

else:
    # Without this, Z would be undefined (or stale from an earlier run) below.
    raise ValueError("order must be 1 (linear) or 2 (quadratic)")

# plot points and fitted surface
fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments in current Matplotlib, so
# the 3-D axes is created through add_subplot instead.
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, alpha=0.2)
ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='r', s=50)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.axis('equal')
ax.axis('tight')
plt.show()

## Make dummy vars for "season" (as discussed above)

## Make dummy vars for genre

## Saving this new iteration of the data to csv

#### Check that data types are ok and make any needed conversions:

## Next step: trying out different models...