# Movie Revenue Prediction

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd

import json
import csv

import os
import requests

import tensorflow as tf

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

## Gathering Data

In [2]:
#retrieve test data from 2015-2019
r = requests.get('https://api.themoviedb.org/3/discover/movie?api_key=9447cd662f95276c1c7a053a8d830e7a&language=en-US&region=US&sort_by=revenue.desc&include_adult=false&include_video=false&release_date.gte=2015-01-01&release_date.lte=2019-12-31&with_release_type=3&vote_count.gte=100')
data = r.json()
pages = data['total_pages']
#create a list of all movie IDs from the release range
movies = []
for i in range(1,pages+1):
    r = requests.get('https://api.themoviedb.org/3/discover/movie?api_key=9447cd662f95276c1c7a053a8d830e7a&language=en-US&region=US&sort_by=revenue.desc&include_adult=false&include_video=false&release_date.gte=2015-01-01&release_date.lte=2019-12-31&with_release_type=3&vote_count.gte=100&page=' + str(i))
    data = r.json()
    results = len(data['results'])
    for j in range(results):
        movies.append(data['results'][j]['id'])

In [3]:
#create a list of dictionaries to convert into a dataframe
data_frame = []
for i in range(len(movies)):
    movie_id = str(movies[i])

    #retrieve movie details
    r = requests.get('https://api.themoviedb.org/3/movie/' + movie_id + '?api_key=9447cd662f95276c1c7a053a8d830e7a&language=en-US')
    data = r.json()
    movie_budget = data['budget']
    movie_production = np.nan
    #only consider the primary production company for the region
    if len(data['production_companies']):
        movie_production = data['production_companies'][0]['id']
    movie_release = data['release_date']
    movie_revenue = data['revenue']
    movie_runtime = data['runtime']
    movie_genre = data['genres'][0]['id']
    movie_popularity = data['popularity']
    movie_votecount = data['vote_count']
    movie_voteavg = data['vote_average']

    #retrieve keywords associated with the movie
    r = requests.get('https://api.themoviedb.org/3/movie/' + movie_id + '/keywords?api_key=9447cd662f95276c1c7a053a8d830e7a')
    keywords = r.json()
    keyword = []
    #only consider three most relevant keywords
    if len(keywords['keywords']) < 2:
        count = 0
        for i in range(len(keywords['keywords'])):
            keyword.append(keywords['keywords'][i]['id'])
            count += 1
        while count < 2:
            keyword.append(np.nan)
            count += 1
    else:
        for i in range(len(keywords['keywords'])):
            keyword.append(keywords['keywords'][i]['id'])

            
    #create dictionary for the movie 
    movie_dict = {'id': movie_id, 'genre': movie_genre, 'keyword1': keyword[0], 'keyword2': keyword[1], 'budget': movie_budget, 'production': movie_production, 'runtime': movie_runtime, 'popularity': movie_popularity, 'vote_count': movie_votecount, 'vote_average': movie_voteavg, 'revenue': movie_revenue}
    data_frame.append(movie_dict)
    
df = pd.DataFrame(data_frame)
#drop null and incorrect values
df = df.dropna().loc[df['revenue']>0].loc[df['budget']>0]
#normalize values
#categorize revenues
df['revenue'] = df['revenue'].apply(lambda x: x//1000000)
df['budget'] = df['budget'].apply(lambda x: x//1000000)
df['revenue'] = pd.cut(df['revenue'], bins=[-1, 2, 15, 40, 95, 250, 10000], labels = ['0', '1', '2', '3', '4', '5'])
#save pandas dataframe to csv
df = df.sample(frac=1).reset_index(drop=True)
columns = ['genre', 'keyword1', 'keyword2', 'production']
for col in columns:
    df[col] = df[col].astype(str)
df = pd.get_dummies(df, columns=columns)
df.to_csv('data.csv', index = None, header=True)

## SVM 

In [4]:
#SVM Classification
data = pd.read_csv("data.csv")

#creating the train-test split
y = data['revenue']
X = data.drop(['revenue'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 87) 

#training and testing the SVM
svm_model = SVC(kernel='rbf', C=10)
svm_model.fit(X_train, y_train) 
svm_predictions = svm_model.predict(X_test)

#model accuracy for X_test   
bingo_accuracy = (svm_model.score(X_test, y_test))*100
aphr = svm_predictions - y_test.to_numpy()
one_away = [1 if abs(x) == 1 or x == 0 else 0 for x in aphr]
one_away_accuracy = (sum(one_away)/len(one_away))*100
print("Bingo APHR with SVC = " + str(round(bingo_accuracy)) + "%")
print("1-Away APHR with SVC = " + str(round(one_away_accuracy)) + "%")

#save results as csv
results_dict = {'actual': y_test.to_numpy(), 'prediction': svm_predictions}
df = pd.DataFrame(results_dict)
df = df.replace([0, 1, 2, 3, 4, 5], ['<2M', '2M-15M', '15M-40M', '40M-95M', '95M-250M', '>250M'])
df.to_csv('svm_results.csv', index = None, header=True)

Bingo APHR with SVC = 28.0%
1-Away APHR with SVC = 59%


## DNN

In [7]:
#DNN classification
data = pd.read_csv("data.csv")

#manually creating the train-test split
test = data.iloc[:100, :]
train = data.iloc[100:, :]

#converting the pandas dataframe to a tensorflow dataset
y_train = train.pop('revenue')
dataset = tf.data.Dataset.from_tensor_slices((train.values, y_train.values))
train_dataset = dataset.shuffle(len(train)).batch(1)
y_test = test.pop('revenue')
data = tf.data.Dataset.from_tensor_slices((test.values, y_test.values))
test_dataset = data.shuffle(len(test)).batch(1)

#defining the model
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(32, activation='tanh'),
  tf.keras.layers.Dense(64, activation='tanh'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(32, activation='tanh'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(6)
])
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['sparse_categorical_accuracy'])
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

#train the model and evaluate the accuracy
model.fit(train_dataset, epochs=50, callbacks=[early_stopping_monitor])
predict = model.predict(test_dataset)

#model accuracy for X_test 
y_test = []
for feat, targ in test_dataset:
    i = int(targ.numpy())
    y_test.append(i)
y_test = np.array(y_test)
dnn_predictions = np.argmax(predict, axis=1)
aphr = dnn_predictions - y_test
bingo = [1 if x == 0 else 0 for x in aphr]
bingo_accuracy = (sum(bingo)/len(bingo))*100
one_away = [1 if abs(x) == 1 or x == 0 else 0 for x in aphr]
one_away_accuracy = (sum(one_away)/len(one_away))*100
print("Bingo APHR with DNN = " + str(round(bingo_accuracy)) + "%")
print("1-Away APHR with DNN = " + str(round(one_away_accuracy)) + "%")

#save results as csv
results_dict = {'actual': y_test, 'prediction': dnn_predictions}
df = pd.DataFrame(results_dict)
df = df.replace([0, 1, 2, 3, 4, 5], ['<2M', '2M-15M', '15M-40M', '40M-95M', '95M-250M', '>250M'])
df.to_csv('dnn_results.csv', index = None, header=True)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Bingo APHR with DNN = 19%
1-Away APHR with DNN = 65%
