In [1]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib.learn.python.learn import learn_io

# Only surface TensorFlow errors and keep DataFrame displays compact.
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 50
pd.options.display.float_format = '{:.3f}'.format

# Seed NumPy so the shuffle below -- and therefore the positional
# train/validation split made from it -- is the same on every fresh run.
np.random.seed(42)

# Load the raw movie metadata and shuffle the row order.
imdb_df = pd.read_csv("movie_metadata.csv")
imdb_df = imdb_df.reindex(np.random.permutation(imdb_df.index))

In [2]:
# Discard every row that contains at least one missing value.
imdb_df = imdb_df.dropna()
print(imdb_df.shape)

# Keep only the movies whose country is "USA".
imdb_df = imdb_df.loc[imdb_df["country"] == "USA"]
print(imdb_df.shape)

(3756, 28)
(2987, 28)


In [3]:
def linear_scale(series):
  """Linearly rescales a numeric pandas Series onto the interval [-1, 1].

  The minimum of `series` maps to -1.0 and the maximum maps to 1.0.

  Args:
    series: A numeric `pandas.Series`.

  Returns:
    A float `Series` with the same index as `series`. If every value is
    identical the range is zero; in that case the non-missing entries are
    returned as 0.0 instead of the NaN a 0/0 division would produce.
  """
  min_val = series.min()
  max_val = series.max()
  centered = series - min_val
  half_range = (max_val - min_val) / 2.0
  if half_range == 0:
    # Constant column: `centered` is already all zeros, which is the midpoint
    # of [-1, 1]; dividing by the zero range would turn the column into NaN.
    return centered.astype(float)
  # Vectorised equivalent of applying ((x - min) / half_range) - 1 per element.
  return centered / half_range - 1.0

In [4]:
def preprocess_features(imdb_dataframe):
    """Builds the model input features from the raw IMDB movie DataFrame.

    Steps performed on a copy of the selected columns:
      * "color" is label-encoded (Color -> 1, Black and White -> 0).
      * "language" becomes 1 when the value contains "English", else 0.
      * "content_rating" is one-hot encoded after folding the ratings
        'Not Rated', 'Unrated', 'Approved', 'Passed', 'M' and 'X' into "NR".
      * "genres" ('|'-separated) is expanded into one `g_<genre>` indicator
        column per genre.
      * The remaining numeric columns are rescaled with `linear_scale`.

    NOTE(review): the scaling uses the min/max of whatever frame is passed in.
    This notebook calls it on the full data set before the train/validation
    split, so validation rows influence the scaling of training rows.

    Args:
        imdb_dataframe: A pandas `DataFrame` holding the raw IMDB columns
            selected below.

    Returns:
        A new `DataFrame` of processed features, indexed like the input.
    """
    selected_features = imdb_dataframe[
        ["color",
         #"director_name", to include soon
         "duration",
         "director_facebook_likes",
         "actor_3_facebook_likes",
         #"actor_2_name", will consider one-hot-encoding or binary encoding later
         "actor_1_facebook_likes",
         "genres",
         #"actor_1_name", will consider one-hot-encoding or binary encoding later
         "cast_total_facebook_likes",
         #"actor_3_name", will consider one-hot-encoding or binary encoding later
         "facenumber_in_poster",
         #"plot_keywords",
         "language",
         "content_rating",
         "budget",
         "title_year",
         "actor_2_facebook_likes",
         "aspect_ratio"]]

    processed_features = selected_features.copy()

    # process "color" : label encoding : Color=1, Black and White = 0
    # Both spellings of the black-and-white label (with and without the leading
    # space) are mapped so the encoding does not hinge on stray whitespace.
    cleanup_color = {"color": {"Color": 1,
                               " Black and White": 0,
                               "Black and White": 0}}
    processed_features = processed_features.replace(cleanup_color)

    # process "language" : label encoding : English=1, Others = 0
    processed_features["language"] = np.where(
        processed_features["language"].str.contains("English"), 1, 0)

    # process "content-rating" : one-hot-encoding
    # Rare / legacy ratings are merged into a single "NR" bucket first.
    unrated_labels = ['Not Rated', 'Unrated', 'Approved', 'Passed', 'M', 'X']
    processed_features.loc[
        processed_features["content_rating"].isin(unrated_labels),
        "content_rating"] = "NR"
    processed_features = pd.get_dummies(processed_features, columns=["content_rating"])

    # process "genres" : one-hot-encoding
    # Split "A|B|C" into one row per genre, one-hot encode, then sum back per
    # original row so each movie gets an indicator for each of its genres.
    genres = (processed_features.genres.str.split('|', expand=True)
              .stack()
              .reset_index(level=1, drop=True)
              .to_frame('genres'))
    genres_df = pd.get_dummies(genres, prefix='g', columns=['genres']).groupby(level=0).sum()
    processed_features = processed_features.join(genres_df)
    # `axis` is passed by keyword: the positional form is deprecated and is
    # rejected by recent pandas releases.
    processed_features = processed_features.drop('genres', axis=1)

    # process numbers with linear scaling
    numeric_columns = [
        "duration",
        "director_facebook_likes",
        "actor_3_facebook_likes",
        "actor_1_facebook_likes",
        "cast_total_facebook_likes",
        "facenumber_in_poster",
        "budget",
        "title_year",
        "actor_2_facebook_likes",
        "aspect_ratio",
    ]
    for column in numeric_columns:
        processed_features[column] = linear_scale(processed_features[column])

    return processed_features


def preprocess_targets(imdb_dataframe):
    """Derives the candidate regression targets from the raw IMDB DataFrame.

    Args:
        imdb_dataframe: A pandas `DataFrame` with the columns "gross",
            "budget", "title_year" and "imdb_score".

    Returns:
        A `DataFrame` indexed like the input with three columns:
          * "gross": gross divided by one million.
          * "adjusted_profit": (gross - budget), compounded by a factor of
            1.04 per year between the title year and 2016, divided by one
            million.
          * "imdb_score": copied through unchanged.
    """
    revenue = imdb_dataframe["gross"]
    years_before_2016 = 2016 - imdb_dataframe['title_year']

    # plain gross return (maybe inflation is already explained by feature "year")
    gross_in_millions = revenue / 1000000.0

    # credit to Henry
    adjusted_profit = (revenue - imdb_dataframe["budget"]) * 1.04**years_before_2016 / 1000000.0

    return pd.DataFrame(
        {
            "gross": gross_in_millions,
            "adjusted_profit": adjusted_profit,
            "imdb_score": imdb_dataframe["imdb_score"],
        },
        columns=["gross", "adjusted_profit", "imdb_score"])

In [5]:
imdb_processed_features = preprocess_features(imdb_df)
imdb_processed_targets = preprocess_targets(imdb_df)

# The rows were shuffled when the data was loaded, so a positional split is a
# random split. The validation set is defined as every row after the training
# rows, so the two sets can never overlap or skip rows if the number of rows
# changes (with the 2987 rows reported above this is 2200 / 787, as before).
n_training_rows = 2200
training_examples = imdb_processed_features.iloc[:n_training_rows]
training_targets = imdb_processed_targets.iloc[:n_training_rows]
validation_examples = imdb_processed_features.iloc[n_training_rows:]
validation_targets = imdb_processed_targets.iloc[n_training_rows:]

In [6]:
# Inspect the processed feature matrix (the number of rows rendered is limited
# by the pd.options.display.max_rows setting made at the top of the notebook).
imdb_processed_features

Unnamed: 0,color,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,facenumber_in_poster,language,budget,title_year,...,g_Horror,g_Music,g_Musical,g_Mystery,g_Romance,g_Sci-Fi,g_Sport,g_Thriller,g_War,g_Western
2016,1,-0.440,-0.970,-0.964,-0.944,-0.892,-0.907,1,-0.907,0.356,...,0,0,0,0,1,0,0,0,0,0
565,1,-0.584,-1.000,-0.939,-0.963,-0.953,-0.953,1,-0.540,0.586,...,0,0,0,1,0,0,0,1,0,0
2225,1,-0.509,-0.961,-0.826,-0.944,-0.897,-0.814,1,-0.867,0.816,...,0,0,0,0,0,0,0,1,0,0
2697,1,-0.522,-0.951,-0.969,-0.998,-0.993,-0.860,1,-0.900,0.908,...,0,0,0,0,0,0,0,0,0,0
1225,1,-0.529,-0.982,-0.966,-0.998,-0.992,-0.907,1,-0.733,0.655,...,0,0,0,0,0,0,0,1,1,0
524,1,-0.666,-0.997,-0.984,-0.997,-0.995,-1.000,1,-0.500,0.747,...,0,0,0,0,0,0,0,0,0,0
4594,1,-0.604,-0.979,-0.937,-0.997,-0.988,-1.000,1,-0.994,0.839,...,1,0,0,0,0,0,0,0,0,0
131,1,-0.597,-0.995,-0.939,-0.947,-0.939,-1.000,1,-0.000,0.816,...,0,0,0,0,0,0,0,0,0,0
1010,1,-0.304,-0.967,-0.974,-0.966,-0.961,-0.953,1,-0.667,1.000,...,0,0,0,0,0,0,0,0,1,0
2576,1,-0.488,-0.998,-0.981,-0.998,-0.996,-1.000,1,-0.900,0.632,...,0,1,0,0,0,0,0,0,0,0


In [7]:
# Inspect the three candidate targets: gross, adjusted_profit and imdb_score.
imdb_processed_targets

Unnamed: 0,gross,adjusted_profit,imdb_score
2016,34.700,62.073,7.700
565,55.585,-27.176,5.900
2225,26.416,8.780,6.800
2697,15.609,0.712,5.400
1225,59.069,34.342,6.400
524,193.137,181.866,6.900
4594,0.101,-1.052,6.400
131,114.054,-49.195,6.900
1010,20.390,-29.610,6.700
2576,4.734,-19.228,6.000


In [8]:
def train_linear_model(
    target,
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
    """Trains a linear regression model for a single target column.

    Training is split into 10 periods. After each period the RMSE on the
    training and validation sets is computed; the training RMSE is printed and
    both curves are plotted once training has finished.

    Args:
      target: A `str`, the name of the column in `training_targets` and
        `validation_targets` to predict.
      learning_rate: A `float`, the learning rate.
      steps: A non-zero `int`, the total number of training steps. A training
        step consists of a forward and backward pass using a single batch.
        The value is divided evenly over the periods (integer division, at
        least one step per period).
      batch_size: A non-zero `int`, the batch size.
      training_examples: A `DataFrame` of input features used for training;
        every column becomes a real-valued feature column.
      training_targets: A `DataFrame` containing the `target` column for the
        training examples.
      validation_examples: A `DataFrame` of input features used for validation.
      validation_targets: A `DataFrame` containing the `target` column for the
        validation examples.

    Returns:
      A `LinearRegressor` object trained on the training data.
    """

    periods = 10
    # True division would give a float on Python 3 (e.g. 100 / 10 -> 10.0);
    # the per-period step count has to be a whole number of at least one step.
    steps_per_period = max(1, int(steps) // periods)

    # Create a linear regressor object.
    feature_columns = {
        tf.contrib.layers.real_valued_column(my_feature)
        for my_feature in training_examples
    }
    linear_regressor = tf.contrib.learn.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
        gradient_clip_norm=5.0
    )

    # Create input functions: an endlessly repeating one for training, and
    # single-pass, unshuffled ones so predictions line up with the targets.
    training_input_fn = learn_io.pandas_input_fn(
        x=training_examples, y=training_targets[target],
        num_epochs=None, batch_size=batch_size)
    predict_training_input_fn = learn_io.pandas_input_fn(
        x=training_examples, y=training_targets[target],
        num_epochs=1, shuffle=False)
    predict_validation_input_fn = learn_io.pandas_input_fn(
        x=validation_examples, y=validation_targets[target],
        num_epochs=1, shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    print("RMSE (on training data):")
    training_rmse = []
    validation_rmse = []
    for period in range(periods):
        # Train the model, starting from the prior state.
        linear_regressor.fit(
            input_fn=training_input_fn,
            steps=steps_per_period,
        )
        # Take a break and compute predictions.
        training_predictions = list(linear_regressor.predict(input_fn=predict_training_input_fn))
        validation_predictions = list(linear_regressor.predict(input_fn=predict_validation_input_fn))
        # Compute training and validation loss; scikit-learn's argument order
        # is (y_true, y_pred).
        training_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(training_targets[target], training_predictions))
        validation_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(validation_targets[target], validation_predictions))
        # Occasionally print the current loss.
        print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
        # Add the loss metrics from this period to our list.
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")

    # Output a graph of loss metrics over periods. A dedicated figure is used
    # so repeated calls never draw onto each other's axes, and the layout is
    # tightened only after everything has been drawn.
    fig, ax = plt.subplots()
    ax.set_ylabel("RMSE")
    ax.set_xlabel("Periods")
    ax.set_title("Root Mean Squared Error vs. Periods")
    ax.plot(training_rmse, label="training")
    ax.plot(validation_rmse, label="validation")
    ax.legend()
    fig.tight_layout()

    return linear_regressor

In [9]:
# Fit a linear model that predicts the "gross" target; the returned regressor
# is discarded, only the printed RMSE log and the loss plot are kept.
# NOTE(review): in the recorded run the training RMSE only moves from 94.87 to
# 93.95 over the ten periods, so these hyperparameters look under-trained --
# consider a larger learning rate and/or more steps.
_ = train_linear_model(
    target='gross',
    learning_rate=0.001,
    steps=100,
    batch_size=200,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

Training model...
RMSE (on training data):
  period 00 : 94.87
  period 01 : 94.77
  period 02 : 94.66
  period 03 : 94.56
  period 04 : 94.46
  period 05 : 94.36
  period 06 : 94.26
  period 07 : 94.15
  period 08 : 94.05
  period 09 : 93.95
Model training finished.


In [10]:
# Fit a linear model that predicts the "adjusted_profit" target; the returned
# regressor is discarded, only the printed RMSE log and the loss plot are kept.
# NOTE(review): in the recorded run the training RMSE only moves from 142.95 to
# 142.78 over the ten periods, so the model has barely learned anything with
# these hyperparameters -- consider a larger learning rate and/or more steps.
_ = train_linear_model(
    target='adjusted_profit',
    learning_rate=0.0001,
    steps=500,
    batch_size=20,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

Training model...
RMSE (on training data):
  period 00 : 142.95
  period 01 : 142.93
  period 02 : 142.91
  period 03 : 142.89
  period 04 : 142.87
  period 05 : 142.85
  period 06 : 142.83
  period 07 : 142.82
  period 08 : 142.80
  period 09 : 142.78
Model training finished.
