In [17]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib.learn.python.learn import learn_io

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 50
pd.options.display.float_format = '{:.3f}'.format

# Fixed seed so the row shuffle below -- and therefore the positional
# train/validation split made later in the notebook -- is identical on every
# "Restart & Run All".
SHUFFLE_SEED = 42

imdb_df = pd.read_csv("movie_metadata.csv")
# Shuffle the rows with a private RandomState so numpy's global RNG state is
# left untouched for any other code that relies on it.
imdb_df = imdb_df.reindex(
    np.random.RandomState(SHUFFLE_SEED).permutation(imdb_df.index))

In [18]:
# Discard every row that contains at least one missing value.
imdb_df = imdb_df.dropna()
print(imdb_df.shape)

# Keep only the rows whose country is "USA".
imdb_df = imdb_df.loc[imdb_df["country"] == "USA"]
print(imdb_df.shape)

(3756, 28)
(2987, 28)


In [19]:
def linear_scale(series):
  """Linearly rescales a numeric series so its min maps to -1 and its max to +1.

  Args:
    series: A numeric pandas `Series`.

  Returns:
    A float `Series` with the same index and name as `series`. Missing values
    stay missing. A series whose values are all identical has no spread to
    rescale, so every non-missing value maps to 0.0 (the middle of the target
    range) instead of the NaN that a division by zero would produce.
  """
  min_val = series.min()
  max_val = series.max()
  scale = (max_val - min_val) / 2.0
  if scale == 0:
    # Constant column: multiplying by 0.0 yields float zeros while keeping
    # the index, the name and any NaN entries intact.
    return series * 0.0
  # Vectorized form of ((x - min_val) / scale) - 1.0 applied to every element.
  return (series - min_val) / scale - 1.0

In [20]:
def preprocess_features(imdb_dataframe):
    """Builds the model input features from the raw IMDB movie dataframe.

    Works on a copy of the selected columns; `imdb_dataframe` is not modified.

    Steps:
      * "color" is label-encoded: "Color" -> 1, " Black and White" -> 0.
      * "language" becomes 1 when the value contains "English", otherwise 0.
      * "content_rating" has several infrequent labels merged into "NR" and is
        then one-hot encoded into `content_rating_*` columns.
      * Every remaining numeric column is rescaled with `linear_scale`.

    Args:
      imdb_dataframe: A pandas `DataFrame` containing at least the columns
        listed in `feature_names` below.

    Returns:
      A new `DataFrame` holding the processed features.
    """
    # Columns used as model inputs. Deliberately left out for now:
    #   - director_name, genres: to include soon
    #   - actor_1_name, actor_2_name, actor_3_name: will consider
    #     one-hot-encoding or binary encoding later
    #   - plot_keywords
    feature_names = [
        "color",
        "duration",
        "director_facebook_likes",
        "actor_3_facebook_likes",
        "actor_1_facebook_likes",
        "cast_total_facebook_likes",
        "facenumber_in_poster",
        "language",
        "content_rating",
        "budget",
        "title_year",
        "actor_2_facebook_likes",
        "aspect_ratio",
    ]

    # Numeric columns that get rescaled by linear_scale.
    columns_to_scale = [
        "duration",
        "director_facebook_likes",
        "actor_3_facebook_likes",
        "actor_1_facebook_likes",
        "cast_total_facebook_likes",
        "facenumber_in_poster",
        "budget",
        "title_year",
        "actor_2_facebook_likes",
        "aspect_ratio",
    ]

    # Content-rating labels that are merged into the single "NR" bucket.
    ratings_merged_into_nr = ['Not Rated', 'Unrated', 'Approved', 'Passed', 'M', 'X']

    processed_features = imdb_dataframe[feature_names].copy()

    # "color": label encoding. The key " Black and White" carries a leading space.
    processed_features.replace(
        {"color": {"Color": 1, " Black and White": 0}}, inplace=True)

    # "language": 1 if the value contains "English", 0 otherwise.
    mentions_english = processed_features["language"].str.contains("English")
    processed_features["language"] = np.where(mentions_english, 1, 0)

    # "content_rating": merge the listed labels into "NR", then one-hot encode.
    merge_mask = processed_features["content_rating"].isin(ratings_merged_into_nr)
    processed_features.loc[merge_mask, "content_rating"] = "NR"
    processed_features = pd.get_dummies(processed_features, columns=["content_rating"])

    # Numeric columns: linear scaling, one column at a time.
    for column_name in columns_to_scale:
        processed_features[column_name] = linear_scale(processed_features[column_name])

    return processed_features


def preprocess_targets(imdb_dataframe):
    """Derives the candidate regression targets from the raw IMDB dataframe.

    Args:
      imdb_dataframe: A pandas `DataFrame` with the columns "gross", "budget",
        "title_year" and "imdb_score".

    Returns:
      A new `DataFrame` with three columns:
        * "gross": gross divided by 1,000,000.
        * "adjusted_profit": (gross - budget) multiplied by
          1.04 ** (2016 - title_year), divided by 1,000,000.
        * "imdb_score": copied unchanged.
    """
    gross = imdb_dataframe["gross"]
    raw_profit = gross - imdb_dataframe["budget"]
    # Compounding factor of 4% per year between the title year and 2016
    # (credit to Henry).
    yearly_growth = 1.04**(2016 - imdb_dataframe['title_year'])

    output_targets = pd.DataFrame()
    # Plain gross return (maybe inflation is already explained by the "year" feature).
    output_targets["gross"] = gross / 1000000.0
    output_targets["adjusted_profit"] = raw_profit * yearly_growth / 1000000.0
    output_targets["imdb_score"] = imdb_dataframe["imdb_score"]
    return output_targets

In [21]:
# Number of leading rows used for training; every remaining row is used for
# validation. Splitting at a single cut point guarantees the two sets never
# overlap and never skip rows, whatever the row count after filtering is
# (with the 2987 rows reported above this yields 2200 / 787).
TRAINING_SIZE = 2200

imdb_processed_features = preprocess_features(imdb_df)
imdb_processed_targets = preprocess_targets(imdb_df)

training_examples = imdb_processed_features.iloc[:TRAINING_SIZE]
training_targets = imdb_processed_targets.iloc[:TRAINING_SIZE]
validation_examples = imdb_processed_features.iloc[TRAINING_SIZE:]
validation_targets = imdb_processed_targets.iloc[TRAINING_SIZE:]

In [22]:
def train_linear_model(
    target,
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
    """Trains a linear regression model on one target column.

    In addition to training, this function also prints training progress information,
    as well as a plot of the training and validation loss over time.

    Args:
      target: A `str`, the name of the column in `training_targets` and
        `validation_targets` to predict.
      learning_rate: A `float`, the learning rate.
      steps: A non-zero `int`, the total number of training steps. A training step
        consists of a forward and backward pass using a single batch. Training is
        split into 10 periods of `steps // 10` steps each.
      batch_size: A non-zero `int`, the batch size.
      training_examples: A `DataFrame` whose columns are all used as real-valued
        input features for training.
      training_targets: A `DataFrame` containing the `target` column for training.
      validation_examples: A `DataFrame` with the same feature columns, used for
        validation.
      validation_targets: A `DataFrame` containing the `target` column for
        validation.

    Returns:
      A `LinearRegressor` object trained on the training data.
    """

    periods = 10
    # Floor division keeps the per-period step count an integer on Python 3 too,
    # where `/` would produce a float.
    steps_per_period = steps // periods

    # Create a linear regressor object.
    feature_columns = set([tf.contrib.layers.real_valued_column(my_feature) for my_feature in training_examples])
    linear_regressor = tf.contrib.learn.LinearRegressor(
      feature_columns=feature_columns,
      optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
      gradient_clip_norm=5.0
    )

    # Create input functions: one that feeds batches for training, and two that
    # make a single unshuffled pass so predictions line up with the target rows.
    training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets[target],
      num_epochs=None, batch_size=batch_size)
    predict_training_input_fn = learn_io.pandas_input_fn(
      x=training_examples, y=training_targets[target],
      num_epochs=1, shuffle=False)
    predict_validation_input_fn = learn_io.pandas_input_fn(
      x=validation_examples, y=validation_targets[target],
      num_epochs=1, shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    print("RMSE (on training data):")
    training_rmse = []
    validation_rmse = []
    for period in range(0, periods):
        # Train the model, starting from the prior state.
        linear_regressor.fit(
            input_fn=training_input_fn,
            steps=steps_per_period,
        )
        # Take a break and compute predictions.
        training_predictions = list(linear_regressor.predict(input_fn=predict_training_input_fn))
        validation_predictions = list(linear_regressor.predict(input_fn=predict_validation_input_fn))
        # Compute training and validation loss. Arguments follow scikit-learn's
        # (y_true, y_pred) order; the squared error itself is symmetric.
        training_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(training_targets[target], training_predictions))
        validation_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(validation_targets[target], validation_predictions))
        # Print the current training loss.
        print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
        # Add the loss metrics from this period to our list.
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")

    # Output a graph of loss metrics over periods.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()
    # Adjust the layout only after the lines and legend exist.
    plt.tight_layout()

    return linear_regressor

In [23]:
# Train a linear model on the "gross" target column. The returned regressor is
# bound to `_` and not used again in the cells shown here; the cell's visible
# result is the printed per-period training RMSE and the loss plot.
_ = train_linear_model(
    target='gross',
    learning_rate=0.001,
    steps=100,
    batch_size=200,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

Training model...
RMSE (on training data):
  period 00 : 95.13
  period 01 : 95.04
  period 02 : 94.94
  period 03 : 94.84
  period 04 : 94.74
  period 05 : 94.65
  period 06 : 94.55
  period 07 : 94.45
  period 08 : 94.35
  period 09 : 94.26
Model training finished.


In [25]:
# Train a linear model on the "adjusted_profit" target column, using a smaller
# learning rate and batch size and more steps than the "gross" run. The
# returned regressor is bound to `_` and not used again in the cells shown here.
_ = train_linear_model(
    target='adjusted_profit',
    learning_rate=0.0001,
    steps=500,
    batch_size=20,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

Training model...
RMSE (on training data):
  period 00 : 162.83
  period 01 : 162.81
  period 02 : 162.79
  period 03 : 162.78
  period 04 : 162.76
  period 05 : 162.74
  period 06 : 162.72
  period 07 : 162.71
  period 08 : 162.69
  period 09 : 162.67
Model training finished.
