# Import movie files

In [None]:
# Names of the CSV inputs; each one is loaded with pd.read_csv further down.
movie_file = 'movies2010_2016.csv'
plot_file = 'plots2010_2016.csv'
actor_file = 'actors2010_2016.csv'
director_file = 'directors2010_2016.csv'

In [None]:
import pandas as pd
import datetime
import numpy as np
import re

In [None]:
# Load the four CSV inputs into DataFrames, in the same order as before.
movie_df, plot_df, actor_df, director_df = (
    pd.read_csv(csv_path)
    for csv_path in (movie_file, plot_file, actor_file, director_file)
)

In [None]:
# Show the column names and the first two rows of movie_df.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_df.columns.values)
print("---------------------------------------------------------------------------")
print(movie_df.head(2))

In [None]:
# Show the column names and the first two rows of plot_df.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(plot_df.columns.values)
print("---------------------------------------------------------------------------")
print(plot_df.head(2))

In [None]:
# Show the column names and the first two rows of actor_df.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(actor_df.columns.values)
print("---------------------------------------------------------------------------")
print(actor_df.head(2))

In [None]:
# Show the column names and the first two rows of director_df.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(director_df.columns.values)
print("---------------------------------------------------------------------------")
print(director_df.head(2))

In [None]:
# Join each movie row to its plot row via the shared 'site' column
# (pandas' default join type, i.e. only sites present in both frames are kept).
movie_list_df = pd.merge(movie_df, plot_df, on=[u'site'])

In [None]:
# Show the column names and the first two rows of the merged frame.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df.columns.values)
print("---------------------------------------------------------------------------")
print(movie_list_df.head(2))

# Clean data

### Convert revenues to numeric values

In [None]:
# Convert the raw 'revenues' column to floats; values that cannot be converted
# (missing entries, non-numeric text) become NaN.
revenue_arr = []
for raw_revenue in movie_list_df['revenues']:
    try:
        revenue = float(raw_revenue)
    except (TypeError, ValueError):
        # Only conversion failures are treated as "no revenue". Anything else
        # (e.g. a missing 'revenues' column) now surfaces instead of silently
        # producing a column full of NaN.
        revenue = np.nan
    revenue_arr.append(revenue)

movie_list_df['revenues_clean'] = revenue_arr

### Convert length to numeric values

In [None]:
# Convert the raw 'length' column to integers by keeping only its digits;
# non-string values and strings without digits become NaN.
# NOTE(review): all digits are concatenated, so a value with several numbers
# (e.g. hours and minutes) would be mis-read - confirm the source format.
length_arr = []
for raw_length in movie_list_df['length']:
    try:
        length = int(re.sub(r"[^0-9]+", "", raw_length))
    except (TypeError, ValueError):
        # TypeError: value is not a string (e.g. NaN); ValueError: no digits left.
        length = np.nan
    length_arr.append(length)

movie_list_df['length_clean'] = length_arr

### Replace nulls in text columns with empty strings (null values can otherwise cause errors)

In [None]:
# Replace missing values in the free-text columns with empty strings.
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on a selected column is chained assignment, which may operate on a temporary
# copy and leave movie_list_df unchanged (always the case under pandas
# copy-on-write).
text_columns = ['based_on', 'cinematography', 'country', 'director',
                'plot', 'starring', 'studio']
movie_list_df[text_columns] = movie_list_df[text_columns].fillna("")

### View results

In [None]:
# Show the column names and the first two rows after cleaning.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df.columns.values)
print("---------------------------------------------------------------------------")
print(movie_list_df.head(2))

# Derive additional features

### Add release week and day of week

In [None]:
# Derive the ISO week number and the weekday (Monday == 0) of each release
# date. Rows whose date parts cannot be turned into a valid date get NaN for both.
week_arr = []
day_of_week_arr = []
date_parts = zip(movie_list_df['release_year'],
                 movie_list_df['release_month'],
                 movie_list_df['release_day'])
for raw_year, raw_month, raw_day in date_parts:
    try:
        # Only the text before the first "-" of 'release_day' is used
        # (presumably the field can hold a day range - confirm against the data).
        release_date = datetime.date(int(raw_year),
                                     int(raw_month),
                                     int(raw_day.split("-")[0]))
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError: 'release_day' is not a string (e.g. NaN);
        # the others: a part is missing, non-numeric, or not a real date.
        release_week = np.nan
        release_day_of_week = np.nan
    else:
        # One date object yields both values; no strptime round-trip needed.
        release_week = release_date.isocalendar()[1]
        release_day_of_week = release_date.weekday()
    day_of_week_arr.append(release_day_of_week)
    week_arr.append(release_week)

movie_list_df['release_week'] = week_arr
movie_list_df['release_day_of_week'] = day_of_week_arr

### Split director, actor, etc. arrays into individual features
Retain top n results

Actors:

In [None]:
# Spread the first `no_top_actors` entries of 'starring' over the columns
# actor_0 ... actor_{n-1}.
no_top_actors = 10
actor_col_arr = ["actor_" + str(i) for i in range(no_top_actors)]

# NOTE(review): str.split() tokenises on whitespace, so multi-word names are
# split into separate tokens - confirm the delimiter used in 'starring'.
actor_arr = []
for starring in movie_list_df['starring']:
    actor_list = starring.split()[:no_top_actors]
    # Pad short lists so every row has exactly no_top_actors entries; without
    # this the rows are ragged and cannot be transposed into columns.
    actor_list += [""] * (no_top_actors - len(actor_list))
    actor_arr.append(actor_list)

# Shape (no_top_actors, number_of_movies): one row per output column.
actor_arr = np.array(actor_arr, dtype=object).reshape(-1, no_top_actors).T

for position, column_name in enumerate(actor_col_arr):
    movie_list_df[column_name] = actor_arr[position]

### View results

In [None]:
# Show the column names and the first two rows after feature derivation.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df.columns.values)
print("---------------------------------------------------------------------------")
print(movie_list_df.head(2))

# Decompose plots into topics using Non-Negative Matrix Factorization (NNMF), Latent Dirichlet Allocation (LDA)

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Vocabulary size cap: passed as max_features to both vectorizers.
no_word_features = 1000
# Number of topics requested from both the NMF and the LDA model.
no_topics = 30

## Clean up plots

In [None]:
# Print two raw plots (index labels 1 and 4) to inspect them before cleaning.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df['plot'][1])
print("---------------------------------------------------------------------------")
print(movie_list_df['plot'][4])

### Remove special characters

In [None]:
# Build 'plot_clean' from 'plot' in two regex passes:
#   1. replace bracketed numbers such as "[12]" with a space;
#   2. collapse every run of non-alphanumeric characters into a single space.
# Raw strings keep the backslashes literal (no invalid-escape warnings), and
# the result is assigned back rather than relying on a chained inplace
# replace, which may act on a temporary copy.
plot_without_refs = movie_list_df['plot'].replace(to_replace=r'\[[0-9]+\]', value=" ", regex=True)
movie_list_df['plot_clean'] = plot_without_refs.replace(to_replace=r'[^A-Za-z0-9]+', value=" ", regex=True)

### Use NLTK to remove proper nouns

### View results

In [None]:
# Print the cleaned versions of the same two plots (index labels 1 and 4).
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df['plot_clean'][1])
print("---------------------------------------------------------------------------")
print(movie_list_df['plot_clean'][4])

In [None]:
# Show the column names and the first two rows after plot cleaning.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df.columns.values)
print("---------------------------------------------------------------------------")
print(movie_list_df.head(2))

## Fit NNMF model

### Vectorize plots for NNMF using tf-idf
Max number of features is number of words for the "bag of words"

In [None]:
# Build the tf-idf document-term matrix for NNMF from the cleaned plots.
nnmf_tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_word_features, stop_words='english')
nnmf_tfidf = nnmf_tfidf_vectorizer.fit_transform(movie_list_df['plot_clean'])
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(nnmf_tfidf_vectorizer, 'get_feature_names_out'):
    nnmf_tfidf_feature_names = nnmf_tfidf_vectorizer.get_feature_names_out().tolist()
else:
    nnmf_tfidf_feature_names = nnmf_tfidf_vectorizer.get_feature_names()
print(nnmf_tfidf_feature_names)

### Run NNMF model

In [None]:
# Fit the NMF model on the tf-idf matrix, then keep both factors:
# nnmf_W from transform() (one row per document) and nnmf_H from components_.
# NOTE(review): newer scikit-learn releases replaced the `alpha` argument with
# alpha_W / alpha_H - confirm against the installed version.
nnmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nnmf_model.fit(nnmf_tfidf)
nnmf_H = nnmf_model.components_
nnmf_W = nnmf_model.transform(nnmf_tfidf)

### Add full list of NNMF topic scores to dataframe

In [None]:
# Store each row of nnmf_W (the NNMF topic weights of one movie) as a Python list.
movie_list_df['nnmf_topic_scores'] = nnmf_W.tolist()

### Add top n NNMF topics to dataframe

In [None]:
# For every movie, record the indices of its highest-scoring NNMF topics in
# the columns nnmf_topic_0 (best) ... nnmf_topic_{n-1}.
no_top_n_nnmf_topics = 4
nnmf_topic_arr = ["nnmf_topic_" + str(i) for i in range(no_top_n_nnmf_topics)]

# argsort is ascending, so take the last n indices and reverse them; the
# transpose turns "one row per movie" into "one row per output column".
top_n_nnmf_topic_arr = np.transpose([
    np.array(topic_scores).argsort()[-1*no_top_n_nnmf_topics:][::-1]
    for topic_scores in movie_list_df['nnmf_topic_scores']
])

for rank, column_name in enumerate(nnmf_topic_arr):
    movie_list_df[column_name] = top_n_nnmf_topic_arr[rank]

### View results

In [None]:
# Show the column names and the first two rows after adding the NNMF topics.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df.columns.values)
print("---------------------------------------------------------------------------")
print(movie_list_df.head(2))

## Fit LDA Model

### Vectorize plots for LDA using tf
Max number of features is number of words for the "bag of words".

LDA can only use raw term counts because it is a probabilistic graphical model


In [None]:
# Build the raw term-count document-term matrix for LDA from the cleaned plots.
lda_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_word_features, stop_words='english')
lda_tf = lda_tf_vectorizer.fit_transform(movie_list_df['plot_clean'])
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(lda_tf_vectorizer, 'get_feature_names_out'):
    lda_tf_feature_names = lda_tf_vectorizer.get_feature_names_out().tolist()
else:
    lda_tf_feature_names = lda_tf_vectorizer.get_feature_names()
print(lda_tf_feature_names)

### Run LDA model

In [None]:
# Fit the LDA model on the term-count matrix, then keep both factors:
# lda_W from transform() (one row per document) and lda_H from components_.
lda_options = dict(max_iter=5, learning_method='online', learning_offset=50., random_state=0)
try:
    # Current scikit-learn spelling of the topic-count argument.
    lda_model = LatentDirichletAllocation(n_components=no_topics, **lda_options)
except TypeError:
    # Older releases only know the original `n_topics` name.
    lda_model = LatentDirichletAllocation(n_topics=no_topics, **lda_options)
lda_model = lda_model.fit(lda_tf)
lda_W = lda_model.transform(lda_tf)
lda_H = lda_model.components_

### Add LDA topics to main dataframe

In [None]:
# Store each row of lda_W (the LDA topic weights of one movie) as a Python list.
movie_list_df['lda_topic_scores'] = lda_W.tolist()

### Add top n LDA topics to dataframe

In [None]:
# For every movie, record the indices of its highest-scoring LDA topics in
# the columns lda_topic_0 (best) ... lda_topic_{n-1}.
no_top_n_lda_topics = 4
lda_topic_arr = ["lda_topic_" + str(i) for i in range(no_top_n_lda_topics)]

# argsort is ascending, so take the last n indices and reverse them; the
# transpose turns "one row per movie" into "one row per output column".
top_n_lda_topic_arr = np.transpose([
    np.array(topic_scores).argsort()[-1*no_top_n_lda_topics:][::-1]
    for topic_scores in movie_list_df['lda_topic_scores']
])

for rank, column_name in enumerate(lda_topic_arr):
    movie_list_df[column_name] = top_n_lda_topic_arr[rank]

### View results

In [None]:
# Show the column names and the first two rows after adding the LDA topics.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_list_df.columns.values)
print("---------------------------------------------------------------------------")
print(movie_list_df.head(2))

# Display plot model results
Displays the top associated words and the top movies for each topic

In [None]:
# How many top words to list per topic when displaying topics.
no_top_words = 50
# How many top-scoring movies to show per topic when displaying topics.
no_top_documents = 5

def display_topics(H, W, feature_names, titles, plots, no_top_words, no_top_documents):
    """Print the top words and the top-scoring documents of every topic.

    Args:
        H: 2-D array with one row per topic and one column per feature.
        W: 2-D array with one row per document and one column per topic.
        feature_names: sequence mapping a feature column of H to its word.
        titles: document titles, in the same order as the rows of W.
        plots: document texts, in the same order as the rows of W.
        no_top_words: number of highest-weighted words to print per topic.
        no_top_documents: number of highest-scoring documents to print per topic.
    """
    # Row numbers of W are positions, not index labels. Converting to lists
    # makes the lookups below positional even for a pandas Series whose index
    # is not 0..n-1.
    titles = list(titles)
    plots = list(plots)
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % topic_idx)
        # argsort is ascending: walk it backwards to get the largest weights first.
        top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_word_indices]))
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            # %s formatting also copes with non-string entries (e.g. NaN).
            print("\nMovie: %s" % titles[doc_index])
            print("Plot:\n%s\n" % plots[doc_index])
        print("---------------------------------------------------------------------------")

### NNMF

In [None]:
# Print, for every NNMF topic, its top words and its top-scoring movies (title and raw plot).
display_topics(nnmf_H, nnmf_W, nnmf_tfidf_feature_names, movie_list_df['title'], movie_list_df['plot'], no_top_words, no_top_documents)

### LDA

In [None]:
# Print, for every LDA topic, its top words and its top-scoring movies (title and raw plot).
display_topics(lda_H, lda_W, lda_tf_feature_names, movie_list_df['title'], movie_list_df['plot'], no_top_words, no_top_documents)

# Build revenue prediction model

## Create model input array

### Create numpy array with specific columns from pandas dataframe

In [None]:
# Choose the model inputs (three numeric columns plus the top-n LDA topic
# columns) and the target, then pull both out of the frame as numpy arrays.
revenue_column = ['revenues_clean']
movie_prediction_features = ['length_clean', 'release_week', 'release_day_of_week']
movie_prediction_features.extend(lda_topic_arr)

movie_feature_arr = np.array(movie_list_df[movie_prediction_features])
# The target is selected as a one-column frame, so flatten it to 1-D.
revenue_actl = np.array(movie_list_df[revenue_column]).flatten()

In [None]:
# Inspect the target vector. print(...) works on both Python 2 and Python 3.
print(revenue_actl)

In [None]:
# Inspect the feature names and the feature matrix.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(movie_prediction_features)
print(movie_feature_arr)

### Split data set into training, validation, and test data sets

In [None]:
# Positional split: rows [0, 600) train, [600, 1000) validation, [1000, end) test.
# Slice ends are exclusive, so each part starts exactly where the previous one
# ended; starting at 601 / 1001 silently dropped rows 600 and 1000.
training_data, training_revenue = movie_feature_arr[:600], revenue_actl[:600]
validation_data, validation_revenue = movie_feature_arr[600:1000], revenue_actl[600:1000]
test_data, test_revenue = movie_feature_arr[1000:], revenue_actl[1000:]

## Create function to evaluate results

### Squared Error Loss

In [None]:
def GetSquaredErrorLoss(revenue_actl, revenue_pred):
    """Return the mean squared error between actual and predicted revenues.

    Pairs whose difference is NaN (a missing actual or predicted value) are
    left out of both the sum and the count. Counting them as zero error while
    still dividing by the full length would understate the loss.

    Args:
        revenue_actl: array-like of actual revenues.
        revenue_pred: array-like of predicted revenues, same length.

    Returns:
        The mean of the squared differences over comparable pairs, or NaN
        when there is no comparable pair (including empty input).
    """
    difference = np.asarray(revenue_actl, dtype=float) - np.asarray(revenue_pred, dtype=float)
    comparable = ~np.isnan(difference)
    if not comparable.any():
        return np.nan
    return np.mean(difference[comparable] ** 2)

In [None]:
# Sanity check: the loss of a vector against itself.
# print(...) works on both Python 2 and Python 3.
print(GetSquaredErrorLoss(revenue_actl, revenue_actl))

## Predict results of validation data set using training data

Predict using a variety of models

In [None]:
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR

### Linear Regression

Train model using "train" data set

In [None]:
# Fit an ordinary LinearRegression on the training features and revenues.
# NOTE(review): the features/target may still contain NaN from the cleaning
# steps - confirm the estimator accepts them or drop/impute first.
lm = LinearRegression()
lm.fit(training_data, training_revenue)

Predict revenues of validation data set

In [None]:
# Predict validation revenues, round them to whole numbers, and clamp
# negative predictions to zero.
lm_validation_revenue_pred = np.round(lm.predict(validation_data))
lm_validation_revenue_pred[lm_validation_revenue_pred < 0] = 0
# print(...) works on both Python 2 and Python 3.
print(lm_validation_revenue_pred)

Evaluate results

In [None]:
# Squared-error loss of the linear model on the validation set.
lm_error = GetSquaredErrorLoss(validation_revenue, lm_validation_revenue_pred)
# print(...) works on both Python 2 and Python 3.
print(lm_error)

### K Nearest Neighbors

### Decision Tree

### Random Forest Classifier

### Decision Tree Regressor

### Random Forest Regressor

### AdaBoost Regressor

### Bagging Regressor

### Gradient Boosting Regressor

### Stochastic Gradient Descent Regressor

### Support Vector Machine Regressor