In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold
from keras import backend as K
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda
from keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from keras.utils import np_utils

Using TensorFlow backend.


In [5]:
# def read_data(adjusted):
#     if adjusted:
#         data_file = pd.read_csv('MOVIES_ADJUSTED.csv', encoding='cp950')
#         return data_file
#     else:
#         data_file = pd.read_csv('MOVIES_WITHOUT_ADJUSTED.csv', encoding='cp950')
#         return data_file
def preprocessing(data_file):
    """Build the feature matrix and rating target from the raw movie table.

    Parameters
    ----------
    data_file : pandas.DataFrame
        Must provide the columns read below: ``COUNTRY`` and ``IMDB_GENRE``
        (comma-separated labels), ``IMDB_RUNTIME``, the ``DIRECTOR_*`` and
        ``STAR_{1,2,3}_*`` columns, the ``YAHOO_*``, ``PTT_*`` and
        ``YOUTUBE_*`` columns, and ``IMDB_RATING``.

    Returns
    -------
    x_train : numpy.ndarray, shape (n_rows, n_features)
        Multi-hot country and genre indicators followed by the numeric
        columns, concatenated in the order: country, genre, runtime,
        director, stars, Yahoo, PTT, YouTube.
    rating : numpy.ndarray, shape (n_rows, 1)
        The ``IMDB_RATING`` column as a column vector.
    """
    # Derive the row count from the frame itself instead of relying on a
    # notebook-level `data_length` global that may not exist on a fresh kernel.
    n_rows = len(data_file)

    # Iterating a Series yields its values in positional order, so this also
    # works when the frame's index is not 0..n-1 (e.g. after filtering).
    # str() turns a missing value into the literal label 'nan'.
    country_labels = [str(value).split(',') for value in data_file['COUNTRY']]
    genre_labels = [str(value).split(',') for value in data_file['IMDB_GENRE']]

    # One binarizer per field: each fit learns that field's own label set.
    country = MultiLabelBinarizer().fit_transform(country_labels)
    genre = MultiLabelBinarizer().fit_transform(genre_labels)

    # DATE_TW is intentionally not parsed: it is not part of the feature matrix.

    runtime = np.array(data_file['IMDB_RUNTIME']).reshape(n_rows, 1)
    dir_detail = np.array(data_file[['DIRECTOR_WINS', 'DIRECTOR_NOMINATIONS',
                                     'DIRECTOR_RATINGS']])
    star_detail = np.array(data_file[['STAR_1_WINS', 'STAR_1_NOMINATIONS',
                                      'STAR_1_RATINGS', 'STAR_2_WINS',
                                      'STAR_2_NOMINATIONS', 'STAR_2_RATINGS',
                                      'STAR_3_WINS', 'STAR_3_NOMINATIONS',
                                      'STAR_3_RATINGS']])
    yahoo = np.array(data_file[['YAHOO_EVALUATION', 'YAHOO_VOTER']])
    PTT = np.array(data_file[['PTT_ARTICLE', 'PTT_PUSH', 'PTT_ARROW',
                              'PTT_PULL', 'PTT_REPLY']])
    youtube = np.array(data_file[['YOUTUBE_VIEW', 'YOUTUBE_LIKE', 'YOUTUBE_DISLIKE']])

    x_train = np.concatenate((country, genre, runtime, dir_detail, star_detail,
                              yahoo, PTT, youtube), axis=-1)
    rating = np.array(data_file['IMDB_RATING']).reshape(n_rows, 1)
    return x_train, rating

In [14]:
# Load the training features and targets from their .npy files.
x_train = np.load('x_train.npy')
y_train = np.load('y_train.npy')
# Number of feature columns; create_model() reads this global for its input shape.
n_features = x_train.shape[1]

In [15]:
# Standardise each feature column using the training-set statistics.
# `mean` and `sigma` stay in scope so the test data can be scaled the same way.
# NOTE: this overwrites x_train, so re-running this cell on its own would
# standardise already-standardised data; re-run the load cell first.
mean = x_train.mean(axis=0)
sigma = x_train.std(axis=0)
# The 1e-20 term keeps the division defined for constant columns (sigma == 0).
x_train = (x_train - mean) / (sigma + 1e-20)

In [16]:
# Load the test features and targets, then scale the features with the
# training-set `mean` / `sigma` computed in the normalisation cell.
x_test = np.load('x_test.npy')
y_test = np.load('y_test.npy')
x_test = (x_test - mean) / (sigma + 1e-20)

In [19]:
def rmse(y_true, y_pred):
    """Root-mean-squared error metric with predictions clipped to [1.0, 10.0].

    Parameters
    ----------
    y_true : tensor
        Target values.
    y_pred : tensor
        Model predictions; values outside [1.0, 10.0] are clipped before the
        error is computed (presumably the valid rating scale -- confirm).

    Returns
    -------
    tensor
        Scalar tensor holding sqrt(mean((y_true - clipped y_pred) ** 2)).
    """
    # K.clip returns a new tensor rather than modifying its argument, so the
    # result has to be re-bound for the clipping to take effect.
    y_pred = K.clip(y_pred, 1.0, 10.0)
    return K.sqrt(K.mean(K.square(y_true - y_pred)))
def create_model(dnn):
    """Assemble the (uncompiled) fully-connected regression network.

    The network is a 32-unit tanh input block followed by one tanh block per
    entry of ``dnn``; every block is Dense -> BatchNormalization -> Dropout(0.3).
    The head is a single tanh unit shifted by the constant 6.5558, so every
    prediction lies strictly between 5.5558 and 7.5558.
    (The constant is presumably the mean training rating -- TODO confirm.)

    Parameters
    ----------
    dnn : iterable of int
        Unit counts of the additional hidden blocks, in order.

    Returns
    -------
    keras.models.Sequential
        The model; the input width is taken from the global ``n_features``.
    """
    net = Sequential()
    widths = [32] + list(dnn)
    for position, width in enumerate(widths):
        if position == 0:
            # Only the first layer declares the input shape.
            dense = Dense(width, input_shape=(n_features,), activation='tanh')
        else:
            dense = Dense(width, activation='tanh')
        net.add(dense)
        net.add(BatchNormalization())
        net.add(Dropout(0.3))
    # tanh output in (-1, 1), then shifted by a fixed offset.
    net.add(Dense(1, activation='tanh'))
    net.add(Lambda(lambda x: x + K.constant(6.5558, dtype=K.floatx())))
    return net

In [29]:
# 10-fold cross-validation: train one network per fold and keep each network's
# predictions on the test set so they can be averaged afterwards.
dnn = [32, 32]  # widths of the extra hidden blocks passed to create_model
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
test_score = []
k = 0  # ends up equal to the number of folds trained; read by the evaluation cell
for k, (train_index, valid_index) in enumerate(kfold.split(x_train), 1):
    X, Y = x_train[train_index], y_train[train_index]
    X_V, Y_V = x_train[valid_index], y_train[valid_index]

    # DNN
    model = create_model(dnn)
    model.compile(loss='mse', optimizer='adam', metrics=[rmse])
    # Stop a fold early once val_loss has failed to improve for 3 epochs.
    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
    model.fit(X, Y, batch_size=32, validation_data=(X_V, Y_V),
              epochs=100, callbacks=callbacks)
    test_score.append(model.predict(x_test))

Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Train on 160 samples, validate on 17 samples
Epoch 1/100
Epoch 2/100


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Train on 160 samples, validate on 17 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Train on 160 samples, validate on 17 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100


In [30]:
# Test-set evaluation of the fold ensemble.
# The reported number is a mean absolute error (lower is better), not an accuracy.
n_models = len(test_score)  # count the collected predictions, not a leaked loop counter
assert n_models > 0, 'no fold predictions collected; run the training cell first'
# Sum the per-fold predictions in float64, then average them.
final_scores = np.sum(test_score, axis=0, dtype=np.float64)
test_pred = final_scores / n_models
# Column-shape the targets so a 1-D y_test cannot broadcast against the
# (n, 1) predictions into an (n, n) matrix.
test_loss = np.sum(np.abs(np.reshape(y_test, (-1, 1)) - test_pred)) / len(y_test)
print('test_mae: ' + str(test_loss))

test_acc: 0.757745652199
