In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier
from keras import backend as K
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda
from keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Load the movie table; the file is decoded with the cp950 code page.
data_file = pd.read_csv(
    'MOVIES_WITHOUT_PTT_TIME_ASC.csv',
    encoding='cp950',
)
# Number of rows (movies); later cells use it to size per-movie arrays.
data_length = data_file.shape[0]

In [3]:
# Per-movie parsed fields, one entry appended per row of data_file:
#   country / genre -> tokens of the stringified cell, split on ','
#   date            -> ints parsed from the stringified DATE_TW cell, split on '/'
# Cells are passed through str() first, so a missing (NaN) value would become the
# literal token 'nan' for country/genre and would make int() raise for the date.
country, genre, date = [], [], []
# The explicit index loop is kept on purpose: `i` stays bound at module level after
# the loop, exactly as before, for any later cell that reads it.
for i in range(data_length):
    row_country = str(data_file['COUNTRY'][i]).split(',')
    row_genre = str(data_file['IMDB_GENRE'][i]).split(',')
    row_date = list(map(int, str(data_file['DATE_TW'][i]).split('/')))
    country.append(row_country)
    genre.append(row_genre)
    date.append(row_date)

In [4]:
# Multi-hot encode the token lists. Each field gets its own binarizer so that both
# fitted vocabularies (`classes_`) stay available afterwards, e.g. to name the
# encoded columns or to encode new rows the same way. Refitting a single shared
# instance would overwrite the country vocabulary with the genre one.
country_mlb = MultiLabelBinarizer()
genre_mlb = MultiLabelBinarizer()
country = country_mlb.fit_transform(country)
genre = genre_mlb.fit_transform(genre)
# Keep `mlb` bound to the genre encoder: that is the state the shared instance
# ended up in, so any later use of `mlb` sees the same fitted classes.
mlb = genre_mlb

In [5]:
# Column groups read from data_file; each group becomes one 2-D NumPy block.
director_cols = ['DIRECTOR_WINS', 'DIRECTOR_NOMINATIONS', 'DIRECTOR_RATINGS']
star_cols = [
    'STAR_1_WINS', 'STAR_1_NOMINATIONS', 'STAR_1_RATINGS',
    'STAR_2_WINS', 'STAR_2_NOMINATIONS', 'STAR_2_RATINGS',
    'STAR_3_WINS', 'STAR_3_NOMINATIONS', 'STAR_3_RATINGS',
]
yahoo_cols = ['YAHOO_EVALUATION', 'YAHOO_VOTER']
ptt_cols = ['PTT_ARTICLE', 'PTT_PUSH', 'PTT_ARROW', 'PTT_PULL', 'PTT_REPLY']
youtube_cols = ['YOUTUBE_VIEW', 'YOUTUBE_LIKE', 'YOUTUBE_DISLIKE']

# Parsed date parts as an array.
date = np.array(date)
# A single column is 1-D after selection; give it an explicit (data_length, 1)
# shape so it has the same rank as the multi-column blocks below.
runtime = np.array(data_file['IMDB_RUNTIME']).reshape((data_length, 1))
dir_detail = np.array(data_file[director_cols])
star_detail = np.array(data_file[star_cols])
yahoo = np.array(data_file[yahoo_cols])
PTT = np.array(data_file[ptt_cols])
youtube = np.array(data_file[youtube_cols])

In [13]:
# Feature matrix: the per-movie blocks joined along the last axis, in this order.
# `date` is not one of the blocks.
feature_blocks = [country, genre, runtime, dir_detail, star_detail, yahoo, PTT, youtube]
x_train = np.concatenate(feature_blocks, axis=-1)
# Regression target: IMDB_RATING as a (data_length, 1) column.
rating = np.array(data_file['IMDB_RATING']).reshape((data_length, 1))

In [14]:
# Standardize every feature column: subtract its mean and divide by its standard
# deviation. `mean` and `sigma` stay bound so the same statistics can be reused.
mean = x_train.mean(axis=0)
sigma = x_train.std(axis=0)
centered = x_train - mean
# The tiny constant keeps the divisor non-zero for columns whose deviation is 0.
scale = sigma + 1e-20
x_train = centered / scale

In [15]:
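# Optional (disabled): persist the prepared arrays with np.save for reuse.
# NOTE: y_train is only created further down in this notebook, so the second
# line cannot run at this position as written.
# np.save('x_train', x_train)
# np.save('y_train', y_train)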

In [22]:
# Fully connected regressor for the rating: four hidden blocks of
# Dense(32, relu) -> BatchNormalization -> Dropout(0.2), then one output unit.
model = Sequential()
# The first layer also fixes the input width to the number of feature columns.
model.add(Dense(32, input_shape=(x_train.shape[1],), activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
# Linear output unit. With a ReLU here the unit's output is always >= 0, so after
# the constant shift below no prediction could ever fall under 6.5558 (and a
# negative pre-activation would pass no gradient). A linear unit lets the network
# predict on both sides of the offset.
model.add(Dense(1, activation='linear'))
# Add a fixed offset so the network only has to learn the deviation from it.
# NOTE(review): 6.5558 is presumably the mean IMDB_RATING of the data -- confirm.
model.add(Lambda(lambda x: x + K.constant(6.5558, dtype=K.floatx())))

In [23]:
def rmse(y_true, y_pred, min_rating=1.0, max_rating=10.0):
    """Root-mean-squared error between targets and range-clipped predictions.

    Args:
        y_true: backend tensor of target ratings.
        y_pred: backend tensor of predicted ratings.
        min_rating: lower bound applied to the predictions before scoring.
        max_rating: upper bound applied to the predictions before scoring.

    Returns:
        A scalar backend tensor: sqrt(mean((y_true - clipped y_pred) ** 2)).
    """
    # K.clip returns a new tensor rather than modifying its argument, so the
    # result has to be assigned for the clipping to have any effect.
    # The default bounds are 1-10 because the targets in this notebook exceed 5
    # (they are bucketed at 6.2 and 7); a 1-5 range would distort the metric.
    # NOTE(review): assumes the target is the 1-10 IMDb rating scale -- confirm.
    y_pred = K.clip(y_pred, min_rating, max_rating)
    return K.sqrt(K.mean(K.pow(y_true - y_pred, 2)))

In [24]:
# Checkpoint file name. It is derived from data_length instead of a loop index left
# over from an earlier cell, so this cell no longer depends on leaked state; for a
# non-empty table the resulting name is the same (the last row index).
checkpoint_path = 'model_' + str(data_length - 1) + '.h5'

callbacks = [
    # Save the model each time the validation loss reaches a new best value.
    ModelCheckpoint(checkpoint_path,
                    monitor='val_loss', save_best_only=True, period=1),
    # Stop once the validation loss has not improved for 3 consecutive epochs.
    EarlyStopping(monitor='val_loss', patience=3),
]
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
# validation_split=0.1 makes Keras hold out the last 10% of the rows (taken before
# any shuffling) as the validation set that 'val_loss' is computed on.
model.fit(x_train, rating, batch_size=5, validation_split=0.1,
          epochs=100, callbacks=callbacks)

Train on 177 samples, validate on 20 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


<keras.callbacks.History at 0x11f90306a58>

In [10]:
def _rating_class(score):
    """Map a rating to its class label: 1 if >= 7, 2 if >= 6.2, otherwise 3."""
    if score >= 7:
        return 1
    if score >= 6.2:
        return 2
    return 3


# One class label per row of `rating`, in row order.
y_train = np.array([_rating_class(r) for r in rating])

In [11]:
# Number of leading rows held out as the validation set.
N_VALID = 20

# This cell replaces x_train / y_train with their own tails, so running it a second
# time would silently drop another N_VALID rows and shift the validation set.
# Fail loudly instead if the arrays are no longer the full, unsplit data.
assert len(x_train) == data_length and len(y_train) == data_length, (
    'x_train / y_train are not full length (already split?); '
    're-run the cells that build them before splitting again'
)

# The first N_VALID rows become the validation set, the remainder the training set.
x_valid, x_train = x_train[:N_VALID], x_train[N_VALID:]
y_valid, y_train = y_train[:N_VALID], y_train[N_VALID:]

In [12]:
# Gradient-boosted trees on the class labels; random_state is fixed so repeated
# fits on the same data give the same model.
gbc_params = {
    'n_estimators': 40,
    'min_samples_split': 40,
    'min_samples_leaf': 3,
    'max_leaf_nodes': 15,
    'max_depth': 5,
    'random_state': 15,
}
clf = GradientBoostingClassifier(**gbc_params)
clf.fit(x_train, y_train)
# Mean accuracy on the held-out rows; as the last expression it is the cell output.
clf.score(x_valid, y_valid)

0.5