In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from keras import backend as K
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda
from keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# def read_data(adjusted):
#     if adjusted:
#         data_file = pd.read_csv('train_adjusted.csv', encoding='cp950')
#         return data_file
#     else:
#         data_file = pd.read_csv('MOVIES_WITHOUT_ADJUSTED.csv', encoding='cp950')
#         return data_file
def preprocessing(data_file):
    """Turn a raw movie table into a numeric feature matrix plus ratings.

    Parameters
    ----------
    data_file : pandas.DataFrame
        Table providing the columns read below: 'COUNTRY', 'IMDB_GENRE',
        'IMDB_RUNTIME', the DIRECTOR_*, STAR_1..3_*, YAHOO_*, PTT_* and
        YOUTUBE_* columns, and 'IMDB_RATING'.

    Returns
    -------
    x_train : numpy.ndarray
        2-D array with one row per row of ``data_file``.  Columns are, in
        order: multi-hot country indicators, multi-hot genre indicators,
        runtime, director details, star details, Yahoo, PTT and YouTube
        statistics.
    rating : numpy.ndarray
        'IMDB_RATING' reshaped to ``(len(data_file), 1)``.

    Notes
    -----
    * The country/genre encoders are fitted on ``data_file`` itself, so the
      number and order of indicator columns depend on the labels present in
      this particular table.
    * Cells are passed through ``str()`` before being split on ',', so a
      non-string cell (e.g. a missing value) becomes a label of its own.
    * Earlier revisions also parsed 'DATE_TW' into integers but never used
      the result; that dead step (which could only raise on a malformed
      date) has been removed.
    """
    data_length = len(data_file)

    # Iterate over the column *values*.  Indexing with data_file[col][i] is a
    # label lookup and fails (or picks the wrong row) whenever the frame does
    # not carry a default 0..n-1 index, e.g. after filtering or shuffling.
    country_labels = [str(cell).split(',') for cell in data_file['COUNTRY']]
    genre_labels = [str(cell).split(',') for cell in data_file['IMDB_GENRE']]

    # One encoder per column; each is fitted on the labels seen in this table.
    country = MultiLabelBinarizer().fit_transform(country_labels)
    genre = MultiLabelBinarizer().fit_transform(genre_labels)

    runtime = np.array(data_file['IMDB_RUNTIME']).reshape(data_length, 1)
    dir_detail = np.array(data_file[['DIRECTOR_WINS', 'DIRECTOR_NOMINATIONS',
                                     'DIRECTOR_RATINGS']])
    star_detail = np.array(data_file[['STAR_1_WINS', 'STAR_1_NOMINATIONS',
                                      'STAR_1_RATINGS', 'STAR_2_WINS',
                                      'STAR_2_NOMINATIONS', 'STAR_2_RATINGS',
                                      'STAR_3_WINS', 'STAR_3_NOMINATIONS',
                                      'STAR_3_RATINGS']])
    yahoo = np.array(data_file[['YAHOO_EVALUATION', 'YAHOO_VOTER']])
    PTT = np.array(data_file[['PTT_ARTICLE', 'PTT_PUSH', 'PTT_ARROW',
                              'PTT_PULL', 'PTT_REPLY']])
    youtube = np.array(data_file[['YOUTUBE_VIEW', 'YOUTUBE_LIKE', 'YOUTUBE_DISLIKE']])

    # Stack every feature group side by side (column-wise).
    x_train = np.concatenate((country, genre, runtime, dir_detail, star_detail,
                              yahoo, PTT, youtube), axis=-1)
    rating = np.array(data_file['IMDB_RATING']).reshape(data_length, 1)
    return x_train, rating
def set_class(rating, g_1, g_2):
    """Convert ratings into integer class labels using two thresholds.

    Each element of ``rating`` is labelled
      * 0 when it is ``>= g_1``,
      * 1 when it is not ``>= g_1`` but is ``>= g_2``,
      * 2 in every other case.

    ``rating`` is iterated along its first axis, so a column vector of
    shape (n, 1) yields the same flat result as a 1-D array of length n.

    Returns a 1-D numpy array with one label per element of ``rating``.
    """
    def _bucket(score):
        # Thresholds are tested from the highest class boundary downwards.
        if score >= g_1:
            return 0
        return 1 if score >= g_2 else 2

    return np.array([_bucket(score) for score in rating])

In [3]:
# Load the training feature matrix and the ratings stored alongside it.
# 'y_train.npy' is bound to `rating` here; class labels are derived from it later.
x_train = np.load('x_train.npy')
rating = np.load('y_train.npy')
# Number of feature columns in the training matrix.
n_features = x_train.shape[1]

In [4]:
# Rating thresholds passed to set_class: >= g_1 -> class 0, >= g_2 -> class 1, else class 2.
g_1 = 7.0
g_2 = 6.0
y_train = set_class(rating, g_1, g_2)

In [5]:
# training normalization
# Standardise every feature column with statistics computed on the training set.
mean = np.mean(x_train, axis=0)
sigma = np.std(x_train, axis=0)
# A feature that is constant across the training set has sigma == 0.  Dividing
# by (0 + 1e-20) is harmless for the training rows (their numerator is 0), but
# the same `mean`/`sigma` are reused to scale the test set, where any differing
# value would be blown up to ~1e20.  Use a unit scale for such columns instead;
# because the fix is stored in `sigma`, the test normalization inherits it.
sigma = np.where(sigma == 0, 1.0, sigma)
x_train = (x_train-mean)/(sigma + 1e-20)

In [6]:
# Load the held-out features and ratings, then label the ratings with the
# same thresholds (g_1, g_2) that were used for the training labels.
x_test = np.load('x_test.npy')
test_rating = np.load('y_test.npy')
y_test = set_class(test_rating, g_1, g_2)

# Scale the test features with the training-set statistics (mean, sigma).
x_test = (x_test - mean) / (sigma + 1e-20)

In [7]:
def create_model(dnn, input_dim=None, n_classes=3, dropout_rate=0.3):
    """Build an (uncompiled) feed-forward softmax classifier.

    The network starts with a fixed 32-unit ReLU layer and then adds one ReLU
    layer per entry of ``dnn``.  Every one of these layers is followed by
    batch normalization and dropout.  The output layer is a softmax over
    ``n_classes`` units.

    Parameters
    ----------
    dnn : iterable of int
        Unit counts of the additional hidden layers, in order.
    input_dim : int, optional
        Number of input features.  When omitted, the width of the notebook's
        global ``x_train`` is used, which is what this function always did
        before the parameter existed.
    n_classes : int, optional
        Width of the softmax output layer (default 3).
    dropout_rate : float, optional
        Dropout rate applied after every hidden block (default 0.3).

    Returns
    -------
    keras.models.Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    if input_dim is None:
        # Backward-compatible default: infer the width from the global training matrix.
        input_dim = x_train.shape[1]

    model = Sequential()
    model.add(Dense(32, input_shape=(input_dim,), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    for units in dnn:
        model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))
    model.add(Dense(n_classes, activation='softmax'))
    return model

In [8]:
# 10-fold cross-validation: train one model per fold and keep each model's
# class-probability predictions on the test set for later aggregation.
kfold = KFold(n_splits=10, shuffle=True, random_state=5)
n_classes = 3  # set_class produces labels 0, 1, 2; create_model ends in a 3-unit softmax
k = 0
test_score = []
use_dnn = True
for train_index, valid_index in kfold.split(x_train):
    k += 1
    X, X_V = x_train[train_index], x_train[valid_index]
    Y, Y_V = y_train[train_index], y_train[valid_index]

    if use_dnn:
        # DNN
        # Pass the class count explicitly.  Without it, to_categorical sizes
        # the one-hot matrix from the largest label present, so a fold whose
        # (small) split lacks class 2 would yield only 2 columns and no longer
        # match the 3-unit softmax output.
        Y = np_utils.to_categorical(Y, n_classes)
        Y_V = np_utils.to_categorical(Y_V, n_classes)
        dnn = [32, 32, 32]
        model = create_model(dnn)
        callbacks = []
        # Stop once the validation loss has not improved for 3 epochs.
        callbacks.append(EarlyStopping(monitor='val_loss', patience=3))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X, Y, batch_size=32, validation_data=(X_V, Y_V),
                  epochs=100, callbacks=callbacks)
        test_score.append(model.predict(x_test))
    else:
        # RF & GradientBoosting
        # clf = GradientBoostingClassifier(n_estimators=40, min_samples_split=40, min_samples_leaf=3,
        #                                  max_leaf_nodes=15, max_depth=5, random_state=15)
        # clf.fit(X, Y)
        # test_score.append(clf.predict_proba(x_test))
        # print('kfold ' + str(k) +  ' acc: '+ str(clf.score(X_V, Y_V)))
        # Logistic Regression
        lr = LogisticRegression(penalty='l2',C = 0.001,random_state = 0)
        lr.fit(X, Y)
        test_score.append(lr.predict_proba(x_test))
        print('kfold ' + str(k) +  ' acc: '+ str(lr.score(X_V, Y_V)))

Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100


Epoch 4/100
Epoch 5/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Train on 159 samples, validate on 18 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Train on 160 samples, validate on 17 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Train on 160 samples, validate on 17 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100


Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 160 samples, validate on 17 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


In [9]:
# test accuracy
# Ensemble the folds by summing their per-class probabilities for each test row.
# Iterate over the collected predictions themselves rather than range(k), so
# the result does not depend on a loop counter left behind by another cell.
final_scores = np.zeros((len(x_test), 3))
for fold_scores in test_score:
    final_scores += fold_scores
# Predicted class = column with the highest summed probability (now an ndarray).
test_pred = np.argmax(final_scores, axis=1)
# Fraction of test rows whose predicted class equals the true class.
test_acc = np.mean(test_pred == y_test)
print('test_acc: ' + str(test_acc))

test_acc: 0.55
