In [176]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from tflearn.data_utils import to_categorical, pad_sequences
import numpy as np
from sklearn.neural_network import MLPClassifier

import tflearn

In [2]:
df = pd.read_pickle('genreClassification_multiclass.pkl')

In [3]:
X = df.drop('genres', axis=1)
y = df['genres']

In [4]:
def transformGenre(row):
    return row.split('|')

In [5]:
def binarizeGenre(row):
    binarized = 0 * 19
    for cat in to_categorical(le.transform(row), nb_classes=19):
        binarized += cat
    return binarized

In [6]:
X = df.drop('genres', axis=1)
y = df['genres']

In [7]:
y = y.apply(transformGenre)

In [8]:
genreNames = []
for genres in y:
    for genre in genres:
        genreNames.append(genre)
genreNames = list(set(genreNames))
genreNames.sort()

# Converting labels to binary vectors
le = preprocessing.LabelEncoder()
le = le.fit(list(genreNames))

In [9]:
y.head()

49695     [Adventure, Children, Fantasy]
99590          [Action, Crime, Thriller]
170830          [Comedy, Drama, Romance]
203230                    [Crime, Drama]
241291                          [Comedy]
Name: genres, dtype: object

In [10]:
y = y.apply(binarizeGenre)

In [11]:
y.head()

49695     [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
99590     [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
170830    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...
203230    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...
241291    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
Name: genres, dtype: object

In [12]:
t = np.ndarray((0,19))
for z in y.iteritems():
    t = np.concatenate((t, z[1].reshape(1,19)), axis=0)
t.shape

(1211, 19)

In [13]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X, t, test_size=0.33, random_state=42)

In [154]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train['subtitles'])
print X_train_counts.shape

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print X_train_tfidf.shape

X_test_counts = count_vect.transform(X_test['subtitles'])
print X_test_counts.shape

X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print X_test_tfidf.shape

(811, 100)
(400, 100)


### Vectorization for DNN

In [321]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', max_df=0.9)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['subtitles'])
print X_train_tfidf.shape

X_test_tfidf = tfidf_vectorizer.fit_transform(X_test['subtitles'])
print X_test_tfidf.shape

(811, 1000)
(400, 1000)


In [322]:
tfidf_vectorizer.get_feature_names()

[u'00',
 u'000',
 u'10',
 u'100',
 u'12',
 u'15',
 u'20',
 u'25',
 u'30',
 u'50',
 u'aah',
 u'able',
 u'absolutely',
 u'accept',
 u'accident',
 u'account',
 u'act',
 u'action',
 u'actually',
 u'address',
 u'afraid',
 u'afternoon',
 u'age',
 u'agent',
 u'ago',
 u'agree',
 u'ah',
 u'ahead',
 u'aii',
 u'ain',
 u'air',
 u'alive',
 u'amazing',
 u'america',
 u'american',
 u'angel',
 u'answer',
 u'anybody',
 u'anymore',
 u'apartment',
 u'appreciate',
 u'area',
 u'aren',
 u'arm',
 u'arms',
 u'army',
 u'arrest',
 u'art',
 u'ask',
 u'asked',
 u'asking',
 u'asleep',
 u'ass',
 u'asshole',
 u'attack',
 u'attention',
 u'baby',
 u'bad',
 u'bag',
 u'ball',
 u'balls',
 u'band',
 u'bank',
 u'bar',
 u'bastard',
 u'bathroom',
 u'bear',
 u'beat',
 u'beautiful',
 u'bed',
 u'beer',
 u'begin',
 u'beginning',
 u'bell',
 u'ben',
 u'best',
 u'bet',
 u'big',
 u'billy',
 u'bird',
 u'birthday',
 u'bit',
 u'bitch',
 u'bite',
 u'black',
 u'blood',
 u'bloody',
 u'blow',
 u'blue',
 u'boat',
 u'bob',
 u'bobby',
 u'body'

In [323]:
X_train_tfidf[2,:]

<1x1000 sparse matrix of type '<type 'numpy.float64'>'
	with 447 stored elements in Compressed Sparse Row format>

In [315]:
# Converting sparse matrix to list of features
X_train_vec = []
for vec in X_train_tfidf:
    X_train_vec.append(list(vec.indices))
    
X_test_vec = []
for vec in X_test_tfidf:
    X_test_vec.append(list(vec.indices))

In [316]:
# Sequence padding
X_train_vec = pad_sequences(X_train_vec, maxlen=1000, value=0.)
X_train_vec = pad_sequences(X_train_vec, maxlen=1000, value=0.)

X_test_vec = pad_sequences(X_test_vec, maxlen=1000, value=0.)
X_test_vec = pad_sequences(X_test_vec, maxlen=1000, value=0.)

In [317]:
X_train_vec.shape

(811, 1000)

## Neural Network - Multi-layer Perceptron

In [73]:
mlp = MLPClassifier(hidden_layer_sizes=(200, 200))

In [74]:
multi_target_mlp = MultiOutputClassifier(mlp, n_jobs=1)

In [75]:
predicted_mlp = multi_target_mlp.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

In [76]:
print metrics.classification_report(y_test, predicted_mlp, le.transform(le.classes_), le.classes_)

             precision    recall  f1-score   support

     Action       0.79      0.42      0.55        99
  Adventure       1.00      0.11      0.19        65
  Animation       0.00      0.00      0.00        16
   Children       0.80      0.17      0.28        24
     Comedy       0.75      0.39      0.51       135
      Crime       0.47      0.12      0.19        68
Documentary       0.00      0.00      0.00         8
      Drama       0.65      0.81      0.72       203
    Fantasy       0.57      0.10      0.17        41
  Film-Noir       0.00      0.00      0.00         2
     Horror       0.59      0.22      0.32        45
       IMAX       0.00      0.00      0.00         4
    Musical       0.50      0.08      0.13        13
    Mystery       0.60      0.07      0.13        40
    Romance       0.41      0.10      0.16        70
     Sci-Fi       0.91      0.24      0.38        41
   Thriller       0.77      0.24      0.36       114
        War       0.67      0.16      0.26   

## Random Forest

In [66]:
rf = RandomForestClassifier(n_estimators=100, random_state=1)

In [67]:
multi_target_rf = MultiOutputClassifier(rf, n_jobs=1)

In [68]:
predicted_rf = multi_target_rf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [72]:
print metrics.classification_report(y_test, predicted_rf, le.transform(le.classes_), le.classes_)

             precision    recall  f1-score   support

     Action       0.83      0.20      0.33        99
  Adventure       1.00      0.09      0.17        65
  Animation       0.00      0.00      0.00        16
   Children       0.00      0.00      0.00        24
     Comedy       0.76      0.35      0.48       135
      Crime       0.43      0.09      0.15        68
Documentary       0.00      0.00      0.00         8
      Drama       0.68      0.52      0.59       203
    Fantasy       1.00      0.10      0.18        41
  Film-Noir       0.00      0.00      0.00         2
     Horror       0.57      0.18      0.27        45
       IMAX       0.00      0.00      0.00         4
    Musical       0.00      0.00      0.00        13
    Mystery       0.50      0.05      0.09        40
    Romance       0.29      0.06      0.10        70
     Sci-Fi       0.86      0.15      0.25        41
   Thriller       0.71      0.13      0.22       114
        War       0.50      0.08      0.14   

## Deep Neural Network - LSTM

In [318]:
temp = X_train_vec.tolist()

In [327]:
tt = set()
for i in [0,810]:
    for t in X_train_vec[i]:
        tt.add(t)
print sorted(set(tt))


[0, 1, 5, 7, 8, 9, 10, 11, 15, 16, 17, 20, 28, 33, 35, 36, 37, 39, 41, 43, 44, 45, 46, 47, 53, 54, 55, 59, 61, 63, 64, 65, 66, 67, 69, 72, 74, 77, 78, 82, 84, 85, 87, 90, 92, 93, 95, 97, 103, 106, 108, 113, 115, 117, 118, 119, 123, 124, 127, 129, 135, 137, 141, 142, 146, 147, 148, 149, 150, 157, 160, 161, 163, 170, 171, 172, 178, 184, 185, 186, 188, 189, 190, 193, 194, 195, 196, 204, 207, 210, 211, 212, 213, 214, 215, 221, 222, 223, 224, 226, 231, 232, 234, 236, 238, 239, 246, 247, 248, 255, 256, 257, 258, 261, 263, 265, 267, 269, 270, 271, 272, 273, 274, 275, 277, 278, 279, 280, 283, 286, 290, 291, 292, 294, 298, 299, 300, 301, 308, 309, 310, 311, 313, 314, 315, 316, 318, 320, 323, 324, 336, 337, 338, 339, 341, 343, 344, 347, 350, 352, 354, 356, 357, 358, 360, 362, 366, 368, 369, 370, 382, 385, 387, 388, 390, 393, 394, 397, 398, 399, 400, 401, 402, 403, 405, 407, 411, 412, 413, 415, 417, 419, 420, 422, 423, 424, 427, 429, 431, 432, 435, 436, 437, 439, 443, 444, 450, 452, 453, 463, 467

In [213]:
# Network building
net = tflearn.input_data([None, 1000])
net = tflearn.embedding(net, input_dim=1000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 19, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

In [214]:
# Training
dnn = tflearn.DNN(net, tensorboard_verbose=0, tensorboard_dir='./logs/')

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
Instructions for updating:
Please switch to tf.summary.merge.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tag

predicted_dnn = []
for c in le.transform(le.classes_):
    print 'Fitting the model for', le.inverse_transform(c)
    dnn.fit(X_train_vec, y_train[:,c], validation_set=0.1, show_metric=True, batch_size=32, run_id='dnn')
    predicted_dnn.append(multi_target_dnn.predict(X_test_vec))

In [220]:
dnn.fit(X_train_vec, y_train.transpose().tolist(), validation_set=0.1,
        show_metric=True, batch_size=32, run_id='dnn')

IndexError: list index out of range

In [212]:
net.get_shape

<bound method Tensor.get_shape of <tf.Tensor 'FullyConnected_3/Softmax:0' shape=(?, 19) dtype=float32>>

In [208]:
y_train.transpose().tolist()

[[0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,