In [1]:
import pandas as pd
import ast
from collections import Counter
import numpy as np

## DATA FORMATTING
# Step 1. Reading the data in!
# Metadata ==> id, name of movie, gross boxoffice, genre
metadata = pd.read_csv('./MovieSummaries/movie.metadata.tsv', sep='\t', 
                       header= None, 
                       usecols = [0,2,4,8],
                      names = ['id','name','gross','genre'])

# Plot summaries
with open('./MovieSummaries/plot_summaries.txt', encoding='utf8') as fp:
    plots = fp.readlines()

# Step 2. Cleaning the null values from gross box office
filter_metadata = metadata[metadata['gross'].notnull()]

# Step 3. Reading in the genre column
filter_metadata["genre"] =  filter_metadata["genre"].map(lambda d : list(ast.literal_eval(d).values()))

# Step 4: Finding the top genres
all_genres = list(filter_metadata['genre'])
all_genres_flat = [item for sublist in all_genres for item in sublist]
genre_counter = Counter(all_genres_flat)
top_genres = [x[0] for x in genre_counter.most_common(10)]

# Step 5: Filtering on top genres
keep_genres = []
for item in all_genres:
    gens = list(set(item).intersection(set(top_genres)))
    if len(gens)>0:
        keep_genres.append(gens)
    else:
        keep_genres.append(np.nan)

filter_metadata['genre'] = keep_genres
filter_metadata = filter_metadata[filter_metadata['genre'].notnull()]

# Step 6: joining plots to metadata!
plots = {x.split('\t')[0]:x.split('\t')[1] for x in plots}
filter_metadata['plots'] = [plots[str(key)] if str(key) in plots else np.nan for key in filter_metadata['id']]
filter_metadata = filter_metadata[filter_metadata['plots'].notnull()]

# WHEW!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [2]:
# Splitting data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(filter_metadata['plots'], 
                                                    filter_metadata['genre'], 
                                                    test_size = .30, 
                                                    random_state=10)

In [3]:
X_train.head(3)

18166    Robert Dean is an old-fashioned psychologist w...
66401    Detective Allen Gamble  is a mild-mannered for...
64538    A babysitting uncle tells his charges three ho...
Name: plots, dtype: object

In [4]:
y_train.head(3)

18166                                             [Comedy]
66401    [Action, Action/Adventure, Comedy, Crime Fiction]
64538                                           [Thriller]
Name: genre, dtype: object

### w2v

In [8]:
# Building word2vec model
from nltk.tokenize import PunktSentenceTokenizer
import string

sentences = []
for item in filter_metadata['plots']:
    sentences.extend([[w.translate(str.maketrans('','',string.punctuation)).strip().lower() for w in sent.split()]\
                      for sent in PunktSentenceTokenizer().tokenize(item)])

In [9]:
sentences[0]

['set',
 'in',
 'the',
 'second',
 'half',
 'of',
 'the',
 '22nd',
 'century',
 'the',
 'film',
 'depicts',
 'mars',
 'as',
 'a',
 'planet',
 'that',
 'has',
 'been',
 '84',
 'terraformed',
 'allowing',
 'humans',
 'to',
 'walk',
 'on',
 'the',
 'surface',
 'without',
 'wearing',
 'pressure',
 'suits']

In [10]:
import gensim

model = gensim.models.Word2Vec (sentences, size=150, window=10, min_count=2, workers=10)
model.train(sentences,total_examples=len(sentences),epochs=10)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [11]:
# Checking word2vec model
model.similar_by_word('tree')

  


[('rocks', 0.6076042652130127),
 ('hole', 0.606200635433197),
 ('rain', 0.6034523248672485),
 ('trees', 0.5994110107421875),
 ('lake', 0.5991060733795166),
 ('water', 0.5886483192443848),
 ('pile', 0.5873029232025146),
 ('ground', 0.5796843767166138),
 ('sand', 0.5770130157470703),
 ('pond', 0.5752089023590088)]

In [12]:
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(list(word2vec.values())[0])

    def fit(self, X, y):
        tfidf = TfidfVectorizer(stop_words='english' ,analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(y_train) 
test_labels = mlb.transform(y_test)


In [13]:
import numpy as np
import keras

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
#from keras.utils import multi_gpu_model

max_words = 1000
batch_size = 64
epochs = 20

num_classes = 10
print(num_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
x_train = tokenizer.texts_to_matrix(X_train)
x_test = tokenizer.texts_to_matrix(X_test)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

# Borrow our binarized labels from the previous model
y_train = train_labels
y_test = test_labels
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

10 classes
Vectorizing sequence data...
x_train shape: (5037,)
x_test shape: (2159,)
y_train shape: (5037, 10)
y_test shape: (2159, 10)


In [21]:
x_train[0]

array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1.,
       1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0.

In [20]:
x_train.shape

(5037, 1000)

In [14]:
X_train.head(3)

18166    Robert Dean is an old-fashioned psychologist w...
66401    Detective Allen Gamble  is a mild-mannered for...
64538    A babysitting uncle tells his charges three ho...
Name: plots, dtype: object

In [16]:
y_train[:4]

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 1, 1, 0, 0, 1, 0, 0, 0, 0]])

### Model

In [17]:
print('Building model...')
model = Sequential()
model.add(Embedding(max_words, 100, input_length= x_train.shape[1] ))
model.add(Flatten())
model.add(Dense(256, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation="sigmoid"))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Building model...


In [None]:
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])
