<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#MODELING" data-toc-modified-id="MODELING-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>MODELING</a></span></li></ul></div>

In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import functools 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
import re
import os.path
import math
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.cross_validation import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy import sparse
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import tensorflow_hub as hub


import nltk
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

sns.set_style("whitegrid")
sns.set_context("talk", font_scale=0.8)

from helper_functions import *


  from numpy.core.umath_tests import inner1d
W0313 14:59:06.422702 24140 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smaiya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smaiya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def plot_similarity(labels, features, rotation):
    unique_labels = functools.reduce(lambda l, x: l if x in l else l+[x], labels, [])
    corr = np.inner(features, features)
    fig = plt.figure(figsize=(10,10))
    sns.set(font_scale=1.2)
    g = sns.heatmap(
        corr,
        xticklabels=unique_labels,
        yticklabels=unique_labels,
        vmin=0,
        vmax=1,
        cmap="YlOrRd")
    g.set_xticklabels(unique_labels, rotation=rotation)
    ticks = np.linspace(len(features)/len(unique_labels)/2, len(features)-len(features)/len(unique_labels)/2, len(unique_labels), dtype=np.int)
    g.set_xticks(ticks)
    g.set_yticks(ticks)
    g.set_title("Semantic Textual Similarity for: {}".format(unique_labels))


def run_and_plot(messages_, labels):
    similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
    similarity_message_encodings = embed(similarity_input_placeholder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())  
        message_embeddings_ = session.run(similarity_message_encodings, feed_dict={similarity_input_placeholder: messages_})
        plot_similarity(labels, message_embeddings_, 90)

In [3]:
def get_2d_representation(words_list, labels):
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        description_embeddings  =  session.run(embed(words_list)) #Generates the sentence embeddings
        
        model = PCA(n_components = 2) # Initializing the PCA object. We use n_components = 2 to help us plot our findings in the 2D space.
        # Apply the fit_transform method of model to grains: pca_features
        pca_features = model.fit_transform(description_embeddings) # Transforms embeddings to vectors of size 2
        # Assign 0th column of pca_features: xs
        xs = pca_features[:,0] #The first component of PCA
        # Assign 1st column of pca_features: ys
        ys = pca_features[:,1] #Second component of PCA
        
        #Next, we'll plot these results
        #fig, ax = plt.subplots()
        tmp = {}
        label_idx = [tmp.setdefault(name, len(tmp)) for name in labels]
        unique_labels = functools.reduce(lambda l, x: l if x in l else l+[x], labels, [])
        df = pd.DataFrame({'x':xs, 'y': ys, 'label':label_idx})
        colors = ['r', 'b', 'g', 'm', 'k']
        
        fig, ax = plt.subplots(figsize=(10,10)) 
        for idx in range(len(set(labels))):
          ax.scatter(df[df['label']==idx].x, df[df['label']==idx].y, c=colors[idx], label=unique_labels[idx])
        
        ax.set(title='PCA Representation for Genres: {}'.format(unique_labels))
        ax.legend()
        #for i, txt in enumerate(words_list):
        #    ax.annotate(txt, (xs[i], ys[i]), fontsize=11)

# MODELING

In [4]:
mydata_train = pd.read_csv('./../Data/preprocessed/movies_genres_train_preprocessed.csv')
mydata_test = pd.read_csv('./../Data/preprocessed/movies_genres_test_preprocessed.csv')
mydata = pd.read_csv('../Data/movies_genres.csv', delimiter='\t')

train_X, train_y = mydata_train['plot'], mydata_train.drop(['title', 'plot', 'plot_lang'], axis=1)
test_X, test_y = mydata_test['plot'], mydata_test.drop(['title', 'plot', 'plot_lang'], axis=1)

category_columns = train_y.columns

In [6]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)

Instructions for updating:
Colocations handled automatically by placer.


W0313 15:09:27.741820 24140 deprecation.py:323] From C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [7]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    train_X_vector  =  session.run(embed(list(train_X))) #Generates the sentence embeddings

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0313 15:10:35.485645 24140 saver.py:1483] Saver not created because there are no variables in the graph to restore


ResourceExhaustedError: OOM when allocating tensor with shape[310433,320] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node module_apply_default/Encoder_en/Transformer/PrepareForTransformer/embedding_lookup/GatherV2_5 (defined at C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow_hub\native_module.py:517) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'module_apply_default/Encoder_en/Transformer/PrepareForTransformer/embedding_lookup/GatherV2_5', defined at:
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 499, in start
    self.io_loop.start()
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 523, in run_forever
    self._run_once()
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 1758, in _run_once
    handle._run()
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-25f71fab0278>", line 3, in <module>
    train_X_vector  =  session.run(embed(list(train_X))) #Generates the sentence embeddings
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow_hub\module.py", line 250, in __call__
    name=name)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow_hub\native_module.py", line 517, in create_apply_graph
    import_scope=relative_scope_name)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1435, in import_meta_graph
    meta_graph_or_file, clear_devices, import_scope, **kwargs)[0]
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1457, in _import_meta_graph_with_return_elements
    **kwargs))
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\meta_graph.py", line 806, in import_scoped_meta_graph_with_return_elements
    return_elements=return_elements)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\importer.py", line 442, in import_graph_def
    _ProcessNewOps(graph)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\importer.py", line 235, in _ProcessNewOps
    for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3433, in _add_new_tf_operations
    for c_op in c_api_util.new_tf_operations(self)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3433, in <listcomp>
    for c_op in c_api_util.new_tf_operations(self)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3325, in _create_op_from_tf_operation
    ret = Operation(c_op, self)
  File "C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[310433,320] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node module_apply_default/Encoder_en/Transformer/PrepareForTransformer/embedding_lookup/GatherV2_5 (defined at C:\Users\smaiya\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow_hub\native_module.py:517) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

