<a href="https://colab.research.google.com/github/slin35/RobotProducer/blob/main/lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
#!unzip data.zip

'''
!rm -rf Cast/
!rm train_data.csv
!rm test_data.csv
!rm bert_genres_prediction.csv
!rm gnb_prediction_for_director.csv
'''

'\n!rm -rf Cast/\n!rm train_data.csv\n!rm test_data.csv\n!rm bert_genres_prediction.csv\n!rm gnb_prediction_for_director.csv\n'

In [51]:
!pip install transformers
!pip install bert-tensorflow==1.0.1



In [37]:
%tensorflow_version 1.x
import tensorflow as tf
import tensorflow_hub as hub

from transformers import pipeline

import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import os
import collections
from datetime import datetime

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
from bert import run_classifier_with_tfhub

In [38]:


# predict the title of the movie
def get_title(overview: str):

  question_answerer = pipeline('question-answering')
  answer = question_answerer({
    'question': 'What is the name of the movie ?',
    'context': overview
  })

  return answer['answer']


In [39]:
def get_data():
  # import data
  data1 = pd.read_csv('./tmdb_5000_movies.csv')
  data2 = pd.read_csv('./tmdb_5000_credits.csv')

  data1 = data1[['genres', 'title', 'overview', 'id']]
  data2 = data2[['movie_id', 'title', 'cast', 'crew']]

  data = pd.merge(data1, data2, left_on='id', right_on='movie_id', how='inner')
  data = data[['id', 'title_x', 'overview', 'genres', 'cast', 'crew']]
  data.rename({'title_x': 'title'}, axis=1, inplace=True)

  # cleanup genres
  data['genres'] = [json.loads(i) if i != [] else [] for i in data['genres']]
  data['genres'] = [[j['name'] if 'name' in j else [] for j in i] for i in data['genres']]

  # cleanup cast
  data['cast'] = [json.loads(i) if i != [] else [] for i in data['cast']]
  data['cast'] = [[j['name'] if 'name' in j else [] for j in i] for i in data['cast']]

  # cleanup crew
  data['crew'] = [json.loads(i) if i != [] else [] for i in data['crew']]
  data['crew'] = [[j['name']  for j in i if 'job' in j and j['job'] == 'Director'] for i in data['crew']]

  # get rid of rows with empty values in overview, genres, cast, crew
  data.replace("", float("NaN"), inplace=True)
  data.dropna(inplace=True)
  data = data[~data.genres.str.len().eq(0)]
  data = data[~data.cast.str.len().eq(0)]
  data = data[~data.crew.str.len().eq(0)]

  return data

# get rid of data if grouped by label that has length <= 1
def cleanup(data: pd.DataFrame, label: str):
  result = []
  res = []
  indices = []
  for idx, row in data.iterrows():
    for item in row[label]:
      if item not in res:
        res.append(item)
      else:
        if idx not in indices:
          result.append(row)
          indices.append(idx)
  result_df = pd.DataFrame(result)
  result_df.columns = data.columns
  return result_df

# returning all unique labels from a column
def get_unique(data: pd.DataFrame, label: str):
  result = []
  [[result.append(item) for item in row if item not in result] for row in data[label]]
  return result

In [40]:

# BERT-based, uncased model: 12-layer, 768-hidden, 12-heads, 110M parematers
BERT_VOCAB= './uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = './uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = './uncased_L-12_H-768_A-12/bert_config.json'


# BERT-large, uncased model: 24-layer, 1024-hidden, 16-heads, 340M parameters
#BERT_VOCAB= './uncased_L-24_H-1024_A-16/vocab.txt'
#BERT_INIT_CHKPNT = './uncased_L-24_H-1024_A-16/bert_model.ckpt'
#BERT_CONFIG = './uncased_L-24_H-1024_A-16/bert_config.json'

def get_tokenizer():
  tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
  return tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=True)
  

# InputExample for object with multiple labels
class InputExample(object):
  def __init__(self, guid, text_a, text_b=None, labels=None):
    self.guid = guid
    self.text_a = text_a
    self.text_b = text_b
    self.labels = labels

# InputFeature for object with multiple labels
class InputFeatures(object):
  def __init__(self, input_ids, input_mask, segment_ids, label_ids, is_real_example=True):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_ids = label_ids
    self.is_real_example = is_real_example

# create InputExamples
def create_examples(df: pd.DataFrame, list_of_train_labels: list, num_labels: int, set_labels=True):
  examples = []
  
  for index, row in df.iterrows():
    guid = row['id']
    text_a = row['overview']
    if set_labels:
      labels = list_of_train_labels[index]
    else:
      labels = [0] * num_labels
    examples.append(InputExample(guid=guid, text_a=text_a, labels=labels))
  return examples

# for a single sequence convert_example_to_features
def convert_examples_to_features(examples, max_seq_length, tokenizer):
  features = []

  for (idx, example) in enumerate(examples):

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None;

    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)           # its a single sequence, so segment_ids are all 0

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    input_mask = [1] * len(input_ids)

    # padding up to sequence length
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding 
    segment_ids += padding 

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    labels_ids = []
    
    for label in example.labels:
        labels_ids.append(int(label))

    feature = InputFeatures(input_ids, input_mask, segment_ids, labels_ids)
    features.append(feature)

  return features


In [41]:

def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        
        # probabilities = tf.nn.softmax(logits, axis=-1) ### multiclass case
        probabilities = tf.nn.sigmoid(logits)#### multi-label case
        
        labels = tf.cast(labels, tf.float32)
        tf.logging.info("num_labels:{};logits:{};labels:{}".format(num_labels, logits, labels))
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
        loss = tf.reduce_mean(per_example_loss)

        # probabilities = tf.nn.softmax(logits, axis=-1)
        # log_probs = tf.nn.log_softmax(logits, axis=-1)
        #
        # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        #
        # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        # loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probabilities)


def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """Returns `model_fn` closure for TPUEstimator."""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        #tf.logging.info("*** Features ***")
        #for name in sorted(features.keys()):
        #    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_real_example = None
        if "is_real_example" in features:
             is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
        else:
             is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss, logits, probabilities) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            #tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, probabilities, is_real_example):

                logits_split = tf.split(probabilities, num_labels, axis=-1)
                label_ids_split = tf.split(label_ids, num_labels, axis=-1)
                # metrics change to auc of every class
                eval_dict = {}
                for j, logits in enumerate(logits_split):
                    label_id_ = tf.cast(label_ids_split[j], dtype=tf.int32)
                    current_auc, update_op_auc = tf.metrics.auc(label_id_, logits)
                    eval_dict[str(j)] = (current_auc, update_op_auc)
                eval_dict['eval_loss'] = tf.metrics.mean(values=per_example_loss)
                return eval_dict

                ## original eval metrics
                # predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                # accuracy = tf.metrics.accuracy(
                #     labels=label_ids, predictions=predictions, weights=is_real_example)
                # loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
                # return {
                #     "eval_accuracy": accuracy,
                #     "eval_loss": loss,
                # }

            eval_metrics = metric_fn(per_example_loss, label_ids, probabilities, is_real_example)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metrics,
                scaffold=scaffold_fn)
        else:
            print("mode:", mode,"probabilities:", probabilities)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={"probabilities": probabilities},
                scaffold=scaffold_fn)
        return output_spec

    return model_fn

In [42]:

class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    battches could cause silent errors.
    """
    
    
def convert_single_example(ex_index, example, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_ids=0,
            is_real_example=False)

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    labels_ids = []
    for label in example.labels:
        labels_ids.append(int(label))


    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=labels_ids,
        is_real_example=True)
    return feature


def file_based_convert_examples_to_features(
        examples, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file."""

    writer = tf.python_io.TFRecordWriter(output_file)

    for (ex_index, example) in enumerate(examples):
        #if ex_index % 10000 == 0:
            #tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        feature = convert_single_example(ex_index, example,
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])
        if isinstance(feature.label_ids, list):
            label_ids = feature.label_ids
        else:
            label_ids = feature.label_ids[0]
        features["label_ids"] = create_int_feature(label_ids)

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()


def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder, num_labels):
    """Creates an `input_fn` closure to be passed to TPUEstimator."""

    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([num_labels], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t

        return example

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

        return d

    return input_fn


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [43]:
def input_fn_builder(features, seq_length, is_training, drop_remainder):
  """Creates an `input_fn` closure to be passed to TPUEstimator."""

  all_input_ids = []
  all_input_mask = []
  all_segment_ids = []
  all_label_ids = []

  for feature in features:
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_segment_ids.append(feature.segment_ids)
    all_label_ids.append(feature.label_ids)

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    num_examples = len(features)

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "segment_ids":
            tf.constant(
                all_segment_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "label_ids":
            tf.constant(all_label_ids, shape=[num_examples, len(LABEL_COLUMNS)], dtype=tf.int32),
    })

    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
    return d

  return input_fn

In [59]:
def create_output(predictions, list_of_labels, k):
    probabilities = []
    predict = []
    for (i, prediction) in enumerate(predictions):
        preds = prediction["probabilities"]
        probabilities.append(preds)
        if len(preds) == 0:
          predict.append([])
        elif len(preds) <= k:
          predict.append(preds)
        else:
          indices = np.argpartition(preds, -k)[-k:]
          predict.append([list_of_labels[i] for i in indices])
    dff = pd.DataFrame(probabilities)
    dff.columns = list_of_labels
    dff.insert(0, 'prediction', predict)
    dff.insert(0, 'probabilities', probabilities)
    
    return dff, probabilities

def get_accuracy(df: pd.DataFrame, label_column: str, num_test_examples: int):
  correct = 0
  for idx, row in df.iterrows():
    predictions = row['prediction']
    actual_labels = row[label_column]

    for prediction in predictions:
      if prediction in actual_labels:
        correct += 1
  
  return correct / num_test_examples

def get_accuracy_topk(df: pd.DataFrame, label_column: str, num_test_examples: int, k: int):
  correct = 0
  for idx, row in df.iterrows():
    predictions = row['prediction']
    actual_labels = row[label_column][:5]
    for prediction in predictions:
      if prediction in actual_labels:
        correct += 1
  
  return correct / num_test_examples

In [60]:
MAX_SEQ_LENGTH = 128

# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 16
#LEARNING_RATE = 2e-5
LEARNING_RATE = 3e-5
NUM_TRAIN_EPOCHS = 10.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.3
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [44]:
data = get_data()
data = cleanup(data, 'cast')
data = cleanup(data, 'crew')

'''
# cleanup genres
data['genres'] = [i.replace('[', '').replace(']', '').replace('\'', '').split(', ') for i in data['genres']]

# cleanup cast
data['cast'] = [i.replace('[', '').replace(']', '').replace('\'', '').split(', ') for i in data['cast']]

# cleanup crew
data['crew'] = [i.replace('[', '').replace(']', '').replace('\'', '').split(', ') for i in data['crew']]


# get rid of rows with empty values in overview, genres, cast, crew
data.replace("", float("NaN"), inplace=True)
data.dropna(inplace=True)
data = data[~data.genres.str.len().eq(0)]
data = data[~data.cast.str.len().eq(0)]
data = data[~data.crew.str.len().eq(0)]
'''

# get unique genres
genres = get_unique(data, 'genres')

# get unique directors
directors = get_unique(data, 'crew')

# get unique cast members
cast = get_unique(data, 'cast')

# splitting data into training and test set
train, test = train_test_split(data, test_size = 0.3)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

train.to_csv("train_data.csv", index=False)
test.to_csv("test_data.csv", index=False)

tokenizer = get_tokenizer()

print(f'size of the training set: {len(train)}, size of the test set: {len(test)}')


size of the training set: 1708, size of the test set: 732


In [62]:
# predict cast from overview

LABEL_COLUMN = 'cast'
LABELS = cast
NUM_LABELS = len(LABELS)
OUTPUT_DIR = './Cast'

if not os.path.exists(OUTPUT_DIR):
  os.makedirs(OUTPUT_DIR)

def get_list_of_train_labels():
  result = []
  for i, row in train.iterrows():
    labels = [0] * NUM_LABELS
    for item in row[LABEL_COLUMN]:
      idx = LABELS.index(item)
      labels[idx] = 1
    result.append(labels)
  return result
  
list_of_train_labels = get_list_of_train_labels()

train_inputExamples = create_examples(df=train, list_of_train_labels=list_of_train_labels, num_labels=NUM_LABELS)
test_inputExamples = create_examples(df=test, list_of_train_labels=list_of_train_labels, num_labels=NUM_LABELS, set_labels=False)

train_features = convert_examples_to_features(train_inputExamples, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_features(test_inputExamples, MAX_SEQ_LENGTH, tokenizer)

num_train_steps = int(len(train_inputExamples) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
if not os.path.exists(train_file):
  open(train_file, 'w').close()

file_based_convert_examples_to_features(train_inputExamples, MAX_SEQ_LENGTH, tokenizer, train_file)

tf.logging.info("***** Running training *****")
tf.logging.info("  Num examples = %d", len(train_inputExamples))
tf.logging.info("  Batch size = %d", BATCH_SIZE)
tf.logging.info("  Num steps = %d", num_train_steps)

train_input_fn = file_based_input_fn_builder(train_file, MAX_SEQ_LENGTH, is_training=True, drop_remainder=True, num_labels=NUM_LABELS)

run_config = tf.estimator.RunConfig(
    model_dir = OUTPUT_DIR,
    save_summary_steps = SAVE_SUMMARY_STEPS,
    keep_checkpoint_max = 1,
    save_checkpoints_steps = SAVE_CHECKPOINTS_STEPS
)

bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

model_fn = model_fn_builder(
  bert_config=bert_config,
  num_labels= NUM_LABELS,
  init_checkpoint=BERT_INIT_CHKPNT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=False,
  use_one_hot_embeddings=False)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})


INFO:tensorflow:***** Running training *****


INFO:tensorflow:***** Running training *****


INFO:tensorflow:  Num examples = 1708


INFO:tensorflow:  Num examples = 1708


INFO:tensorflow:  Batch size = 16


INFO:tensorflow:  Batch size = 16


INFO:tensorflow:  Num steps = 1067


INFO:tensorflow:  Num steps = 1067


INFO:tensorflow:Using config: {'_model_dir': './Cast', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f87c8a76390>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': './Cast', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f87c8a76390>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [63]:
print('Beginning Training!')
curtime = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training time elapsed: ", datetime.now() - curtime)

Beginning Training!
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.







INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.




















INFO:tensorflow:num_labels:35326;logits:Tensor("loss/BiasAdd:0", shape=(16, 35326), dtype=float32);labels:Tensor("loss/Cast:0", shape=(16, 35326), dtype=float32)


INFO:tensorflow:num_labels:35326;logits:Tensor("loss/BiasAdd:0", shape=(16, 35326), dtype=float32);labels:Tensor("loss/Cast:0", shape=(16, 35326), dtype=float32)


INFO:tensorflow:**** Trainable Variables ****


INFO:tensorflow:**** Trainable Variables ****














INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 0 into ./Cast/model.ckpt.


INFO:tensorflow:Saving checkpoints for 0 into ./Cast/model.ckpt.


INFO:tensorflow:loss = 0.7067614, step = 0


INFO:tensorflow:loss = 0.7067614, step = 0


INFO:tensorflow:global_step/sec: 1.68237


INFO:tensorflow:global_step/sec: 1.68237


INFO:tensorflow:loss = 0.36288947, step = 100 (59.445 sec)


INFO:tensorflow:loss = 0.36288947, step = 100 (59.445 sec)


INFO:tensorflow:global_step/sec: 1.99433


INFO:tensorflow:global_step/sec: 1.99433


INFO:tensorflow:loss = 0.09882835, step = 200 (50.140 sec)


INFO:tensorflow:loss = 0.09882835, step = 200 (50.140 sec)


INFO:tensorflow:global_step/sec: 1.94819


INFO:tensorflow:global_step/sec: 1.94819


INFO:tensorflow:loss = 0.04555868, step = 300 (51.334 sec)


INFO:tensorflow:loss = 0.04555868, step = 300 (51.334 sec)


INFO:tensorflow:global_step/sec: 1.95646


INFO:tensorflow:global_step/sec: 1.95646


INFO:tensorflow:loss = 0.02883321, step = 400 (51.112 sec)


INFO:tensorflow:loss = 0.02883321, step = 400 (51.112 sec)


INFO:tensorflow:Saving checkpoints for 500 into ./Cast/model.ckpt.


INFO:tensorflow:Saving checkpoints for 500 into ./Cast/model.ckpt.


INFO:tensorflow:global_step/sec: 1.58245


INFO:tensorflow:global_step/sec: 1.58245


INFO:tensorflow:loss = 0.023956275, step = 500 (63.190 sec)


INFO:tensorflow:loss = 0.023956275, step = 500 (63.190 sec)


INFO:tensorflow:global_step/sec: 1.93878


INFO:tensorflow:global_step/sec: 1.93878


INFO:tensorflow:loss = 0.02107823, step = 600 (51.580 sec)


INFO:tensorflow:loss = 0.02107823, step = 600 (51.580 sec)


INFO:tensorflow:global_step/sec: 1.96306


INFO:tensorflow:global_step/sec: 1.96306


INFO:tensorflow:loss = 0.019376136, step = 700 (50.944 sec)


INFO:tensorflow:loss = 0.019376136, step = 700 (50.944 sec)


INFO:tensorflow:global_step/sec: 1.94972


INFO:tensorflow:global_step/sec: 1.94972


INFO:tensorflow:loss = 0.016932001, step = 800 (51.286 sec)


INFO:tensorflow:loss = 0.016932001, step = 800 (51.286 sec)


INFO:tensorflow:global_step/sec: 1.95393


INFO:tensorflow:global_step/sec: 1.95393


INFO:tensorflow:loss = 0.01769484, step = 900 (51.181 sec)


INFO:tensorflow:loss = 0.01769484, step = 900 (51.181 sec)


INFO:tensorflow:Saving checkpoints for 1000 into ./Cast/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1000 into ./Cast/model.ckpt.


INFO:tensorflow:global_step/sec: 1.55185


INFO:tensorflow:global_step/sec: 1.55185


INFO:tensorflow:loss = 0.015264658, step = 1000 (64.435 sec)


INFO:tensorflow:loss = 0.015264658, step = 1000 (64.435 sec)


INFO:tensorflow:Saving checkpoints for 1067 into ./Cast/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1067 into ./Cast/model.ckpt.


INFO:tensorflow:Loss for final step: 0.016286362.


INFO:tensorflow:Loss for final step: 0.016286362.


Training time elapsed:  0:10:58.217259


In [64]:

test_file = os.path.join(OUTPUT_DIR, "test.tf_record")
if not os.path.exists(test_file):
  open(test_file, 'w').close()

file_based_convert_examples_to_features(test_inputExamples, MAX_SEQ_LENGTH, tokenizer, test_file)

predict_input_fn = file_based_input_fn_builder(
    input_file=test_file, 
    seq_length=MAX_SEQ_LENGTH, 
    is_training=False,
    drop_remainder=False,
    num_labels=NUM_LABELS)

print('Beginning predictions!')
curtime = datetime.now()
predictions = estimator.predict(predict_input_fn)
print("Predictioin time elapsed: ", datetime.now() - curtime)



Beginning predictions!
Predictioin time elapsed:  0:00:00.000079


In [65]:
output_df, probabilities = create_output(predictions, LABELS, 10)

output_df['id'] = np.asarray(test['id'])
result_df = pd.merge(test, output_df, on='id', how='inner')
#result_df.to_csv("bert_genres_prediction.csv", index=False)
#result_df.to_csv("bert_directior_prediction.csv", index=False)
result_df.to_csv("bert_cast_prediction.csv", index=False)
accuracy = get_accuracy(result_df, LABEL_COLUMN, len(test))
print(f'cast prediction: {accuracy * len(test)} correct predictions in {len(test)} test cases with an accuracy of {accuracy:.9f}')
top5_accuracy = get_accuracy_topk(result_df, LABEL_COLUMN, len(test), 5)
accuracy = get_accuracy(result_df, LABEL_COLUMN, len(test))
print(f'cast prediction: {accuracy * len(test)} correct predictions appear top5 of cast list in {len(test)} test cases with an accuracy of {accuracy:.9f}')





INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:num_labels:35326;logits:Tensor("loss/BiasAdd:0", shape=(?, 35326), dtype=float32);labels:Tensor("loss/Cast:0", shape=(?, 35326), dtype=float32)


INFO:tensorflow:num_labels:35326;logits:Tensor("loss/BiasAdd:0", shape=(?, 35326), dtype=float32);labels:Tensor("loss/Cast:0", shape=(?, 35326), dtype=float32)


INFO:tensorflow:**** Trainable Variables ****


INFO:tensorflow:**** Trainable Variables ****


mode: infer probabilities: Tensor("loss/Sigmoid:0", shape=(?, 35326), dtype=float32)
INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from ./Cast/model.ckpt-1067


INFO:tensorflow:Restoring parameters from ./Cast/model.ckpt-1067


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


cast prediction: 40.0 correct predictions in 732 test cases with an accuracy of 0.054644809
cast prediction: 40.0 correct predictions appear top5 of cast list in 732 test cases with an accuracy of 0.054644809


In [66]:
# predict genres from overview

LABEL_COLUMN = 'genres'
LABELS = genres
NUM_LABELS = len(LABELS)
OUTPUT_DIR = './Genres'

if not os.path.exists(OUTPUT_DIR):
  os.makedirs(OUTPUT_DIR)

def get_list_of_train_labels():
  result = []
  for i, row in train.iterrows():
    labels = [0] * NUM_LABELS
    for item in row[LABEL_COLUMN]:
      idx = LABELS.index(item)
      labels[idx] = 1
    result.append(labels)
  return result
  
list_of_train_labels = get_list_of_train_labels()

train_inputExamples = create_examples(df=train, list_of_train_labels=list_of_train_labels, num_labels=NUM_LABELS)
test_inputExamples = create_examples(df=test, list_of_train_labels=list_of_train_labels, num_labels=NUM_LABELS, set_labels=False)

train_features = convert_examples_to_features(train_inputExamples, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_features(test_inputExamples, MAX_SEQ_LENGTH, tokenizer)

num_train_steps = int(len(train_inputExamples) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

train_file = os.path.join(OUTPUT_DIR, "train.tf_record")
if not os.path.exists(train_file):
  open(train_file, 'w').close()

file_based_convert_examples_to_features(train_inputExamples, MAX_SEQ_LENGTH, tokenizer, train_file)

tf.logging.info("***** Running training *****")
tf.logging.info("  Num examples = %d", len(train_inputExamples))
tf.logging.info("  Batch size = %d", BATCH_SIZE)
tf.logging.info("  Num steps = %d", num_train_steps)

train_input_fn = file_based_input_fn_builder(train_file, MAX_SEQ_LENGTH, is_training=True, drop_remainder=True, num_labels=NUM_LABELS)

run_config = tf.estimator.RunConfig(
    model_dir = OUTPUT_DIR,
    save_summary_steps = SAVE_SUMMARY_STEPS,
    keep_checkpoint_max = 1,
    save_checkpoints_steps = SAVE_CHECKPOINTS_STEPS
)

bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

model_fn = model_fn_builder(
  bert_config=bert_config,
  num_labels= NUM_LABELS,
  init_checkpoint=BERT_INIT_CHKPNT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=False,
  use_one_hot_embeddings=False)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})


print('Beginning Training!')
curtime = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training time elapsed: ", datetime.now() - curtime)


test_file = os.path.join(OUTPUT_DIR, "test.tf_record")
if not os.path.exists(test_file):
  open(test_file, 'w').close()

file_based_convert_examples_to_features(test_inputExamples, MAX_SEQ_LENGTH, tokenizer, test_file)

predict_input_fn = file_based_input_fn_builder(
    input_file=test_file, 
    seq_length=MAX_SEQ_LENGTH, 
    is_training=False,
    drop_remainder=False,
    num_labels=NUM_LABELS)

print('Beginning predictions!')
curtime = datetime.now()
predictions = estimator.predict(predict_input_fn)
print("Predictioin time elapsed: ", datetime.now() - curtime)

output_df, probabilities = create_output(predictions, LABELS, 5)

output_df['id'] = np.asarray(test['id'])
result_df = pd.merge(test, output_df, on='id', how='inner')
result_df.to_csv("bert_genres_prediction.csv", index=False)
#result_df.to_csv("bert_directior_prediction.csv", index=False)
accuracy = get_accuracy(result_df, LABEL_COLUMN, len(test))
print(f'genres prediction: {accuracy * len(test)} correct predictions in {len(test)} test cases')


INFO:tensorflow:***** Running training *****


INFO:tensorflow:***** Running training *****


INFO:tensorflow:  Num examples = 1708


INFO:tensorflow:  Num examples = 1708


INFO:tensorflow:  Batch size = 16


INFO:tensorflow:  Batch size = 16


INFO:tensorflow:  Num steps = 1067


INFO:tensorflow:  Num steps = 1067


INFO:tensorflow:Using config: {'_model_dir': './Genres', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f87d02a9990>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': './Genres', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f87d02a9990>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


Beginning Training!




INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:num_labels:20;logits:Tensor("loss/BiasAdd:0", shape=(16, 20), dtype=float32);labels:Tensor("loss/Cast:0", shape=(16, 20), dtype=float32)


INFO:tensorflow:num_labels:20;logits:Tensor("loss/BiasAdd:0", shape=(16, 20), dtype=float32);labels:Tensor("loss/Cast:0", shape=(16, 20), dtype=float32)


INFO:tensorflow:**** Trainable Variables ****


INFO:tensorflow:**** Trainable Variables ****


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 0 into ./Genres/model.ckpt.


INFO:tensorflow:Saving checkpoints for 0 into ./Genres/model.ckpt.


INFO:tensorflow:loss = 0.74215263, step = 0


INFO:tensorflow:loss = 0.74215263, step = 0


INFO:tensorflow:global_step/sec: 1.65125


INFO:tensorflow:global_step/sec: 1.65125


INFO:tensorflow:loss = 0.32352966, step = 100 (60.564 sec)


INFO:tensorflow:loss = 0.32352966, step = 100 (60.564 sec)


INFO:tensorflow:global_step/sec: 2.04277


INFO:tensorflow:global_step/sec: 2.04277


INFO:tensorflow:loss = 0.28237674, step = 200 (48.953 sec)


INFO:tensorflow:loss = 0.28237674, step = 200 (48.953 sec)


INFO:tensorflow:global_step/sec: 2.04999


INFO:tensorflow:global_step/sec: 2.04999


INFO:tensorflow:loss = 0.25112695, step = 300 (48.779 sec)


INFO:tensorflow:loss = 0.25112695, step = 300 (48.779 sec)


INFO:tensorflow:global_step/sec: 2.04224


INFO:tensorflow:global_step/sec: 2.04224


INFO:tensorflow:loss = 0.15677008, step = 400 (48.967 sec)


INFO:tensorflow:loss = 0.15677008, step = 400 (48.967 sec)


INFO:tensorflow:Saving checkpoints for 500 into ./Genres/model.ckpt.


INFO:tensorflow:Saving checkpoints for 500 into ./Genres/model.ckpt.


INFO:tensorflow:global_step/sec: 1.82593


INFO:tensorflow:global_step/sec: 1.82593


INFO:tensorflow:loss = 0.17972413, step = 500 (54.769 sec)


INFO:tensorflow:loss = 0.17972413, step = 500 (54.769 sec)


INFO:tensorflow:global_step/sec: 2.03861


INFO:tensorflow:global_step/sec: 2.03861


INFO:tensorflow:loss = 0.13021559, step = 600 (49.053 sec)


INFO:tensorflow:loss = 0.13021559, step = 600 (49.053 sec)


INFO:tensorflow:global_step/sec: 2.04921


INFO:tensorflow:global_step/sec: 2.04921


INFO:tensorflow:loss = 0.1472887, step = 700 (48.798 sec)


INFO:tensorflow:loss = 0.1472887, step = 700 (48.798 sec)


INFO:tensorflow:global_step/sec: 2.04279


INFO:tensorflow:global_step/sec: 2.04279


INFO:tensorflow:loss = 0.10085418, step = 800 (48.951 sec)


INFO:tensorflow:loss = 0.10085418, step = 800 (48.951 sec)


INFO:tensorflow:global_step/sec: 2.04567


INFO:tensorflow:global_step/sec: 2.04567


INFO:tensorflow:loss = 0.14652748, step = 900 (48.883 sec)


INFO:tensorflow:loss = 0.14652748, step = 900 (48.883 sec)


INFO:tensorflow:Saving checkpoints for 1000 into ./Genres/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1000 into ./Genres/model.ckpt.


INFO:tensorflow:global_step/sec: 1.82526


INFO:tensorflow:global_step/sec: 1.82526


INFO:tensorflow:loss = 0.12749434, step = 1000 (54.792 sec)


INFO:tensorflow:loss = 0.12749434, step = 1000 (54.792 sec)


INFO:tensorflow:Saving checkpoints for 1067 into ./Genres/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1067 into ./Genres/model.ckpt.


INFO:tensorflow:Loss for final step: 0.118924715.


INFO:tensorflow:Loss for final step: 0.118924715.


Training time elapsed:  0:09:48.545594
Beginning predictions!
Predictioin time elapsed:  0:00:00.000101




INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:num_labels:20;logits:Tensor("loss/BiasAdd:0", shape=(?, 20), dtype=float32);labels:Tensor("loss/Cast:0", shape=(?, 20), dtype=float32)


INFO:tensorflow:num_labels:20;logits:Tensor("loss/BiasAdd:0", shape=(?, 20), dtype=float32);labels:Tensor("loss/Cast:0", shape=(?, 20), dtype=float32)


INFO:tensorflow:**** Trainable Variables ****


INFO:tensorflow:**** Trainable Variables ****


mode: infer probabilities: Tensor("loss/Sigmoid:0", shape=(?, 20), dtype=float32)
INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from ./Genres/model.ckpt-1067


INFO:tensorflow:Restoring parameters from ./Genres/model.ckpt-1067


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


genres prediction: 1604.0 correct predictions in 732 test cases


In [68]:
# predict directors from genres

from sklearn.naive_bayes import GaussianNB

train_features = list_of_train_labels
test_features = probabilities
train_labels = []
for idx, row in train.iterrows():
  train_labels.append(row['crew'][0])
actual_labels = []
for idx, row in test.iterrows():
  if len(row['crew']) == 0:
    actual_labels.append("")
  else: 
    actual_labels.append(row['crew'][0])

gnb = GaussianNB().fit(train_features, train_labels)
gnb_predictions = gnb.predict(test_features)

accuracy = gnb.score(test_features, actual_labels)
print(f'director prediction: {accuracy * len(test)} correct out of {len(test)} test cases with accuracy of {accuracy: .9f}')

gnb_df = pd.DataFrame(gnb_predictions)

gnb_df.columns = ['prediction']

actual = test['crew']

gnb_df.insert(0, 'title', np.asarray(test['title']))
gnb_df.insert(0, 'id', np.asarray(test['id']))
gnb_df['actual'] = np.asarray(actual)

gnb_df.to_csv('gnb_prediction_for_director.csv', index=False)


director prediction: 6.0 correct out of 732 test cases with accuracy of  0.008196721


In [30]:
# predicting titles with transformers' question answerer

predicted_titles = []

print('title        predicted_title')
for i in range(5):
  overview = test.loc[i, 'overview']
  answer = get_title(overview)
  predicted_titles.append(answer)
  print('{:<15s}{:<20s}'.format(test.loc[i, "title"], answer))

title        predicted_title
Mumford        psychologist        
Coming Home    Cultural Revolution 
Daylight       Kit Latura          
The Hunt       teacher             
The Departed   syndicate           


In [35]:
cast_prediction = pd.read_csv('bert_cast_prediction.csv')
director_prediction = pd.read_csv('gnb_prediction_for_director.csv')

cast_prediction = cast_prediction[['id', 'title', 'overview', 'genres', 'crew', 'cast', 'prediction']]
cast_prediction.rename({'prediction': 'cast_prediction'}, axis=1, inplace=True)

director_prediction = director_prediction[['id', 'prediction']]
director_prediction.rename({'prediction': 'director_prediction'}, axis=1, inplace=True)

final_result_df = pd.merge(cast_prediction, director_prediction, on='id', how='inner')

final_result_df.to_csv('final_result.csv')

In [36]:

from google.colab import files

!zip -r genres.zip ./Genres
!zip -r cast.zip ./Cast/

#files.download("director_result.zip")


  adding: Genres/ (stored 0%)
  adding: Genres/model.ckpt-1067.index (deflated 69%)
  adding: Genres/events.out.tfevents.1618452437.d197d6ffe297 (deflated 92%)
  adding: Genres/model.ckpt-1067.data-00000-of-00001 (deflated 16%)
  adding: Genres/test.tf_record (deflated 78%)
  adding: Genres/checkpoint (deflated 43%)
  adding: Genres/train.tf_record (deflated 77%)
  adding: Genres/model.ckpt-1067.meta (deflated 92%)
  adding: Genres/graph.pbtxt (deflated 97%)
  adding: Cast/ (stored 0%)
  adding: Cast/model.ckpt-1067.index (deflated 69%)
  adding: Cast/model.ckpt-1067.data-00000-of-00001 (deflated 15%)
  adding: Cast/test.tf_record (deflated 99%)
  adding: Cast/events.out.tfevents.1618451663.d197d6ffe297 (deflated 92%)
  adding: Cast/checkpoint (deflated 43%)
  adding: Cast/train.tf_record (deflated 99%)
  adding: Cast/model.ckpt-1067.meta (deflated 92%)
  adding: Cast/graph.pbtxt (deflated 97%)
