In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Print the version string of the imported TensorFlow package.
print(tf.__version__)

def process_csv(in_file_name, out_file_name, char_to_remove, encoding=None):
    """Copy a text file, deleting every occurrence of a substring.

    The input is read completely and closed *before* the output is opened.
    This makes it safe to pass the same path for both arguments (in-place
    clean-up): opening the output for writing truncates it, which would
    otherwise wipe the data before it had been read.

    Args:
        in_file_name: Path of the text file to read.
        out_file_name: Path of the file to create or overwrite with the
            cleaned text. May be the same path as ``in_file_name``.
        char_to_remove: Substring to delete; every occurrence is removed.
        encoding: Text encoding used for both reading and writing. ``None``
            (the default) keeps Python's platform-dependent default; pass
            e.g. ``"utf-8"`` for platform-independent results.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Read first and release the handle so the write below cannot truncate
    # data that has not been consumed yet.
    with open(in_file_name, 'r', encoding=encoding) as infile:
        data = infile.read()

    cleaned = data.replace(char_to_remove, "")

    with open(out_file_name, 'w', encoding=encoding) as outfile:
        outfile.write(cleaned)
    

# Write a copy of the raw CSV with every "%" removed (presumably so the
# percentage-valued columns load as plain numbers -- confirm against the data).
fname = "../input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv"
processed_file_name = "/kaggle/working/tmp.csv"
process_csv(fname, processed_file_name, "%")

# Column names applied in place of the file's own header row, which is skipped.
col_names = [
    "Num", "ID", "Title", "Year", "Age", "IMDb", "Rotten Tomatoes",
    "Netflix", "Hulu", "Prime Video", "Disney+", "Type",
    "Directors", "Genres", "Country", "Language", "Runtime",
]
dataset = pd.read_csv(processed_file_name, skiprows=[0], names=col_names)

# Shape as loaded, before any columns or rows are removed.
print(dataset.shape)

# Discard the columns that are not used further, then every row that still
# has a missing value in one of the remaining columns.
to_drop = [
    "Num", "Age", "Netflix", "Hulu", "Prime Video", "Disney+",
    "Type", "Directors", "Genres", "Country", "Language",
]
dataset = dataset.drop(columns=to_drop).dropna()

# Shape after the unused columns and incomplete rows are gone.
print(dataset.shape)

print(dataset.describe())
print(dataset.head())
dataset.head()



In [None]:
# Randomly pick 80% of the rows for training (fixed seed, so the split is
# repeatable); whatever was not picked becomes the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

# Pairwise plots of three columns with KDE curves on the diagonal.
# Note that this plots the full dataset, not just the training rows.
pairplot_columns = ['IMDb', 'Rotten Tomatoes', 'Runtime']
sns.pairplot(dataset[pairplot_columns], diag_kind='kde')




In [None]:
# Build the feature frames: drop the columns that are not model inputs.
# `drop` returns a new frame, so train_dataset/test_dataset stay untouched.
to_drop = ["ID", "Title", "Year", "Runtime"]
train_features = train_dataset.drop(columns=to_drop)
test_features = test_dataset.drop(columns=to_drop)

# Move the prediction target out of the feature frames into label series.
train_labels = train_features.pop('Rotten Tomatoes')
test_labels = test_features.pop('Rotten Tomatoes')

# Normalization layer fitted on all remaining training features.
train_feature_array = np.array(train_features)
normalizer = preprocessing.Normalization()
normalizer.adapt(train_feature_array)

print(normalizer.mean.numpy())

# Show the first training row before and after normalization.
first = np.array(train_features.iloc[:1])
with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalizer(first).numpy())

# Single-input model: normalize the IMDb value, then one Dense unit.
imdb = np.array(train_features['IMDb'])

imdb_normalizer = preprocessing.Normalization(input_shape=[1,])
imdb_normalizer.adapt(imdb)

imdb_layers = [imdb_normalizer, layers.Dense(1)]
imdb_model = keras.Sequential(imdb_layers)

imdb_model.summary()



In [None]:
# Run the model on the first ten IMDb values before any training has
# happened, just to confirm it produces output.
untrained_predictions = imdb_model.predict(imdb[:10])
print(untrained_predictions)

# Configure training: Adam optimizer, mean absolute error as the loss.
imdb_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
)

In [None]:
# Train the IMDb -> Rotten Tomatoes model.
history = imdb_model.fit(
    train_features['IMDb'],
    train_labels,
    epochs=100,
    verbose=0,             # no per-epoch log output
    validation_split=0.2,  # fraction of the training data held out for val_loss
)

# Collect the per-epoch metrics recorded by fit() into a table.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
# A bare `hist.tail()` is only rendered when it is the last expression of a
# notebook cell. More code follows in this cell, so print it explicitly or the
# table is silently discarded.
print(hist.tail())

def plot_loss(history, y_max=None):
  """Plot training and validation loss per epoch on the current pyplot figure.

  Args:
    history: Object whose ``history`` attribute maps ``'loss'`` and
      ``'val_loss'`` to per-epoch sequences (for example the value returned
      by ``imdb_model.fit``).
    y_max: Optional upper limit for the y-axis. With the default ``None`` the
      upper limit is left to matplotlib's autoscaling; the lower limit is
      pinned to 0 in both cases. Pass ``y_max=10`` to get the former fixed
      ``[0, 10]`` window.
  """
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  # A fixed [0, 10] window hides both curves whenever the error stays above
  # 10, so by default only the bottom of the axis is fixed.
  if y_max is None:
    plt.ylim(bottom=0)
  else:
    plt.ylim([0, y_max])
  plt.xlabel('Epoch')
  plt.ylabel('Error [Rotten Tomatoes]')
  plt.legend()
  plt.grid(True)

# Draw the loss curves for the training run stored in `history`.
plot_loss(history)