# Neural Network Tools

## Imports

In [1]:
import numpy as np 
import csv
import pandas as pd

#NN ones
import tensorflow.keras as keras
import tensorflow as tf
from sklearn.model_selection import train_test_split

## Preprocessing

In [2]:
# Load the cleaned dataset and prepare it in one pass:
#   1. drop the index column that was written out with the CSV,
#   2. discard rows that have no runtime,
#   3. renumber the rows,
#   4. mark every remaining missing value as 'Unknown'.
main_df = (
    pd.read_csv('clean_data.csv')
    .drop(columns='Unnamed: 0')
    .dropna(subset=['runtimeMinutes'])
    .reset_index(drop=True)
    .fillna(value='Unknown')
)

In [3]:
# Display the cleaned frame; the rendered table is pandas' truncated view
# (leading and trailing rows with an elided middle).
main_df

Unnamed: 0,primaryTitle,runtimeMinutes,genres,directors,writers,averageRating,numVotes
0,Frivolinas,80.0,Comedy,Arturo Carballo,Unknown,5.6,15
1,Kate & Leopold,118.0,Comedy,James Mangold,Steven Rogers,6.4,76677
2,The Woman with the Knife,80.0,Drama,Bassori Timite,Bassori Timite,6.5,11
3,The Other Side of the Wind,122.0,Drama,Orson Welles,Orson Welles,6.8,5469
4,The Naked Monster,100.0,Comedy,Ted Newsom,Ted Newsom,5.5,250
...,...,...,...,...,...,...,...
114920,Albatross,97.0,Documentary,Chris Jordan,Chris Jordan,8.2,23
114921,9/11: Escape from the Towers,120.0,Documentary,Grace Chapman,Unknown,8.4,37
114922,La vida sense la Sara Amat,74.0,Drama,Laura Jou,Coral Cruz,6.7,77
114923,Drømmeland,72.0,Documentary,Joost van der Wiel,Unknown,6.6,36


In [4]:
# Summary statistics of the vote counts, rendered as fixed-point strings so
# the large maximum is not shown in scientific notation.
main_df['numVotes'].describe().map('{:f}'.format)

count     114925.000000
mean        4785.097350
std        37203.975544
min            5.000000
25%           17.000000
50%           71.000000
75%          405.000000
max      2195241.000000
Name: numVotes, dtype: object

## Selecting adequate data

What's gucci!

As we can see, the number of votes per title is heavily skewed toward low counts. Ratings backed by only a handful of votes are unlikely to be a reliable assessment of a title, so they may not be good for the analysis. I will make a cutoff at the 50th percentile of `numVotes` (71 votes) in order to eliminate that problem.

In [5]:
# Keep only titles with at least the median number of votes. The threshold is
# computed from the data (50th percentile of numVotes, 71 for the summary shown
# above) instead of being hard-coded, so the filter stays correct if
# clean_data.csv is regenerated.
VOTES_CUTOFF = main_df['numVotes'].quantile(0.50)
cutoff_50th_df = main_df[main_df['numVotes'] >= VOTES_CUTOFF].reset_index(drop = True)

# Vote-count summary of the filtered frame, formatted as fixed-point strings.
cutoff_50th_df.numVotes.describe().apply(lambda x: format(x, 'f'))

count      57712.000000
mean        9505.572654
std        52072.703031
min           71.000000
25%          159.000000
50%          402.000000
75%         1551.000000
max      2195241.000000
Name: numVotes, dtype: object

In [6]:
# Display the filtered frame (titles at or above the vote cutoff); the rendered
# table is pandas' truncated view.
cutoff_50th_df

Unnamed: 0,primaryTitle,runtimeMinutes,genres,directors,writers,averageRating,numVotes
0,Kate & Leopold,118.0,Comedy,James Mangold,Steven Rogers,6.4,76677
1,The Other Side of the Wind,122.0,Drama,Orson Welles,Orson Welles,6.8,5469
2,The Naked Monster,100.0,Comedy,Ted Newsom,Ted Newsom,5.5,250
3,Crime and Punishment,126.0,Drama,Menahem Golan,Fyodor Dostoevsky,5.8,618
4,The Wandering Soap Opera,80.0,Comedy,Valeria Sarmiento,Pía Rey,6.7,227
...,...,...,...,...,...,...,...
57707,Pengalila,111.0,Drama,T.V. Chandran,T.V. Chandran,8.8,550
57708,Manoharam,122.0,Comedy,Anvar Sadik,Unknown,6.9,318
57709,Padmavyuhathile Abhimanyu,130.0,Drama,Vineesh Aaradya,Vineesh Aaradya,8.0,263
57710,Sokagin Çocuklari,98.0,Drama,Ahmet Faik Akinci,Ahmet Faik Akinci,6.2,196


## Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per categorical column (genres, directors, writers),
# so each keeps its own string-to-integer mapping.
le_gen, le_dir, le_wri = (LabelEncoder() for _ in range(3))

In [8]:
# Label-encode the three categorical columns on a COPY of the filtered frame.
# A plain `dfle = cutoff_50th_df` only aliases the same object, so the
# assignments below would overwrite the string columns of cutoff_50th_df and a
# re-run of this cell would refit the encoders on integer codes instead of the
# original labels.
dfle = cutoff_50th_df.copy()
dfle['genres'] = le_gen.fit_transform(dfle['genres'])
dfle['directors'] = le_dir.fit_transform(dfle['directors'])
dfle['writers'] = le_wri.fit_transform(dfle['writers'])

In [9]:
# Feature matrix: runtime plus the three label-encoded categorical columns,
# in this fixed column order (positions 1-3 are the categorical ones).
feature_columns = ['runtimeMinutes', 'genres', 'directors', 'writers']
X = dfle[feature_columns].to_numpy()
X

array([[1.1800e+02, 5.0000e+00, 1.2161e+04, 3.0473e+04],
       [1.2200e+02, 8.0000e+00, 2.1852e+04, 2.4317e+04],
       [1.0000e+02, 5.0000e+00, 2.8227e+04, 3.1247e+04],
       ...,
       [1.3000e+02, 8.0000e+00, 2.9860e+04, 3.2950e+04],
       [9.8000e+01, 8.0000e+00, 5.0600e+02, 5.4200e+02],
       [7.4000e+01, 8.0000e+00, 1.6714e+04, 6.2650e+03]])

In [10]:
# Target: the average rating of each title (a float column).
y = dfle['averageRating']
y

0        6.4
1        6.8
2        5.5
3        5.8
4        6.7
        ... 
57707    8.8
57708    6.9
57709    8.0
57710    6.2
57711    6.7
Name: averageRating, Length: 57712, dtype: float64

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode feature columns 1-3 (genres, directors, writers) and pass the
# remaining column (runtimeMinutes) through untouched.
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22; ColumnTransformer is the supported
# replacement. As before, the encoded columns come first and the untouched
# column is appended on the right.
# sparse_threshold=1.0 keeps the stacked result sparse, so callers can still
# call `.toarray()` on the output of `ohe.fit_transform(...)`.
ohe = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), [1, 2, 3])],
    remainder='passthrough',
    sparse_threshold=1.0,
)

In [12]:
# Fit the encoder and expand the categorical columns, then densify.
# The shape check that follows reports 57712 x 65567, so the dense array is
# very large. This cell overwrites X, so re-run the cell that builds X from
# dfle before re-running this one.
encoded_sparse = ohe.fit_transform(X)
X = encoded_sparse.toarray()



In [16]:
# (rows, columns) of the encoded feature matrix.
X.shape

(57712, 65567)

## Splitting Train / Test

In [62]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.10, random_state=13)
train_X, test_X, train_y, test_y = split_parts

## Normalizing Train and Test Data

In [57]:
# Apply the same normalization along axis 1 to both splits.
train_X, test_X = (
    tf.keras.utils.normalize(split, axis = 1)
    for split in (train_X, test_X)
)

In [61]:
# Inspect the normalized training features.
# NOTE(review): the saved output below shows a 4-column DataFrame, which does
# not match the 65567-column array built earlier in this notebook — it looks
# like it came from an earlier kernel state; re-run to refresh.
train_X

Unnamed: 0,runtimeMinutes,genres_codes,directors_codes,writers_codes
15942,0.006420,0.000357,0.701650,0.712493
23500,0.004085,0.000403,0.666934,0.745106
27191,0.023377,0.001670,0.674584,0.737826
853,0.008080,0.001347,0.671027,0.741387
52292,0.005190,0.000807,0.518164,0.855265
...,...,...,...,...
33634,0.003751,0.000556,0.669574,0.742736
56848,0.006045,0.000285,0.056916,0.998361
32842,0.003153,0.000166,0.668791,0.743444
47280,0.004535,0.000267,0.666460,0.745527


## Creating the model

In [67]:
# THE MODEL
model = tf.keras.models.Sequential()

# MODEL'S LAYERS

model.add(tf.keras.layers.Flatten()) #input layer

model.add(tf.keras.layers.Dense(  #Dense hidden layer
    100, # Number of nodes
    activation = tf.nn.relu 
))

model.add(tf.keras.layers.Dense(  #Dense hidden layer
    100, # Number of nodes
    activation = tf.nn.relu 
))


model.add(tf.keras.layers.Dense(  #Output layer
    10, # Number of nodes
    activation = tf.nn.softmax  
))

## Crating compiler that will also give loss and accuracy

In [68]:
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

## Fitting Data

In [69]:
# Train for 15 epochs. np.asarray accepts pandas objects and plain ndarrays
# alike; `.values` raised AttributeError whenever the split/normalize cells
# produced ndarrays (as they do when X is the one-hot array built above).
model.fit(np.asarray(train_X), np.asarray(train_y), epochs = 15)

Train on 51940 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15

KeyboardInterrupt: 

## Evaluating

In [60]:
# Score the model on the held-out split. np.asarray accepts pandas objects and
# plain ndarrays alike (`.values` fails on ndarrays).
# evaluate() returns the loss followed by the single compiled metric.
val_loss, val_acc = model.evaluate(np.asarray(test_X), np.asarray(test_y))
print(f"Loss: {val_loss}")
# Label the second value with the metric name reported by the model so the
# printout always matches what was actually measured.
print(f"{model.metrics_names[-1]}: {val_acc}")

Loss: 1.7062689866444674
Accuracy: 0.030492030084133148
