# MEST DAY 5

## Morning Session
### Sequence Models - Time Series Data - RNN - GRU - LSTM

In [None]:
!pip install tensorflow-gpu==2.0.0.beta0

In [None]:
# Pin numpy to 1.16.2: this version avoids an error caused by numpy defaulting to
# not allowing pickled files when loading data.
# NOTE(review): presumably this affects a dataset loader used later in the notebook — confirm which call.
!pip install numpy==1.16.2

In [None]:
!pip install pandas==0.24

In [1]:
import tensorflow as tf
from tensorflow.keras import datasets, preprocessing, models, layers
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
df = pd.read_csv('/content/gdrive/My Drive/temperature/jena_climate_2009_2016.csv')

In [None]:
df.info()

In [None]:
df[['Date Time']].head()

In [None]:
# Keep only the raw timestamp and the temperature. `.copy()` makes `df` an independent
# frame, so adding a column below is not an assignment on a slice of the original data
# (which pandas reports as SettingWithCopyWarning).
df = df[['Date Time', 'T (degC)']].copy()
# Parse the 'Date Time' strings (day.month.year hour:minute:second) into real datetimes.
df['DateTime'] = pd.to_datetime(df['Date Time'], format="%d.%m.%Y %H:%M:%S")

In [None]:
df.head()

In [None]:
df.set_index(['DateTime'])

In [None]:
df['T (degC)'].plot()

### Extract Time Series DF

In [None]:
time_series_df = df.set_index(['DateTime'])

In [None]:
time_series_df.info()

In [None]:
# Drop the raw string timestamp now that DateTime is the index. Rebinding with
# errors='ignore' (instead of inplace=True) lets this cell be re-run without a KeyError.
time_series_df = time_series_df.drop(columns=['Date Time'], errors='ignore')

### Resample to Mean Daily Temperature

In [None]:
# Downsample the readings to one row per calendar day holding that day's mean value.
# NOTE(review): a day with no readings would presumably come out as NaN — confirm none exist before training.
mean_daily_temp = time_series_df.resample('D').mean()
mean_daily_temp.info()

### Create Training and Test Sets

In [None]:
# Split by calendar year: 2016 is held out for testing, 2009-2015 is used for training.
# `.loc` is the explicit row selector for partial date strings (plain `frame['2016']`
# is treated as a column lookup by newer pandas releases), and `.copy()` makes both
# splits independent frames that are safe to modify in place later.
test_daily_temp = mean_daily_temp.loc['2016'].copy()
train_daily_temp = mean_daily_temp.loc['2009':'2015'].copy()

In [None]:
train_daily_temp['T (degC)'].plot()

In [None]:
train_daily_temp.head()

### Reset Index

In [None]:
# reset_index is deliberately called twice: the first call moves the DateTime index into a
# regular column, the second turns the resulting 0..n-1 row numbers into the 'index'
# column that is read as the model input further down.
# NOTE(review): both calls mutate in place, so re-running this cell adds further columns.
train_daily_temp.reset_index(inplace=True)
train_daily_temp.reset_index(inplace=True)

In [None]:
# Same two-step reset as for the training frame: the first call moves the DateTime index
# into a column, the second materialises the 0..n-1 row numbers as an 'index' column.
# NOTE(review): both calls mutate in place, so re-running this cell adds further columns.
test_daily_temp.reset_index(inplace=True)
test_daily_temp.reset_index(inplace=True)
test_daily_temp.head()

### Extract Time Steps

In [None]:
# Inputs are the integer day counters ('index'); targets are the mean daily temperatures.
# Selecting with a list of column names keeps each array two-dimensional (one column).
# NOTE(review): the models therefore see only time-step numbers, never past temperatures,
# and the test counter restarts at 0 — confirm this is the intended setup.
train_x = train_daily_temp[['index']].to_numpy()
train_y = train_daily_temp[['T (degC)']].to_numpy()

test_x = test_daily_temp[['index']].to_numpy()
test_y = test_daily_temp[['T (degC)']].to_numpy()

### Create a Generator

In [None]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
N_INPUT = 5
BATCH_SIZE = 16

In [None]:
train_gen = TimeseriesGenerator(train_x, train_y, length=N_INPUT, sampling_rate=7, batch_size=BATCH_SIZE)

In [None]:
test_gen = TimeseriesGenerator(test_x, test_y, length=N_INPUT, sampling_rate=7, batch_size=BATCH_SIZE)

### Inspect one item

In [None]:
# Pull the first batch from the training generator; the shape of `x` is reused below
# to size the models' input layers.
batch_0 = train_gen[0]
x, y = batch_0
print(y.shape)

In [None]:
print(x.shape)

### Dense Model

In [None]:
# Baseline network: a 100-unit Dense layer applied along the last axis of each sample,
# flattened, then a single-value regression output.
# The input shape (timesteps, features) is read from the sample batch `x`.
model = models.Sequential([
    layers.Dense(100, input_shape=(x.shape[1], x.shape[2]), activation='relu'),
    layers.Flatten(),
    layers.Dense(1)
])
print(model.summary())

In [None]:
model.compile(optimizer='adam', loss='mse')

In [None]:
model.fit_generator(train_gen, epochs=5)

In [None]:
# Predict on the first test batch and print the first sample's true value next to its prediction.
# Note: this rebinds `x` and `y` to the test batch; the later model and prediction cells reuse them.
example = test_gen[0]
x, y = example
y_pred = model.predict(x)
print('Actual: {} | Predicted: {}'.format(y[0], y_pred[0]))

### Train an RNN

In [None]:
# A single 100-unit SimpleRNN layer followed by a one-value regression head.
# dropout / recurrent_dropout set a 50% drop rate on the layer's inputs and on its recurrent state.
# The input shape (timesteps, features) is taken from the current batch `x`.
rnn_model = models.Sequential([
    layers.SimpleRNN(100, input_shape=(x.shape[1], x.shape[2]), dropout=0.5, recurrent_dropout=0.5),
    layers.Dense(1)
])
print(rnn_model.summary())

In [None]:
rnn_model.compile(optimizer='adam', loss='mse')
rnn_model.fit_generator(train_gen, epochs=5)

In [None]:
y_pred = rnn_model.predict(x)
print('Actual: {} | Predicted: {}'.format(y[0], y_pred[0]))

### Train an LSTM

In [None]:
# Two stacked bidirectional LSTM layers (100 units per direction) and a one-value regression head.
# return_sequences=True on the first layer hands the whole sequence, not just the final
# state, to the second recurrent layer.
lstm_model = models.Sequential([
    layers.Bidirectional(layers.LSTM(100, return_sequences=True, recurrent_dropout=0.5), input_shape=(x.shape[1], x.shape[2])),
    layers.Bidirectional(layers.LSTM(100, recurrent_dropout=0.5)),
    layers.Dense(1)
])
print(lstm_model.summary())

In [None]:
lstm_model.compile(optimizer='adam', loss='mse')
lstm_model.fit_generator(train_gen, epochs=5)

In [None]:
y_pred = lstm_model.predict(x)
print('Actual: {} | Predicted: {}'.format(y[0], y_pred[0]))

## Afternoon Session
### Language Models - Encodings - Embeddings - LSTM - 1D-CNN

### IMDB

In [None]:
imdb = datasets.imdb

#Let's work with a dictionary of 20,000 words
NUM_WORDS = 20000

In [None]:
#load IMDB dataset as lists of integers
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=NUM_WORDS, )

In [None]:
# Shift every word id up by 3 so that ids 0-3 stay free for the reserved tokens.
word_index = {word: idx + 3 for word, idx in imdb.get_word_index().items()}
word_index.update({'<PAD>': 0, '<START>': 1, '<UNK>': 2, '<UNUSED>': 3})

# Inverse lookup (id -> word), used to turn an encoded review back into text.
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [None]:
print(' '.join([reverse_word_index[i] for i in train_data[0]]))

In [None]:
print(train_labels[0])

In [None]:
print(train_labels[:5])

In [None]:
print(train_data[0])

* inspect the length of each review

In [None]:
print('Len 0: {}, Len 1: {}, Len 2: {}'.format(len(train_data[0]), len(train_data[1]), len(train_data[2])))

In [None]:
# Bring every review to exactly LEN_WORDS word ids (shorter ones are padded, longer ones
# cut) so the reviews stack into one fixed-width array.
LEN_WORDS = 300
train_data = preprocessing.sequence.pad_sequences(train_data, maxlen=LEN_WORDS)
test_data = preprocessing.sequence.pad_sequences(test_data, maxlen=LEN_WORDS)

In [None]:
print('Len 0: {}, Len 1: {}, Len 2: {}'.format(len(train_data[0]), len(train_data[1]), len(train_data[2])))

### DNN

In [None]:
# Baseline: a fully-connected network fed the padded word-id sequences directly.
# The input width is tied to LEN_WORDS (the padding length) rather than a repeated
# literal 300, so changing the padding length cannot silently break the model.
dense_model = models.Sequential([
    layers.Dense(300, input_shape=(LEN_WORDS,), activation='relu'),
    layers.Dense(300, activation='relu'),
    layers.Dense(300, activation='relu'),
    layers.Dense(300, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
print(dense_model.summary())

In [None]:
dense_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
dense_model.fit(train_data, train_labels, epochs=10)

### Embeddings

In [None]:
# Size of the vector learned for each word id; reused by the later embedding models.
DIMENSION = 16

# Look up a DIMENSION-sized vector per word id, average the vectors over the review,
# and classify the averaged vector with a single sigmoid unit.
e_model = models.Sequential([
    layers.Embedding(NUM_WORDS, DIMENSION, input_length=LEN_WORDS),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
])
print(e_model.summary())

In [None]:
e_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
e_model.fit(train_data, train_labels, epochs=3)

### Embeddings plus LSTM

In [None]:
# Same embedding front end, but the word vectors are read in order by an LSTM
# (DIMENSION * 2 units) whose final output feeds the sigmoid classifier.
l_model = models.Sequential([
    layers.Embedding(NUM_WORDS, DIMENSION, input_length=LEN_WORDS),
    layers.LSTM(DIMENSION * 2),
    layers.Dense(1, activation='sigmoid')
])
print(l_model.summary())

In [None]:
l_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
l_model.fit(train_data, train_labels, epochs=3)

### Embeddings plus Convolution

In [None]:
# Embedding front end followed by a depthwise-separable 1D convolution that looks at
# 3 word vectors at a time and moves 3 positions per step; the 10 filter outputs are
# averaged over the sequence before the sigmoid classifier.
c_model = models.Sequential([
    layers.Embedding(NUM_WORDS, DIMENSION, input_length=LEN_WORDS),
    layers.SeparableConv1D(filters=10, kernel_size=3, strides=3, padding='same'),
    layers.GlobalAveragePooling1D(),
    layers.Dense(1, activation='sigmoid')
])
print(c_model.summary())

In [None]:
c_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
c_model.fit(train_data, train_labels, epochs=3)

### Amazon Reviews Dataset
* http://jmcauley.ucsd.edu/data/amazon/

In [None]:
import pandas as pd
import gzip

def parse(path):
  """Yield one decoded record per non-blank line of a gzip-compressed file.

  Each line is decoded as JSON first; a line that is not strict JSON (for example a
  Python-style dict literal with single quotes) falls back to ``ast.literal_eval``.
  Both decoders accept literal data only. The previous ``eval`` call would have
  executed any code embedded in the data file.

  Args:
    path: location of the ``.gz`` file holding one record per line.

  Yields:
    The decoded object for each non-blank line, in file order.

  Raises:
    ValueError or SyntaxError: if a line is neither valid JSON nor a Python literal.
  """
  import ast
  import json

  # The context manager closes the file once the generator is exhausted or discarded.
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      l = l.strip()
      if not l:
        continue  # tolerate blank lines, e.g. a trailing newline at end of file
      try:
        record = json.loads(l)
      except ValueError:
        record = ast.literal_eval(l)
      yield record

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame, one row per record.

  Rows are labelled 0, 1, 2, ... in the order the records were read.
  """
  records_by_row = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_row, orient='index')

df = getDF('/content/gdrive/My Drive/amazon_reviews/reviews_Automotive_5.json.gz')

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Keep only the review text and its star rating. `.copy()` makes this an independent
# frame, so the columns added in later cells are not assignments on a slice of the
# full review table (which pandas reports as SettingWithCopyWarning).
df = df[['reviewText', 'overall']].copy()

In [None]:
print(df['reviewText'][0])

### Remove Special Characters

In [None]:
import re

def strip_special_chars(st):
    """Return `st` keeping only letters, digits, '.', '/' and spaces; all else is removed."""
    allowed_runs = re.compile('[A-Za-z0-9./ ]+')
    kept = [match.group(0) for match in allowed_runs.finditer(st)]
    return ''.join(kept)

In [None]:
df['cleanReviewText'] = df['reviewText'].apply(strip_special_chars)
print(df['cleanReviewText'][0])

### Convert Ratings to Sentiment

In [None]:
def sentiment(x):
    """Map a rating to a binary label: 0 when the rating is below 3, otherwise 1."""
    return 0 if x < 3 else 1
# Label every review from its 'overall' rating, then preview the derived columns.
df['sentiment'] = df['overall'].apply(sentiment)
df[['cleanReviewText', 'overall', 'sentiment']].head()

### Create Features and Labels

In [None]:
features = df['cleanReviewText']
labels = df['sentiment']

### Get a pre-trained embedding

In [None]:
import tensorflow_hub as hub

In [None]:
hub_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", output_shape=[50], input_shape=[], dtype=tf.string)

### Add pre-trained layer to a model

In [None]:
from tensorflow import keras

model = keras.Sequential([
    hub_layer,
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
print(model.summary())

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=features.to_numpy(), y=labels.to_numpy(), epochs=1, validation_split=0.3)

In [None]:
# The hub layer emits one flat 50-value vector per review (output_shape=[50]), but Conv1D
# requires (steps, channels) input and rejects a flat vector. The embedding is therefore
# reshaped to 50 steps of 1 channel before the convolution, and pooled over the step
# axis afterwards so the sigmoid produces a single score per review.
model = keras.Sequential([
    hub_layer,
    keras.layers.Reshape((50, 1)),
    keras.layers.Conv1D(filters=10, kernel_size=3),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(1, activation='sigmoid')
])
print(model.summary())