# First try - RNN (fixed-length sequences only, 1 feature: urlCode)

### In this notebook, I wrote an RNN that uses only the urlCode as a (categorical) feature. With just this feature, and without dividing the complete log sequence into sessions, between 14% and 20% of the URLs were predicted correctly.

In [None]:
# Recurrent Neural Network


# Importing the libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [None]:
# Load the training log. The CSV has no header row, so the two column names are
# supplied here. (Only the training file is read in this cell.)
dataset_train = pd.read_csv(
    'input_data/time_series_all_train.csv',
    header=None,
    names=["secFromPrevPage", "urlCode"],
)
# Keep only the urlCode column, as a 2-D (n_rows, 1) array.
training_set = dataset_train[["urlCode"]].values

In [None]:
#dataset_train

In [None]:
#training_set

In [None]:
# One-hot encode the url codes.
training_set = training_set.reshape(-1, 1)  # single-column 2-D input for the encoder
print(training_set.shape)

onehotencoder = OneHotEncoder(categories='auto')
onehotencoder.fit(training_set)

# Convert the encoder output to a dense array. Despite its name,
# `training_set_scaled` holds one-hot rows, not scaled values.
training_set_scaled = onehotencoder.transform(training_set).toarray()
print(training_set_scaled.shape)

In [None]:
# Disabled alternative kept for reference: min-max scaling of the raw url codes.
# It is wrapped in a string literal, so none of it is executed.
'''
# Feature Scaling
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(training_set)
'''

In [None]:
#training_set_scaled

In [None]:
# Split the url-code stream into consecutive runs and store each run's length in
# X_lengths. A new run starts at every row, other than the very first one, whose
# code equals 0 -- presumably code 0 marks the first page of a session; TODO confirm.
url_codes = np.asarray(training_set).reshape(-1)

if url_codes.size == 0:
    # No rows at all: there are no runs to measure.
    X_lengths = []
else:
    # Row positions (>= 1) at which a new run begins.
    run_starts = np.flatnonzero(url_codes[1:] == 0) + 1
    # Run edges: start of the data, every run start, end of the data.
    run_edges = np.concatenate(([0], run_starts, [url_codes.size]))
    # Distance between neighbouring edges is the run length. A single-row input
    # yields one run of length 1, so X_lengths is never empty for non-empty data.
    X_lengths = np.diff(run_edges).tolist()

In [None]:
# Number of consecutive rows fed to the network for each prediction.
len_seq = 6

# Longest run and (truncated) mean run length, for inspection only.
max_length = max(X_lengths)
frame_size = int(np.mean(X_lengths))

print(max_length,frame_size)
print(X_lengths)
# Unused placeholder definition, left commented out:
#sequence = tf.placeholder(
#    tf.float64, [None, max_length, frame_size])

In [None]:
# Show the mean run length next to the number of encoded rows.
print(frame_size, len(training_set_scaled))

In [None]:
# Build the supervised samples with a sliding window: each input is a block of
# `len_seq` consecutive one-hot rows, and its target is the one-hot row that
# immediately follows that block.
window_ends = range(len_seq, training_set_scaled.shape[0])
X_train = np.array([training_set_scaled[end - len_seq:end, :] for end in window_ends])
y_train = np.array([training_set_scaled[end, :] for end in window_ends])

In [None]:
# Display the shape of X_train.
X_train.shape

In [None]:
#X_train

In [None]:
# Reshaping
#X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], ))

In [None]:
# Part 2 - Building the RNN

# Importing the Keras libraries and packages
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
from numpy.testing import assert_allclose

In [None]:
# Initialising the RNN
# NOTE: despite the name `regressor`, this is a multi-class classifier: it ends in a
# softmax layer and is trained with categorical cross-entropy.
regressor = Sequential()

# Three stacked LSTM layers (30 units each), each followed by 20% dropout. Every
# LSTM returns its full output sequence, so the next layer still receives one
# vector per timestep.
regressor.add(LSTM(units = 30, return_sequences = True, input_shape = (X_train.shape[1], X_train.shape[2])))
regressor.add(Dropout(0.2))

regressor.add(LSTM(units = 30, return_sequences = True))
regressor.add(Dropout(0.2))

regressor.add(LSTM(units = 30, return_sequences = True))
regressor.add(Dropout(0.2))

# Collapse the per-timestep outputs into one vector per sample for the dense head.
regressor.add(Flatten())

# One output unit per one-hot column of the input.
regressor.add(Dense(activation = 'softmax', units = X_train.shape[2]))
regressor.compile(optimizer = 'adam', loss ='categorical_crossentropy', metrics=['accuracy'])

# define the checkpoint
# Keras chooses/validates the save format from the file extension, so the path must
# end in a recognised one; ".h5" (HDF5) is used here. The file stem still says
# "binary_entropy" although the loss above is categorical cross-entropy.
filepath = "models_trained/model_adam_softmax_binary_entropy.h5"
# Create the output directory up front so the first checkpoint save cannot fail
# after a full epoch of training.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# Keep only the weights with the lowest training loss seen so far.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# fit the model
regressor.fit(X_train, y_train, epochs = 20, batch_size = 32, callbacks = callbacks_list)

In [None]:
# Part 3 - Making the predictions and visualising the results

# Load the test log with the same header-less two-column layout as the training
# file, and keep the urlCode column as a 2-D (n_rows, 1) array of ground-truth codes.
dataset_test = pd.read_csv(
    'input_data/time_series_all_test.csv',
    header=None,
    names=["secFromPrevPage", "urlCode"],
)
real_urls = dataset_test[["urlCode"]].values

In [None]:
# Display the ground-truth test url codes.
real_urls

In [None]:
# Getting the predicted urls
# Stitch the train and test codes together so that the first test row still has
# `len_seq` preceding codes available as its input window.
dataset_total = pd.concat([dataset_train['urlCode'], dataset_test['urlCode']], axis=0)
first_input_row = len(dataset_total) - len(dataset_test) - len_seq
inputs = dataset_total.iloc[first_input_row:].values

In [None]:
# Turn the codes into a single-column 2-D array, the layout the encoder was fitted on.
inputs = np.reshape(inputs, (-1, 1))
print(inputs.shape)

In [None]:
# One-hot encode the inputs with the encoder fitted on the training codes and
# convert the result to a dense array. This overwrites `inputs`, so the cell is
# not safe to run twice in a row.
# NOTE(review): the encoder was built without `handle_unknown`, so a test code that
# never appeared in training presumably raises here -- confirm.
inputs = onehotencoder.transform(inputs).toarray()

In [None]:
# Display the current shape of `inputs`.
inputs.shape

In [None]:
# Slide a window of `len_seq` rows over the encoded inputs: one window per row
# that has `len_seq` predecessors.
X_test = np.array([
    inputs[end - len_seq:end, :]
    for end in range(len_seq, inputs.shape[0])
])

In [None]:
# Display the shape of X_test.
X_test.shape

In [None]:
# Reshape to the array's own three dimensions (the shape is left unchanged), then
# run the trained network to get one row of class scores per window.
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2])
predicted_url = regressor.predict(X_test)

In [None]:
# Display the shape of the raw network output.
predicted_url.shape

In [None]:
# Map the network output back to url codes with the fitted encoder. This overwrites
# `predicted_url`, so the cell is not safe to run twice in a row.
# NOTE(review): the rows passed in are softmax scores rather than exact one-hot
# vectors; this presumably relies on inverse_transform picking the largest entry
# of each row -- confirm against the scikit-learn version in use.
predicted_url = onehotencoder.inverse_transform(predicted_url)

In [None]:
from pprint import pprint

# Score the predictions against the ground truth, row by row.
count_total = len(predicted_url)
count_equal = 0
prediction_counts = {}  # url code -> number of times it was predicted correctly
for i, predicted_row in enumerate(predicted_url):
    predicted_code = predicted_row[0]
    if predicted_code == real_urls[i][0]:
        prediction_counts[predicted_code] = prediction_counts.get(predicted_code, 0) + 1
        count_equal += 1

# Share of exactly matching predictions; defined as 0 when there is nothing to score
# so that an empty test set does not raise ZeroDivisionError.
accuracy = count_equal / count_total if count_total else 0.0

print("{} correct URL predicted over {} total URLs.".format(count_equal,count_total))
# Report the ratio as a real percentage (e.g. "17.00%"). The metric is accuracy
# (correct / total), not precision.
print("Accuracy: {:.2%}".format(accuracy))
# Per-url breakdown of the correct predictions.
pprint(prediction_counts)

In [None]:
# Visualising the results: ground-truth and predicted url codes on the same axes.
fig, ax = plt.subplots()
ax.plot(real_urls, color='red', label='Real URL')
ax.plot(predicted_url, color='blue', label='Predicted URL')
ax.set_title('URL Prediction')
ax.set_xlabel('Time')
ax.set_ylabel('URL Visited')
ax.legend()
plt.show()