In [5]:
%matplotlib inline

import pandas as pd
import numpy as np
import tensorflow as tf
import sys
import time
import math

from Constants import *
from tensorflow.contrib.learn.python.learn.estimators import constants
from tensorflow.contrib.learn.python.learn.estimators.dynamic_rnn_estimator import PredictionType
from sklearn.metrics import mean_squared_error
from tensorflow.contrib.layers import real_valued_column
from math import sqrt
from matplotlib import pyplot as plt
from scipy.interpolate import InterpolatedUnivariateSpline
from statsmodels.tsa.arima_model import ARIMA
from math import floor
import matplotlib.mlab as mlab

In [32]:
SEQUENCE_LENGTH = 8 # Weeks count that is used to predict
PREDICTION_DELTA = 4 # Weeks to predict ahead
VIEWS_SCALE_KOEF = 18

FILE_NAME = "_s:{}_p:{}".format(SEQUENCE_LENGTH, PREDICTION_DELTA)
tf.logging.set_verbosity(tf.logging.ERROR)

def LoadData(name):
    all_training_data = np.load('data/' + name + FILE_NAME + "_training_data.npy" )
    all_lables = np.load('data/' + name + FILE_NAME + "_lables.npy")
    sta = np.vstack(all_training_data)
    
    df = pd.DataFrame(sta, columns=['channel_subscribers', 'views', 'engagements', 'sentiment'])
    df[df < 0] = 0
    df[df.views == 0] = 1
    df[df.channel_subscribers == 0] = 1
    all_lables[all_lables == 0] = 1

    viewsKoef = floor(np.log(df.max()['views']))
    df['views'] = np.log(df['views']) / viewsKoef
    all_lables = np.log(all_lables) / viewsKoef
    df['channel_subscribers'] = np.log(df['channel_subscribers']) / floor(np.log(df.max()['channel_subscribers']))
    df[df.engagements > 1] = 1
    df[df.sentiment > 1] = 1

    all_training_data = df.values
    columnsCount = np.size(all_training_data, 1)
    inputs = np.reshape(all_training_data, (-1, BATCH_SIZE, SEQUENCE_LENGTH, columnsCount))
    output = np.reshape(all_lables, (-1, BATCH_SIZE))

    return inputs, output

In [33]:
def BuildModel(num_units, cell_type, optimizer, learning_rate):
    
    test_inputs, test_outputs = LoadData("TEST")
    columnsCount =  np.size(test_inputs, 3)

    feature_columns = [tf.contrib.layers.real_valued_column("", dimension = columnsCount)]
    estimator = tf.contrib.learn.DynamicRnnEstimator(problem_type = constants.ProblemType.LINEAR_REGRESSION,
                                                     prediction_type = PredictionType.SINGLE_VALUE,
                                                     sequence_feature_columns = feature_columns,
                                                     context_feature_columns = None,
                                                     num_units = num_units,
                                                     cell_type = cell_type, #contrib_rnn.lstm
                                                     optimizer = optimizer,
                                                     learning_rate = learning_rate,
                                                     gradient_clipping_norm=5.0,
                                                     model_dir = "models2/")

    def get_test_inputs():
        inp = tf.constant(test_inputs)
        target = tf.constant(test_outputs)
        return {"": inp}, target

    loss_score = estimator.evaluate(input_fn=get_test_inputs, steps=1)["loss"]

    print("\nTest loss: {0:f}\n".format(loss_score))

    predictions = list(estimator.predict({"" : test_inputs}))

    predicted = [prediction['scores'] for prediction in predictions]  
    
    return (predicted, test_outputs, test_inputs)

In [34]:
predicted, test_outputs, test_inputs = BuildModel(16, 'lstm', 'RMSProp', 1e-3)



Test loss: 0.000327



In [42]:
originalPredicted = np.exp(np.array(predicted) * VIEWS_SCALE_KOEF)
originalTestOutputs = np.exp(test_outputs * VIEWS_SCALE_KOEF)

originalTestOutputs = np.concatenate(originalTestOutputs, axis=0)

rse = ((originalPredicted / originalTestOutputs) - 1)**2
mapeAr = abs(originalTestOutputs - originalPredicted) / originalTestOutputs


print("Original RSE stats. Mean: %.3f, Std: %.3f" % (rse.mean(), sqrt(rse.std())))

print("Original MAPE stats. Mean: %.2f%%, Std: %.2f%% " % (100 * mapeAr.mean(), 100 * mapeAr.std()))

Original RSE stats. Mean: 0.215, Std: 5.091
Original MAPE stats. Mean: 18.13%, Std: 42.69% 


In [40]:
rse.mean()

0.21514609728384487

In [28]:
def MakeLastValuePrediction(inputs):
    prediction = []
    for input in inputs:
        lasViews = input[0][7][1]
        prediction.append(lasViews)

    return prediction


In [29]:
def MakePolifitPrediction(inputs, n):
    prediction = []
    for input in inputs:
        seq = input[0]
        y = seq[:,1]
        x = np.arange(8)

        # calculate polynomial
        z = np.polyfit(x, y, n)
        f = np.poly1d(z)

        pred = f(8)
        prediction.append(pred)

    return prediction

In [30]:
print("Last value prediction calculation")
lastValPreds = MakeLastValuePrediction(test_inputs)
print("Linear aproximation calculation")
polPreds_1 = MakePolifitPrediction(test_inputs, 1)
print("Quadratic aproximation calculation")
polPreds_2 = MakePolifitPrediction(test_inputs, 2)
print("Cubic aproximation calculation")
polPreds_3 = MakePolifitPrediction(test_inputs, 3)
print("Calculation finished")

Last value prediction calculation
Linear aproximation calculation
Quadratic aproximation calculation
Cubic aproximation calculation
Calculation finished


In [31]:
originalLastValPredicted = np.exp(np.array(lastValPreds) * VIEWS_SCALE_KOEF)

originalPol1Predicted = np.exp(np.array(polPreds_1) * VIEWS_SCALE_KOEF)
originalPol2Predicted = np.exp(np.array(polPreds_2) * VIEWS_SCALE_KOEF)
originalPol3Predicted = np.exp(np.array(polPreds_3) * VIEWS_SCALE_KOEF)

lastValEr = sqrt(mean_squared_error(originalLastValPredicted, test_outputs))
Pol1Er = sqrt(mean_squared_error(originalPol1Predicted, test_outputs))
Pol2Er = sqrt(mean_squared_error(originalPol2Predicted, test_outputs))
Pol3Er = sqrt(mean_squared_error(originalPol3Predicted, test_outputs))

print("Last value prediction perfomance: %.2f%%" % (rnnRMSE / lastValEr))
print("Linear perfomance: %.2f%% Quadratic perfomance:%.2f%% Cubic perfomance:%.4f%% " % (rnnRMSE / Pol1Er, rnnRMSE / Pol2Er, rnnRMSE / Pol3Er) )

Last value prediction perfomance: 0.28%
Linear perfomance: 0.23% Quadratic perfomance:0.03% Cubic perfomance:0.0002% 
