In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.tsatools import detrend
from statsmodels.tsa.stattools import adfuller
%matplotlib inline

In [4]:
## Read data
# Load the pre-aggregated mean-rating tables (yearly and monthly resolution).
# index_col=0 turns the first CSV column into the index of each frame.
mean_ratings_year, mean_ratings_yearmonth = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "./processed_data/mean_ratings_year.csv",
        "./processed_data/mean_ratings_yearmonth.csv",
    )
)

In [5]:
# Reshape each long table to wide form: one row per period, one column per
# genre, cells holding the 'rating' values (pivot_table aggregates with the
# mean by default if a period/genre pair occurs more than once).
# NOTE: both names are rebound to the wide frames, whose columns are genres
# rather than 'rating', so this cell is expected to fail if it is re-run
# without first reloading the CSVs.
mean_ratings_year = mean_ratings_year.pivot_table(
    values='rating', index=['rating_year'], columns=['genres']
)
mean_ratings_yearmonth = mean_ratings_yearmonth.pivot_table(
    values='rating', index=['rating_date'], columns=['genres']
)

### Only the Drama, Action, Comedy, and Thriller genres are included here; these are the four genres that received the most ratings.

## Without removing anything

In [6]:
# Temporal resolution: year
# Baseline fit: a VAR with one lag on the yearly series of the four selected
# genres, using mean_ratings_year as it stands when this cell runs.
yearly_genres = mean_ratings_year[["Drama", "Action", "Comedy", "Thriller"]]
model = VAR(yearly_genres).fit(maxlags=1)
# Last expression of the cell, so the regression summary is displayed.
model.summary()



  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Wed, 16, Sep, 2020
Time:                     14:56:16
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                   -30.7042
Nobs:                     21.0000    HQIC:                  -31.4831
Log likelihood:           233.648    FPE:                1.77623e-14
AIC:                     -31.6990    Det(Omega_mle):     7.55935e-15
--------------------------------------------------------------------
Results for equation Drama
                 coefficient       std. error           t-stat            prob
------------------------------------------------------------------------------
const               0.553985         1.099227            0.504           0.614
L1.Drama            1.330052         1.212677            1.097           0.273
L1.Action           0.180642         0.315400            0.573      

In [7]:
# Temporal resolution: month
# Baseline fit: a VAR with one lag on the monthly series of the four selected
# genres, using mean_ratings_yearmonth as it stands when this cell runs.
monthly_genres = mean_ratings_yearmonth[["Drama", "Action", "Comedy", "Thriller"]]
model = VAR(monthly_genres).fit(maxlags=1)
# Last expression of the cell, so the regression summary is displayed.
model.summary()



  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Wed, 16, Sep, 2020
Time:                     14:56:28
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                   -28.1699
Nobs:                     252.000    HQIC:                  -28.3373
Log likelihood:           2174.41    FPE:                4.40901e-13
AIC:                     -28.4500    Det(Omega_mle):     4.07578e-13
--------------------------------------------------------------------
Results for equation Drama
                 coefficient       std. error           t-stat            prob
------------------------------------------------------------------------------
const               1.708758         0.194049            8.806           0.000
L1.Drama            0.155968         0.145356            1.073           0.283
L1.Action           0.141832         0.112516            1.261      

## Removing trend

### Temporal resolution: year

In [8]:
# Augmented Dickey-Fuller test for each genre's yearly series. Every printed
# tuple starts with (test statistic, p-value, lags used, observations used).
# Reading the p-values of the recorded run at the 5% level: Drama, Comedy and
# Thriller are stationary; Action is non-stationary (p is about 0.62).
for genre in ("Drama", "Action", "Comedy", "Thriller"):
    print(adfuller(mean_ratings_year[[genre]]))

(-10.96035877405361, 8.356428815531135e-20, 9, 12, {'1%': -4.137829282407408, '5%': -3.1549724074074077, '10%': -2.7144769444444443}, -100.89476880707704)
(-1.3197587431843456, 0.6200848870891525, 9, 12, {'1%': -4.137829282407408, '5%': -3.1549724074074077, '10%': -2.7144769444444443}, -44.97347936742901)
(-3.703220535348564, 0.004068081799419482, 9, 12, {'1%': -4.137829282407408, '5%': -3.1549724074074077, '10%': -2.7144769444444443}, -92.19414982406752)
(-4.253418288447936, 0.0005343630361669577, 9, 12, {'1%': -4.137829282407408, '5%': -3.1549724074074077, '10%': -2.7144769444444443}, -67.41205315283426)


In [9]:
# Subtract an order-1 (linear) polynomial trend from the yearly Action series
# and repeat the ADF test on the residual.
action_detrended = detrend(mean_ratings_year["Action"], order=1, axis=0)
# Recorded run: p is about 0.024, i.e. stationary at the 5% level by p-value.
# NOTE(review): the statistic (-3.13) is still slightly above the 5% critical
# value reported for this sample (-3.15), so the result is borderline.
print(adfuller(action_detrended))

(-3.133823012375228, 0.024149299254310936, 9, 12, {'1%': -4.137829282407408, '5%': -3.1549724074074077, '10%': -2.7144769444444443}, -61.44795927706495)


In [10]:
# Replace the yearly Action column with its linearly detrended (order=1) values.
# NOTE: this overwrites the column in mean_ratings_year, so any cell that reads
# the frame after this point sees the detrended Action series, not the raw one.
detrended_action = detrend(mean_ratings_year["Action"], order=1, axis=0)
mean_ratings_year["Action"] = detrended_action

In [11]:
# Temporal resolution: year
# Same one-lag VAR specification as the baseline yearly fit; the columns are
# read from mean_ratings_year in its current state when this cell runs.
yearly_genres = mean_ratings_year[["Drama", "Action", "Comedy", "Thriller"]]
model = VAR(yearly_genres).fit(maxlags=1)
# Last expression of the cell, so the regression summary is displayed.
model.summary()



  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Wed, 16, Sep, 2020
Time:                     15:00:29
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                   -30.4059
Nobs:                     21.0000    HQIC:                  -31.1847
Log likelihood:           230.516    FPE:                2.39365e-14
AIC:                     -31.4006    Det(Omega_mle):     1.01869e-14
--------------------------------------------------------------------
Results for equation Drama
                 coefficient       std. error           t-stat            prob
------------------------------------------------------------------------------
const               0.850365         1.552959            0.548           0.584
L1.Drama            1.336973         1.225258            1.091           0.275
L1.Action           0.108577         0.255902            0.424      

### Temporal resolution: month (remove trend and seasonality)

In [12]:
# Augmented Dickey-Fuller test for each genre's monthly series. Every printed
# tuple starts with (test statistic, p-value, lags used, observations used).
# In the recorded run all four genres are non-stationary: every p-value lies
# between roughly 0.11 and 0.27, so the unit-root null is not rejected.
for genre in ("Drama", "Action", "Comedy", "Thriller"):
    print(adfuller(mean_ratings_yearmonth[[genre]]))

(-2.5192708700044175, 0.11085564164987549, 6, 246, {'1%': -3.457215237265747, '5%': -2.873361841566324, '10%': -2.5730700760129555}, -815.4149788696141)
(-2.041667897362184, 0.2686071473961835, 4, 248, {'1%': -3.4569962781990573, '5%': -2.8732659015936024, '10%': -2.573018897632674}, -758.1158847818616)
(-2.3375670886757467, 0.16013785694468086, 5, 247, {'1%': -3.457105309726321, '5%': -2.873313676101283, '10%': -2.5730443824681606}, -764.5044921088003)
(-2.2148100909802073, 0.20091422522214586, 4, 248, {'1%': -3.4569962781990573, '5%': -2.8732659015936024, '10%': -2.573018897632674}, -780.3415591878493)


In [23]:
# Difference every genre's monthly series twice -- a lag-1 difference followed
# by a lag-12 difference -- store the result in Dif_DT under 'Dif_<genre>',
# and print an ADF test for each differenced series.
Dif_DT = pd.DataFrame()
for genre in ("Drama", "Action", "Comedy", "Thriller"):
    monthly = mean_ratings_yearmonth[genre]
    # First difference: value minus the value one row earlier.
    first_diff = monthly.diff(1)
    # Remove the seasonal trend: subtract the value 12 rows earlier
    # (one year back, assuming one row per month -- TODO confirm no gaps).
    seasonal_diff = first_diff.diff(12)
    Dif_DT[f"Dif_{genre}"] = seasonal_diff
    print(f"Difference of {genre}")
    # The two differences leave NaN in the leading rows; drop them for the test.
    print(adfuller(seasonal_diff.dropna()))

Difference of Drama
(-6.462294996318811, 1.4332745126068822e-08, 14, 225, {'1%': -3.4597521044060353, '5%': -2.874472927517147, '10%': -2.5736628197530864}, -711.2837068278728)
Difference of Action
(-6.804938988203275, 2.188880452029361e-09, 12, 227, {'1%': -3.4594900381360034, '5%': -2.8743581895178485, '10%': -2.573601605503697}, -651.1026316367721)
Difference of Comedy
(-5.579406192952711, 1.4077840882440738e-06, 13, 226, {'1%': -3.4596204846395824, '5%': -2.8744153028455948, '10%': -2.5736320761218576}, -639.8726218633461)
Difference of Thriller
(-5.581769950039264, 1.3914045125824804e-06, 12, 227, {'1%': -3.4594900381360034, '5%': -2.8743581895178485, '10%': -2.573601605503697}, -663.1912785515266)


In [25]:
# Temporal resolution: month
# Fit the VAR on the differenced monthly series, keeping only rows without NaN.
# With ic='aic' the lag order is chosen by AIC, with maxlags=1 as the upper
# bound on the orders considered.
monthly_differenced = Dif_DT.dropna(axis=0)
model = VAR(monthly_differenced).fit(maxlags=1, ic='aic')
# Last expression of the cell, so the regression summary is displayed.
model.summary()



  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Wed, 16, Sep, 2020
Time:                     15:10:33
--------------------------------------------------------------------
No. of Equations:         4.00000    BIC:                   -25.6073
Nobs:                     239.000    HQIC:                  -25.7809
Log likelihood:           1758.33    FPE:                5.65686e-12
AIC:                     -25.8982    Det(Omega_mle):     5.20724e-12
--------------------------------------------------------------------
Results for equation Dif_Drama
                     coefficient       std. error           t-stat            prob
----------------------------------------------------------------------------------
const                  -0.000135         0.004101           -0.033           0.974
L1.Dif_Drama           -0.097491         0.128512           -0.759           0.448
L1.Dif_Action          -0.282581         0.13185