In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime
import itertools

import aggregator as ag

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: input files, analysis window, book-level columns.
# ---------------------------------------------------------------------------

# Alternative "sample" inputs, kept for reference (swap them in below to use):
#   "distributed_data_collection/databases/review_data_sample.csv"
#   "distributed_data_collection/databases/book_data_sample.csv"
data_file_name_review, data_file_name_book = (
    "distributed_data_collection/databases/review_data.csv",
    "distributed_data_collection/databases/book_data.csv",
)

# Date window handed to the aggregator (start_date first, end_date second).
start_date = datetime.datetime(year=2018, month=1, day=1)
end_date = datetime.datetime(year=2020, month=11, day=29)

# Book-level column names handed to the aggregator alongside the review data.
book_columns = [
    "num_reviews",
    "num_ratings",
    "avg_rating",
]

In [3]:
# Construct the project-local aggregator from the configured file paths,
# book columns and date window. Arguments are passed positionally, in the
# same order as before.
# NOTE(review): the trailing "month" looks like the time-bucket granularity
# and "by_book" like the grouping level -- confirm against aggregator.Aggregator.
data_aggregator = ag.Aggregator(
    data_file_name_review,
    data_file_name_book,
    book_columns,
    start_date,
    end_date,
    "month",
)

# `data` is the frame every later cell works from.
data = data_aggregator.aggregate("by_book")

Aggregator Initiated.
Processing Scraper Output...
Scraper Output Processed.
Aggregating Review Data...
Review Data Aggregated.
Merging Book Data...
Book Data Merged.


In [4]:
# Names of the "post" period columns: every column of `data` whose name
# contains both "review_count" and "2020". These are the regression targets
# used further down, in the order they appear in `data.columns`.
time_periods_post = [
    column_name
    for column_name in data.columns
    if "review_count" in column_name and "2020" in column_name
]

In [5]:
# Shuffle the aggregated frame and split it into train / test partitions.
perc_train = 0.75   # fraction of rows assigned to the training partition
random_seed = 42    # fixed seed so Restart & Run All reproduces the same split

# Shuffle by POSITION: permute 0..n-1 and feed that to .iloc. The previous
# version permuted `data.index` (labels) and passed the labels to .iloc, which
# is only correct when the index happens to be exactly 0..n-1.
# NOTE: `data` is overwritten with the shuffled frame (as before), so re-running
# this cell alone reshuffles the already-shuffled rows; run top to bottom.
shuffle_rng = np.random.default_rng(random_seed)
data = data.iloc[shuffle_rng.permutation(len(data))].reset_index(drop=True)

num_observations_total = len(data)
num_observations_train = int(num_observations_total * perc_train)  # floor
num_observations_test = num_observations_total - num_observations_train

# First `num_observations_train` shuffled rows train, the remainder test.
data_train = data.iloc[:num_observations_train].reset_index(drop=True)
data_test = data.iloc[num_observations_train:].reset_index(drop=True)

# The two partitions must cover every row exactly once.
assert len(data_train) + len(data_test) == num_observations_total

In [7]:
# Fit one linear regression per post-period column and collect its metrics.
#
# Produces:
#   model_dict     -- {post-period column name: fitted LinearRegression}
#   performance_df -- one row per post period with test MSE, train/test R^2
#                     and train/test adjusted R^2.
performance_columns = ["post_period", "mse", "r2_train", "r2_test", "ar2_train", "ar2_test"]
performance_records = []  # one dict per period; turned into a frame once, after the loop
model_dict = {}

for label in time_periods_post:  # ITERATE OVER EACH PERIOD IN THE POST PERIOD

    # REMOVE OTHER POST-PERIODS FROM DATA
    # Only the current target column is kept; every other post-period column is
    # dropped so it cannot be used as a feature. `drop` returns a new frame, so
    # data_train / data_test themselves are left untouched.
    other_post_periods = [period for period in time_periods_post if period != label]
    data_train_period = data_train.drop(columns=other_post_periods)
    data_test_period = data_test.drop(columns=other_post_periods)

    # CREATE TRAINING & TESTING DATA SPECIFIC TO THAT PERIOD
    # `columns=` keyword instead of the positional axis argument (`drop(label, 1)`),
    # which newer pandas versions no longer accept.
    x_train, y_train = data_train_period.drop(columns=label), data_train_period[label]
    x_test, y_test = data_test_period.drop(columns=label), data_test_period[label]

    # FIT MODEL
    model = LinearRegression().fit(x_train, y_train)
    model_dict[label] = model

    # GET METRICS
    # Predict once per partition and reuse the result for every metric.
    predictions_train = model.predict(x_train)
    predictions_test = model.predict(x_test)

    mse = metrics.mean_squared_error(y_test, predictions_test)
    r2_train = metrics.r2_score(y_train, predictions_train)
    r2_test = metrics.r2_score(y_test, predictions_test)

    n_test, n_train = len(x_test), len(x_train)
    p_test, p_train = len(x_test.columns), len(x_train.columns)

    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n rows and
    # p feature columns of the respective partition (p_test equals p_train
    # because both partitions carry the same columns).
    ar2_train = 1 - (1 - r2_train) * (n_train - 1) / (n_train - p_train - 1)
    ar2_test = 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_test - 1)

    metric_dict = {
        "post_period": label,
        "mse": mse,
        "r2_train": r2_train,
        "r2_test": r2_test,
        "ar2_train": ar2_train,
        "ar2_test": ar2_test,
    }
    performance_records.append(metric_dict)

# Build the results frame in one step: DataFrame.append inside the loop copied
# the whole frame on every iteration and is not available in newer pandas.
performance_df = pd.DataFrame(performance_records, columns=performance_columns)

In [8]:
# Show the per-period metrics table as plain text, rounded to 3 decimal places.
print(performance_df.round(decimals=3))

             post_period    mse  r2_train  r2_test  ar2_train  ar2_test
0   review_count 2020-01  2.310     0.810    0.807      0.810     0.805
1   review_count 2020-02  1.464     0.706    0.714      0.705     0.712
2   review_count 2020-03  1.838     0.697    0.643      0.696     0.640
3   review_count 2020-04  2.487     0.737    0.745      0.736     0.743
4   review_count 2020-05  2.778     0.642    0.726      0.641     0.724
5   review_count 2020-06  3.118     0.544    0.587      0.543     0.585
6   review_count 2020-07  2.658     0.570    0.636      0.569     0.633
7   review_count 2020-08  1.727     0.555    0.677      0.554     0.675
8   review_count 2020-09  1.751     0.547    0.562      0.546     0.559
9   review_count 2020-10  1.578     0.489    0.450      0.487     0.447
10  review_count 2020-11  0.016     0.056    0.002      0.053    -0.005
