In [1]:
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime
import itertools
import matplotlib.pyplot as plt

import aggregator as ag

In [14]:
#data_file_name_review = "distributed_data_collection/databases/review_data_sample.csv"
#data_file_name_book = "distributed_data_collection/databases/book_data_sample.csv"

data_file_name_review = "distributed_data_collection/databases/review_data.csv"
data_file_name_book = "distributed_data_collection/databases/book_data.csv"

start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2020, 11, 29)

book_columns = ["num_reviews", "num_ratings", "avg_rating"]

In [15]:
data_aggregator = ag.Aggregator(data_file_name_review, data_file_name_book, book_columns, start_date, end_date, "month")
data = data_aggregator.aggregate("by_book")

Aggregator Initiated.
Processing Scraper Output...
Scraper Output Processed.
Aggregating Review Data...
Review Data Aggregated.
Merging Book Data...
Book Data Merged.


In [16]:
#print(data.describe())

            book_id  review_count 2018-01  review_count 2018-02  \
count  1.630700e+04          16307.000000          16307.000000   
mean   2.231541e+07              0.766787              0.531857   
std    1.711834e+07              2.445657              1.618575   
min    1.000000e+00              0.000000              0.000000   
25%    2.015556e+06              0.000000              0.000000   
50%    2.384856e+07              0.000000              0.000000   
75%    3.666239e+07              1.000000              1.000000   
max    5.502996e+07             77.000000             60.000000   

       review_count 2018-03  review_count 2018-04  review_count 2018-05  \
count          16307.000000          16307.000000          16307.000000   
mean               0.549212              0.520574              0.512050   
std                1.616661              1.537739              1.512294   
min                0.000000              0.000000              0.000000   
25%                0.

In [4]:
time_periods_post = []

for col in data.columns:
    if "review_count" in col:
        if "2020" in col:
            time_periods_post.append(col)

In [17]:
perc_train = 0.75

data = data.iloc[np.random.permutation(data.index)].reset_index(drop=True) #SHUFFLES DATA

num_observations_total = len(data)
num_observations_train = int(num_observations_total* perc_train)
num_observations_test = num_observations_total - num_observations_train

data_train = data.head(num_observations_train).reset_index(drop = True)
data_test = data.tail(num_observations_test).reset_index(drop = True)

In [20]:
is_log_list = [True, False]
num_periods = len(time_periods_post)
num_logs = len(is_log_list)

num_models_total = num_periods * num_logs
num_models_complete = 0

performance_df = pd.DataFrame(columns = ["post_period", "is_log", "mse", "r2_train", "r2_test", "ar2_train", "ar2_test"])

for i in range(len(time_periods_post)): #ITERATE OVER EACH PERIOD IN THE POST PERIOD 
    
    label = time_periods_post[i]
    
    data_train_period = data_train.copy()
    data_test_period = data_test.copy()
    
    #REMOVE OTHER POST-PERIODS FROM DATA 
    
    for post_period in time_periods_post:
        if post_period != label:
            data_train_period.drop(columns = post_period, inplace = True)
            data_test_period.drop(columns = post_period, inplace = True)
            
    #LOG TRANSFORMATION
    
    for is_log in is_log_list:
        
        impute_log_val = 0.0001
        
        if is_log:
            for df in [data_train_period, data_test_period]:
                df[label] = df[label].apply(lambda lab: np.log(lab) if lab != 0 else impute_log_val)
            
    #CREATE TRAINING & TESTING DATA SPECIFIC TO THAT PERIOD 

        x_train, y_train = data_train_period.drop(label,1), data_train_period[label]
        x_test, y_test = data_test_period.drop(label,1), data_test_period[label]

    #FIT MODEL 

        model = LinearRegression().fit(x_train, y_train)

    ## GET METRICS 
        mse = metrics.mean_squared_error(y_test, model.predict(x_test))

        r2_train = metrics.r2_score(y_train, model.predict(x_train))
        r2_test = metrics.r2_score(y_test, model.predict(x_test))

        n_test, n_train = len(x_test), len(x_train)
        p_test, p_train = len(x_test.columns), len(x_train.columns)

        ar2_train = 1 - (1 - r2_train) * (n_train - 1) / (n_train - p_train - 1)
        ar2_test = 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_train - 1)

        metric_dict = {"post_period": label, "is_log": is_log, "mse": mse, "r2_train": r2_train, "r2_test": r2_test, "ar2_train": ar2_train, "ar2_test": ar2_test}
        performance_df = performance_df.append(metric_dict, ignore_index=True)
        
    ##PRINT UPDATES 
        
        num_models_complete +=1
        
        if (num_models_complete % 5 == 0) or (num_models_complete == num_models_total):
            print("{}/{} models complete".format(num_models_complete, num_models_total))

5/14 models complete
10/14 models complete
14/14 models complete


In [21]:
print(performance_df.round(3))

             post_period is_log    mse  r2_train  r2_test  ar2_train  ar2_test
0   review_count 2020-01   True  0.165     0.514    0.520      0.513     0.516
1   review_count 2020-01  False  0.165     0.514    0.520      0.513     0.516
2   review_count 2020-02   True  0.121     0.481    0.491      0.479     0.487
3   review_count 2020-02  False  0.121     0.481    0.491      0.479     0.487
4   review_count 2020-03   True  0.128     0.481    0.477      0.479     0.473
5   review_count 2020-03  False  0.128     0.481    0.477      0.479     0.473
6   review_count 2020-04   True  0.177     0.491    0.462      0.489     0.458
7   review_count 2020-04  False  0.177     0.491    0.462      0.489     0.458
8   review_count 2020-05   True  0.181     0.480    0.460      0.478     0.456
9   review_count 2020-05  False  0.181     0.480    0.460      0.478     0.456
10  review_count 2020-06   True  0.168     0.487    0.458      0.485     0.454
11  review_count 2020-06  False  0.168     0.487    