In [31]:
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime
import itertools
import matplotlib.pyplot as plt

import aggregator as ag

In [14]:
# Input CSVs produced by the distributed scraper (full data set).
# For a quick run, point these at the sample files instead:
#   "distributed_data_collection/databases/review_data_sample.csv"
#   "distributed_data_collection/databases/book_data_sample.csv"
data_file_name_review = "distributed_data_collection/databases/review_data.csv"
data_file_name_book = "distributed_data_collection/databases/book_data.csv"

# Date window handed to the aggregator.
start_date = datetime.datetime(year=2018, month=1, day=1)
end_date = datetime.datetime(year=2020, month=11, day=29)

# Book-level columns handed to the aggregator.
book_columns = [
    "num_reviews",
    "num_ratings",
    "avg_rating",
]

In [15]:
# Build the aggregator from the two scraper CSVs, the book columns to keep and
# the date window.
# NOTE(review): the trailing "month" argument presumably selects the time
# granularity of the aggregation -- confirm against aggregator.Aggregator.
data_aggregator = ag.Aggregator(
    data_file_name_review,
    data_file_name_book,
    book_columns,
    start_date,
    end_date,
    "month",
)

# NOTE(review): "by_book" presumably yields one row per book -- confirm.
data = data_aggregator.aggregate("by_book")

Aggregator Initiated.
Processing Scraper Output...
Scraper Output Processed.
Aggregating Review Data...
Review Data Aggregated.
Merging Book Data...
Book Data Merged.


In [45]:
# Candidate label columns: every column whose name contains both
# "review_count" and "2020" (e.g. "review_count 2020-01").
time_periods_post = [
    column_name
    for column_name in data.columns
    if "review_count" in column_name and "2020" in column_name
]

In [46]:
# Fraction of observations assigned to the training set.
perc_train = 0.75

# Fixed seed so the shuffle -- and therefore the split and every metric derived
# from it -- is identical on each Restart & Run All.
RANDOM_SEED = 42

# Shuffle the rows. DataFrame.sample permutes rows directly, so this is correct
# for any index; the previous iloc[permutation(data.index)] form used index
# *labels* as positions and was only valid for a clean 0..n-1 index.
# Note: `data` is reassigned here, so re-running only this cell reshuffles the
# already-shuffled frame.
data = data.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

num_observations_total = len(data)
num_observations_train = int(num_observations_total * perc_train)
num_observations_test = num_observations_total - num_observations_train

# First `num_observations_train` shuffled rows train the models; the rest are held out.
data_train = data.iloc[:num_observations_train].reset_index(drop=True)
data_test = data.iloc[num_observations_train:].reset_index(drop=True)

Observation: The maximum review counts are too low for a log transformation to make a difference, so I'm disabling the log transformation for the moment just to make things run faster.

In [47]:
# Label transformations to evaluate. The log transformation is switched off for
# now; use [True, False] to compare both variants.
is_log_list = [False]

# (name, estimator) pairs fitted for every post period.
# NOTE(review): the `normalize=` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions these constructors raise a TypeError and the
# scaling has to move into an explicit preprocessing step.
model_list = [
    ("linear", LinearRegression()),
    ("ridge", Ridge(normalize=True)),
    ("lasso", Lasso(normalize=True)),
]

num_logs = len(is_log_list)
num_periods = len(time_periods_post)
num_models = len(model_list)

# One fit per (post period, label transformation, model) combination.
num_models_total = num_periods * num_logs * num_models

In [51]:
num_models_complete = 0

# Value used in place of log(0) when the label is log-transformed.
# NOTE(review): this is substituted for the *logged* value, not for the raw
# count, so a zero count maps to 0.0001 (just above log(1) == 0). Confirm that
# this is intended rather than np.log(impute_log_val).
impute_log_val = 0.0001


def _log_or_impute(value):
    """Return log(value), or `impute_log_val` when value is exactly zero."""
    return np.log(value) if value != 0 else impute_log_val


# Collected as plain dicts and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
performance_records = []

for label in time_periods_post:  # ITERATE OVER EACH PERIOD IN THE POST PERIOD

    # REMOVE OTHER POST-PERIODS FROM DATA
    # (drop returns new frames, so data_train / data_test are left untouched)
    other_post_periods = [period for period in time_periods_post if period != label]
    data_train_period = data_train.drop(columns=other_post_periods)
    data_test_period = data_test.drop(columns=other_post_periods)

    # CREATE TRAINING & TESTING DATA SPECIFIC TO THAT PERIOD
    x_train = data_train_period.drop(columns=label)
    x_test = data_test_period.drop(columns=label)

    n_train, p_train = x_train.shape
    n_test, p_test = x_test.shape

    for is_log in is_log_list:

        # LOG TRANSFORMATION
        # Start from the raw label on every pass so that a log-transformed pass
        # can never leak into a later untransformed one.
        y_train = data_train_period[label]
        y_test = data_test_period[label]

        if is_log:
            y_train = y_train.apply(_log_or_impute)
            y_test = y_test.apply(_log_or_impute)

        # FIT MODEL
        for model_name, model in model_list:

            model.fit(x_train, y_train)

            # GET METRICS (each prediction is computed once and reused)
            predictions_train = model.predict(x_train)
            predictions_test = model.predict(x_test)

            mse = metrics.mean_squared_error(y_test, predictions_test)

            r2_train = metrics.r2_score(y_train, predictions_train)
            r2_test = metrics.r2_score(y_test, predictions_test)

            # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), using each
            # split's own observation and feature counts.
            ar2_train = 1 - (1 - r2_train) * (n_train - 1) / (n_train - p_train - 1)
            ar2_test = 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_test - 1)

            performance_records.append({
                "post_period": label,
                "model": model_name,
                "is_log": is_log,
                "mse": mse,
                "r2_train": r2_train,
                "r2_test": r2_test,
                "ar2_train": ar2_train,
                "ar2_test": ar2_test,
            })

            # PRINT UPDATES
            num_models_complete += 1

            if (num_models_complete % 5 == 0) or (num_models_complete == num_models_total):
                print("{}/{} models complete".format(num_models_complete, num_models_total))

# One row per (post period, label transformation, model) fit.
performance_df = pd.DataFrame(
    performance_records,
    columns=["post_period", "model", "is_log", "mse", "r2_train", "r2_test", "ar2_train", "ar2_test"],
)

5/33 models complete
10/33 models complete
15/33 models complete
20/33 models complete
25/33 models complete
30/33 models complete
33/33 models complete


In [50]:
# Show every fit's metrics, rounded to three decimals for readability.
print(performance_df.round(decimals=3))

             post_period   model is_log     mse  r2_train  r2_test  ar2_train  \
0   review_count 2020-01  linear  False   2.697     0.814    0.793      0.814   
1   review_count 2020-01   ridge  False   2.990     0.780    0.771      0.779   
2   review_count 2020-01   lasso  False  13.045     0.000   -0.000     -0.002   
3   review_count 2020-02  linear  False   1.927     0.721    0.672      0.720   
4   review_count 2020-02   ridge  False   2.055     0.696    0.650      0.695   
5   review_count 2020-02   lasso  False   5.877     0.000   -0.000     -0.002   
6   review_count 2020-03  linear  False   2.581     0.716    0.591      0.716   
7   review_count 2020-03   ridge  False   2.632     0.696    0.583      0.695   
8   review_count 2020-03   lasso  False   6.322     0.000   -0.000     -0.002   
9   review_count 2020-04  linear  False   3.542     0.757    0.682      0.756   
10  review_count 2020-04   ridge  False   3.578     0.738    0.679      0.737   
11  review_count 2020-04   l