In [1]:
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime
import itertools
import matplotlib.pyplot as plt

import aggregator as ag

In [2]:
data_file_name_review = "distributed_data_collection/databases/review_data_sample.csv"
data_file_name_book = "distributed_data_collection/databases/book_data_sample.csv"

#data_file_name_review = "distributed_data_collection/databases/review_data.csv"
#data_file_name_book = "distributed_data_collection/databases/book_data.csv"

start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2020, 11, 29)

book_columns = ["num_reviews", "num_ratings", "avg_rating"] ##THIS IS BASELINE
#book_columns = ["num_reviews", "num_ratings", "avg_rating", "book_language", "series", "book_author"]

Theoretically, we could test against multiple grains (ie, day, week, month, quarter). However, I am only working with month. I ruled out the quarterly grain because it wouldn't include enough 2020 time periods to observe a trend in model performance over time. Then because the maximum monthly review counts were in the seventies, I didn't feel I could split those further.

In [3]:
data_aggregator = ag.Aggregator(data_file_name_review, data_file_name_book, book_columns, start_date, end_date, "month")
data = data_aggregator.aggregate("by_book")

Aggregator Initiated.
Processing Scraper Output...
Scraper Output Processed.
Aggregating Review Data...
Review Data Aggregated.
Merging Book Data...
Book Data Merged.


This process will take all the 2018-2019 periods as features and will use those to predict reviews for each 2020 period. Differences in the overall accuracy of each model as well as shifts in the importance of different features to the model may indicate COVID-related change.

In [4]:
time_periods_post = []

for col in data.columns:
    if "review_count" in col:
        if "2020" in col:
            time_periods_post.append(col)

I want to use the same test/train split for every time period, so I need to make the split before selecting which period will be the feature. 

In [5]:
perc_train = 0.75

data = data.iloc[np.random.permutation(data.index)].reset_index(drop=True) #SHUFFLES DATA

num_observations_total = len(data)
num_observations_train = int(num_observations_total* perc_train)
num_observations_test = num_observations_total - num_observations_train

data_train = data.head(num_observations_train).reset_index(drop = True)
data_test = data.tail(num_observations_test).reset_index(drop = True)

We want to iterate through: logorithmic transformation of label and the kind of model used. Eventually, will want to add alpha values and other book meta-data fields.

For the moment, I am turning logorithmic transformation off. It's pretty clear that it has no impact, and I don't want to keep running it while I experiment with other aspects of the modeling process.

In [6]:
regression_types_list = [("linear", LinearRegression()), ("ridge", Ridge(normalize = True)), ("lasso",Lasso(normalize = True))]
is_log_options = [False, True]
#regression_types_list = [("linear", LinearRegression())] ##THIS IS BASELINE

Here goes on modeling! We will select the best model for each time period based on MSE.

In [22]:
#JUST FOR PROGRESS PRINTING

num_models = len(regression_types_list)
num_periods = len(time_periods_post)
num_is_log_options = len(is_log_options)
num_models_total = num_periods *num_models * num_is_log_options
num_models_complete = 0

#ACTUAL MODELING
performance_df = pd.DataFrame(columns = ["post_period", "regression_type", "mse_train", "mse_test", "r2_train", "r2_test"])

for i in range(len(time_periods_post)): #ITERATE OVER EACH PERIOD IN THE POST PERIOD
    
    label = time_periods_post[i]
    
    data_train_period = data_train.copy()
    data_test_period = data_test.copy()
    
    #REMOVE OTHER POST-PERIODS FROM DATA 
    
    for post_period in time_periods_post:

        if post_period != label:
            data_train_period.drop(columns = post_period, inplace = True)
            data_test_period.drop(columns = post_period, inplace = True)
                   
    #CREATE TRAINING & TESTING DATA SPECIFIC TO THAT PERIOD 

    x_train, y_train = data_train_period.drop(label,1), data_train_period[label]
    x_test, y_test = data_test_period.drop(label,1), data_test_period[label]
        
    optimal_model = None
    optimal_regression_type = None
    optimal_mse_test = None
    optimal_is_log = None
    is_none = True
    
    for tup in regression_types_list:
        for is_log in is_log_options:
            
            regression_type, model = tup[0], tup[1]

            model.fit(x_train, y_train)
            mse_test = metrics.mean_squared_error(y_test, model.predict(x_test))

            if is_first:
                optimal_model = model
                optimal_mse_test = mse_test
                optimal_regression_type = regression_type
                optimal_is_log = is_log
                
                is_none = False

            elif mse_test < optimal_mse_test:

                optimal_model = model
                optimal_mse_test = mse_test
                optimal_regression_type = regression_type
                optimal_is_log = is_log

            #PRINT UPDATES

            num_models_complete +=1

            if (num_models_complete % 5 == 0) or (num_models_complete == num_models_total):
                print("{}/{} models processed".format(num_models_complete, num_models_total))
                
    #GET METRICS FOR WINNING MODEL
    
    mse_train = metrics.mean_squared_error(y_test, optimal_model.predict(x_test))
    mse_test = metrics.mean_squared_error(y_test, optimal_model.predict(x_test))

    r2_train = metrics.r2_score(y_train, optimal_model.predict(x_train))
    r2_test = metrics.r2_score(y_test, optimal_model.predict(x_test))
        
    metric_dict = {"post_period": label, "regression_type": optimal_regression_type, "mse_train": mse_train, "mse_test": mse_test, "r2_train": r2_train, "r2_test": r2_test}
    performance_df = performance_df.append(metric_dict, ignore_index=True)

5/42 models processed
10/42 models processed
15/42 models processed
20/42 models processed
25/42 models processed
30/42 models processed
35/42 models processed
40/42 models processed
42/42 models processed


In [23]:
performance_df["post_period"] = performance_df["post_period"].apply(lambda text: text.replace("review_count ", ""))
print(performance_df.round(3))

  post_period regression_type  mse_train  mse_test  r2_train  r2_test
0     2020-01           lasso      0.074     0.074       0.0   -0.008
1     2020-02           lasso      0.005     0.005       0.0    0.000
2     2020-03           lasso      0.002     0.002       0.0    0.000
3     2020-04           lasso      0.110     0.110       0.0   -0.039
4     2020-05           lasso      0.007     0.007       0.0    0.000
5     2020-06           lasso      0.002     0.002       0.0    0.000
6     2020-07           lasso      0.039     0.039       0.0   -0.004
