In [4]:
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime
import itertools
import matplotlib.pyplot as plt

import aggregator as ag

In [5]:
#data_file_name_review = "distributed_data_collection/databases/review_data_sample.csv"
#data_file_name_book = "distributed_data_collection/databases/book_data_sample.csv"

data_file_name_review = "distributed_data_collection/databases/review_data.csv"
data_file_name_book = "distributed_data_collection/databases/book_data.csv"

start_date = datetime.datetime(2018, 1, 1)
end_date = datetime.datetime(2020, 11, 29)

#book_columns = ["num_reviews", "num_ratings", "avg_rating"] ##THIS IS BASELINE
book_columns = ["num_reviews", "num_ratings", "avg_rating", "book_language", "series", "book_author"]

Theoretically, we could test against multiple grains (ie, day, week, month, quarter). However, I am only working with month. I ruled out the quarterly grain because it wouldn't include enough 2020 time periods to observe a trend in model performance over time. Then because the maximum monthly review counts were in the seventies, I didn't feel I could split those further.

In [6]:
data_aggregator = ag.Aggregator(data_file_name_review, data_file_name_book, book_columns, start_date, end_date, "month")
data = data_aggregator.aggregate("by_book")

Aggregator Initiated.
Processing Scraper Output...
Scraper Output Processed.
Aggregating Review Data...
Review Data Aggregated.
Merging Book Data...
Book Data Merged.


I want to drop binary feature columns which have a small number of values. The linear regression regularization will do a rigorous feature selection, so this step is mostly to reduce the amount of data that it has to process.

In [7]:
columns_dropped = []
k = 2

for col in data.columns:
    
    num_values = data[col].nunique()
    
    if num_values == 1:
        columns_dropped.append(col)
        
    elif num_values == 2:
        if data[col].sum() <=2:
            columns_dropped.append(col)
            
for col in columns_dropped:
    data.drop(columns = col, inplace = True)

I want to get a sense of how many features got dropped. I assume a ton!

In [8]:
print(columns_dropped)

['series_Books2Movies Club:', 'series_The Night Trilogy', 'series_The Naturalist', 'series_does this have any more series?', 'series_Is this the first book on a new series?', 'series_Royally', 'series_Caroline Auden', 'series_Alice Vega', 'series_Charles Jenkins', 'series_Philadelphia Legal', 'series_Adler and Dwyer Book', 'series_Middlegame', 'series_The Hundred-Year-Old Man', 'series_25 likes', 'series_SyFy Miniseries', 'series_The Remnant Chronicles', 'series_Jake Brigance', 'series_Standalone? Spin off? New Series? Tell me something!', 'series_Wool', 'series_Bird Box', 'series_Wrath of the Khans', 'series_Lost', 'series_Captain Chase', 'series_Columbia River', 'series_Rush Duet', 'series_Monsters of Verity', "series_The d\\'Artagnan Romances", 'series_The Extinction Files', 'series_Last Survivors', 'series_The Godfather', 'series_Gibson Vaughn', 'series_Blade Runner', "series_Raven\\'s Shadow", 'series_Shadowhunter Chronicles', 'series_All the Birds in the Sky', 'series_Over the Mo

This process will take all the 2018-2019 periods as features and will use those to predict reviews for each 2020 period. Differences in the overall accuracy of each model as well as shifts in the importance of different features to the model may indicate COVID-related change.

In [9]:
time_periods_post = []

for col in data.columns:
    if "review_count" in col:
        if "2020" in col:
            time_periods_post.append(col)

I want to use the same test/train split for every time period, so I need to make the split before selecting which period will be the feature. 

In [10]:
perc_train = 0.75

data = data.iloc[np.random.permutation(data.index)].reset_index(drop=True) #SHUFFLES DATA

num_observations_total = len(data)
num_observations_train = int(num_observations_total* perc_train)
num_observations_test = num_observations_total - num_observations_train

data_train = data.head(num_observations_train).reset_index(drop = True)
data_test = data.tail(num_observations_test).reset_index(drop = True)

We want to iterate through: logorithmic transformation, regularization type, and alpha value. For the moment, I am turning logorithmic transformation off. It's pretty clear that it has no impact, and I don't want to keep running it while I experiment with other aspects of the modeling process.

In [11]:
alpha_list = list(range(1,11))
regression_types_list = ["linear", "ridge", "lasso"]

#is_log_options = [False, True]
is_log_options = [False] 

Here goes on modeling! We will select the best model for each time period based on MSE.

In [12]:
#JUST FOR PROGRESS PRINTING

num_models = len(regression_types_list)
num_periods = len(time_periods_post)
num_is_log_options = len(is_log_options)
num_alphas = len(alpha_list)

num_models_total = (num_periods * num_is_log_options) * ( (num_alphas * (num_models - 1)) + num_models)
num_models_complete = 0
model_dict = {}

#ACTUAL MODELING
performance_df = pd.DataFrame(columns = ["post_period", "regression_type", "is_log", "alpha", "mse_train", "mse_test", "r2_train", "r2_test"])

for i in range(len(time_periods_post)): #ITERATE OVER EACH PERIOD IN THE POST PERIOD
    
    label = time_periods_post[i]
    
    data_train_period = data_train.copy()
    data_test_period = data_test.copy()
    
    #REMOVE OTHER POST-PERIODS FROM DATA 
    
    for post_period in time_periods_post:

        if post_period != label:
            data_train_period.drop(columns = post_period, inplace = True)
            data_test_period.drop(columns = post_period, inplace = True)
                   
    #CREATE TRAINING & TESTING DATA SPECIFIC TO THAT PERIOD 

    x_train, y_train = data_train_period.drop(label,1), data_train_period[label]
    x_test, y_test = data_test_period.drop(label,1), data_test_period[label]
        
    optimal_model = None
    optimal_regression_type = None
    optimal_mse_test = None
    optimal_is_log = None
    optimal_alpha = None
    is_none = True
    
    for regression_type in regression_types_list:
        for is_log in is_log_options:
            for alpha_val in alpha_list:
                
                if regression_type == "linear":
                    model = LinearRegression()
                    
                if regression_type == "ridge":
                    model = Ridge(normalize = True, alpha = alpha_val) 
                    
                if regression_type == "lasso":
                    model = Lasso(normalize = True, alpha = alpha_val)
            
                model.fit(x_train, y_train)
                mse_test = metrics.mean_squared_error(y_test, model.predict(x_test))

                if is_none:
                    optimal_model = model
                    optimal_mse_test = mse_test
                    optimal_regression_type = regression_type
                    optimal_is_log = is_log
                    
                    if regression_type == "linear":
                        alpha_val = None
                    
                    optimal_alpha = alpha_val

                    is_none = False

                elif mse_test < optimal_mse_test:

                    optimal_model = model
                    optimal_mse_test = mse_test
                    optimal_regression_type = regression_type
                    optimal_is_log = is_log
                    
                    if regression_type == "linear":
                        alpha_val = None
                    
                    optimal_alpha = alpha_val

                #PRINT UPDATES

                num_models_complete +=1

                if (num_models_complete % 10 == 0) or (num_models_complete == num_models_total):
                    print("{}/{} models processed".format(num_models_complete, num_models_total))
                    
                if regression_type == "linear":
                    break
                
    #GET METRICS FOR WINNING MODEL
    
    mse_train = metrics.mean_squared_error(y_test, optimal_model.predict(x_test))
    mse_test = metrics.mean_squared_error(y_test, optimal_model.predict(x_test))

    r2_train = metrics.r2_score(y_train, optimal_model.predict(x_train))
    r2_test = metrics.r2_score(y_test, optimal_model.predict(x_test))
        
    metric_dict = {"post_period": label, "regression_type": optimal_regression_type, "is_log": optimal_is_log, "alpha": optimal_alpha, "mse_train": mse_train, "mse_test": mse_test, "r2_train": r2_train, "r2_test": r2_test}
    performance_df = performance_df.append(metric_dict, ignore_index=True)
    model_dict[label] = optimal_model

10/253 models processed
20/253 models processed
30/253 models processed
40/253 models processed
50/253 models processed
60/253 models processed
70/253 models processed
80/253 models processed
90/253 models processed
100/253 models processed
110/253 models processed
120/253 models processed
130/253 models processed
140/253 models processed
150/253 models processed
160/253 models processed
170/253 models processed
180/253 models processed
190/253 models processed
200/253 models processed
210/253 models processed
220/253 models processed
230/253 models processed


In [13]:
performance_df["post_period"] = performance_df["post_period"].apply(lambda text: text.replace("review_count ", ""))
print(performance_df.round(3))

   post_period regression_type is_log alpha  mse_train  mse_test  r2_train  \
0      2020-01           ridge  False     1      2.483     2.483     0.802   
1      2020-02           ridge  False     1      1.515     1.515     0.731   
2      2020-03           ridge  False     1      1.838     1.838     0.712   
3      2020-04           ridge  False     1      2.707     2.707     0.769   
4      2020-05           ridge  False     2      3.460     3.460     0.685   
5      2020-06           ridge  False     2      3.453     3.453     0.590   
6      2020-07           ridge  False     3      2.601     2.601     0.593   
7      2020-08           ridge  False     2      1.852     1.852     0.623   
8      2020-09           ridge  False     2      1.999     1.999     0.587   
9      2020-10           ridge  False     1      2.132     2.132     0.594   
10     2020-11           ridge  False    10      0.013     0.013     0.061   

    r2_test  
0     0.764  
1     0.653  
2     0.651  
3     0