In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime
import itertools

import aggregator as ag

In [2]:
# --- Input files -----------------------------------------------------------
# Alternative "*_sample.csv" inputs (presumably smaller extracts -- confirm);
# uncomment these and comment out the pair below to use them instead.
#data_file_name_review = "distributed_data_collection/databases/review_data_sample.csv"
#data_file_name_book = "distributed_data_collection/databases/book_data_sample.csv"

data_file_name_review = "distributed_data_collection/databases/review_data.csv"
data_file_name_book = "distributed_data_collection/databases/book_data.csv"

# --- Date window handed to the aggregator ------------------------------------
start_date = datetime.datetime(year=2018, month=1, day=1)
end_date = datetime.datetime(year=2020, month=2, day=29)

# --- Modelling settings ------------------------------------------------------
split_perc = 0.75        # fraction of rows passed to train_test_split as train_size
label = "review_count"   # target column that is split off from the features

# Time grains that the experiment loop iterates over.
grain_list = [
    "day",
    "week",
    "month",
    "quarter",
]

# Book-level feature columns: a basic trio, and the full set that extends it.
book_columns_basic = [
    "num_reviews",
    "num_ratings",
    "avg_rating",
]
book_columns_all = book_columns_basic + [
    "series",
    "book_language",
    "book_author",
]

In [3]:
# To evaluate only the basic feature set, use this instead of the enumeration:
#book_column_lists = [book_columns_basic]

# Every non-empty subset of book_columns_all, as a list of column names.
# Subsets are ordered by size (1 column first, all columns last) and, within a
# size, in the order itertools.combinations yields them.
book_column_lists = [
    list(column_subset)
    for subset_size in range(1, len(book_columns_all) + 1)
    for column_subset in itertools.combinations(book_columns_all, subset_size)
]

In [None]:
# Experiment grid: for every (time grain, book-column subset) pair, aggregate
# the data, fit a LinearRegression on a random train split and score it on the
# held-out rows. One result row per pair is collected into performance_df.


def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R^2 for a fit scored on n_samples rows with n_features columns.

    The adjustment divides by the residual degrees of freedom
    (n_samples - n_features - 1). When that is zero or negative the statistic
    is undefined, so NaN is returned rather than dividing by zero or producing
    a sign-flipped value.
    """
    dof = n_samples - n_features - 1
    if dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * (n_samples - 1) / dof


# Fixed seed so the train/test split (and therefore every metric) is the same
# on each Restart & Run All.
split_seed = 42

performance_columns = ["grain", "book_columns", "mse", "r2", "ar2"]
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
performance_records = []

num_tests = len(grain_list) * len(book_column_lists)
num_tests_complete = 0

for grain in grain_list:
    for book_column_list in book_column_lists:

        ##GET & SPLIT DATA
        data_aggregator = ag.Aggregator(data_file_name_review, data_file_name_book, book_column_list, start_date, end_date, grain, print_updates = False)
        # NOTE(review): aggregate("by_date") is treated as returning a DataFrame
        # that contains the label column -- confirm against aggregator.py.
        data = data_aggregator.aggregate("by_date")
        # columns= keyword instead of a positional axis, which pandas 2.0 rejects.
        x, y = data.drop(columns=label), data[label]
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, train_size=split_perc, random_state=split_seed
        )

        ##FIT MODEL
        model = LinearRegression().fit(x_train, y_train)

        ## GET METRICS
        # Predict once and reuse the result for both metrics.
        y_pred = model.predict(x_test)
        mse = metrics.mean_squared_error(y_test, y_pred)
        r2 = metrics.r2_score(y_test, y_pred)
        ar2 = _adjusted_r2(r2, n_samples=len(x_test), n_features=len(x_test.columns))

        performance_records.append({
            "grain": grain,
            "book_columns": str(book_column_list),
            "mse": mse,
            "r2": r2,
            "ar2": ar2,
        })

        num_tests_complete += 1

        # Progress report every 10 runs and after the final one.
        if (num_tests_complete % 10 == 0) or (num_tests_complete == num_tests):
            # Function-call form works on both Python 2 and Python 3.
            print("{}/{} tests complete".format(num_tests_complete, num_tests))

# columns= keeps the column order (and the columns themselves when there are
# no records at all).
performance_df = pd.DataFrame(performance_records, columns=performance_columns)

In [None]:
# Show summary statistics of the collected results.
# To print every result row instead, uncomment:
#print(performance_df)
performance_summary = performance_df.describe()
print(performance_summary)