In [1]:
# import basic packagess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import regression models
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ARDRegression
from sklearn.neighbors import KNeighborsRegressor

# set display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# import file
df = pd.read_excel('Apprentice_Chef_Dataset.xlsx')

Dropping unnecessary features

In [2]:
df.drop(columns = ['NAME', 'FIRST_NAME', 'FAMILY_NAME'], inplace = True)

Adding a dummy variable: company account as 1 and individual accocunt as 0

In [3]:
df['Domain'] = df.EMAIL.str.split('@').str[-1]

individual_account = ['gmail.com', 'hotmail.com', 'yahoo.com', 'msn.com', 'aol.com']
df['Company_Account'] = 1
for index in df.index.tolist():
    if df.loc[index, 'Domain'] in individual_account:
        df.loc[index, 'Company_Account'] = 0

In [4]:
df.drop(columns = ['EMAIL', 'Domain'], inplace = True)

Changing a mislabelled feature

In [5]:
df.rename(columns = {'LARGEST_ORDER_SIZE':'AVG_MEALS_ORDERED'}, inplace = True)

Detecting and Removing Outliers

In [6]:
# both observations are likely to be outliers for tech related issues. 
outlier1 = df.loc[df.AVG_TIME_PER_SITE_VISIT > 1400, :].index
outlier2 = df.loc[df.AVG_PREP_VID_TIME > 500, :].index

df.drop(index = outlier1, inplace = True)
df.drop(index = outlier2, inplace = True)

Changing dtype

In [7]:
# converting features into the right dtype
df.REVENUE = df.REVENUE.astype('float')
df.TOTAL_MEALS_ORDERED = df.TOTAL_MEALS_ORDERED.astype('float')
df.AVG_TIME_PER_SITE_VISIT = df.AVG_TIME_PER_SITE_VISIT.astype('float')
df.AVG_PREP_VID_TIME = df.AVG_PREP_VID_TIME.astype('float')
df.TOTAL_PHOTOS_VIEWED = df.TOTAL_PHOTOS_VIEWED.astype('float')

df.CROSS_SELL_SUCCESS = df.CROSS_SELL_SUCCESS.astype('category')
df.MOBILE_NUMBER = df.MOBILE_NUMBER.astype('category')
df.TASTES_AND_PREFERENCES = df.TASTES_AND_PREFERENCES.astype('category')
df.PACKAGE_LOCKER = df.PACKAGE_LOCKER.astype('category')
df.REFRIGERATED_LOCKER = df.REFRIGERATED_LOCKER.astype('category')
df.Company_Account = df.Company_Account.astype('category')

Continuous Variables 

In [8]:
# Revenue
df['Log_REVENUE'] = np.log10(df.REVENUE)

# TOTAL_MEALS_ORDERED
df['more_than_20_total_order'] = 1
for index in df.index:
    if df.loc[index, 'TOTAL_MEALS_ORDERED'] < 20:
        df.loc[index, 'more_than_20_total_order'] = 0

# AVG_TIME_PER_SITE_VISIT
df['Log_AVG_TIME_PER_SITE_VISIT'] = np.log10(df.AVG_TIME_PER_SITE_VISIT)

# AVG_PREP_VID_TIME
df['Log_AVG_PREP_VID_TIME'] = np.log10(df.AVG_PREP_VID_TIME)
        
#TOTAL_PHOTOS_VIEWED
df['Photos_viewed'] = 1
for index in df.index:
    if df.loc[index, 'TOTAL_PHOTOS_VIEWED'] == 0:
        df.loc[index, 'Photos_viewed'] = 0

Interval / Count Variables

In [9]:
# UNIQUE_MEALS_PURCH
df['more_than_2_unique_meals'] = 1
for index in df.index:
    if df.loc[index, 'UNIQUE_MEALS_PURCH'] < 2:
        df.loc[index, 'more_than_2_unique_meals'] = 0

# CONTACTS_W_CUSTOMER_SERVICE
df['Log_CONTACTS_W_CUSTOMER_SERVICE'] = np.log10(df.CONTACTS_W_CUSTOMER_SERVICE)

# AVG_MEALS_ORDERED
df['Log_AVG_MEALS_ORDERED'] = np.log10(df.AVG_MEALS_ORDERED)

# MEDIAN_MEAL_RATING
df['Log_MEDIAN_MEAL_RATING'] = np.log10(df.MEDIAN_MEAL_RATING)

# AVG_CLICKS_PER_VISIT
df['Log_AVG_CLICKS_PER_VISIT'] = np.log10(df.AVG_CLICKS_PER_VISIT)

# WEEKLY_PLAN
df['Weekly_plan_subs'] = 1
for index in df.index:
    if df.loc[index, 'WEEKLY_PLAN'] == 0:
        df.loc[index, 'Weekly_plan_subs'] = 0

New Features

In [10]:
# Addition
df['Total_Cancellations'] = df.CANCELLATIONS_BEFORE_NOON + df.CANCELLATIONS_AFTER_NOON
df['Total_Logins'] = df.PC_LOGINS + df.MOBILE_LOGINS
df['Total_Deliveries'] = df.EARLY_DELIVERIES + df.LATE_DELIVERIES

# Binary
df['Yes_Locker'] = 1
for index in df.index:
    if df.loc[index, 'PACKAGE_LOCKER'] + df.loc[index, 'REFRIGERATED_LOCKER'] == 0:
        df.loc[index, 'Yes_Locker'] = 0

df["Class_Attended"] = 1
for index in df.index:
    if df.loc[index, 'PACKAGE_LOCKER'] + df.loc[index, 'REFRIGERATED_LOCKER'] == 0:
        df.loc[index, 'Yes_Locker'] = 0

# Division
df['Avg_Time_Per_Click'] = df.AVG_TIME_PER_SITE_VISIT / df.AVG_CLICKS_PER_VISIT
df['Avg_Click_Per_Time'] = df.AVG_CLICKS_PER_VISIT / df.AVG_TIME_PER_SITE_VISIT
df['Avg_Photos_Viewed_Per_Order'] = df.TOTAL_PHOTOS_VIEWED / df.TOTAL_MEALS_ORDERED

# Log Transformation
df['Log_Avg_Time_Per_Click'] = np.log10(df.Avg_Time_Per_Click)
df['Log_Avg_Click_Per_Time'] = np.log10(df.Avg_Click_Per_Time)
df['Log_Avg_Photos_Viewed_Per_Order'] = np.log10(df.Avg_Photos_Viewed_Per_Order + 0.01)

Creating Training and Testing datasets

In [11]:
# preparing explanatory variable data
df_data = df.drop(columns = ['REVENUE', 'Log_REVENUE'])

# preparing response variables
df_target = df.loc[ : , 'REVENUE']
log_df_target = df.loc[ : , 'Log_REVENUE']

# selecting OLS X variables for revenue 
OLS_Revenue_variables = ['CROSS_SELL_SUCCESS',
                         'TOTAL_MEALS_ORDERED',
                         'CONTACTS_W_CUSTOMER_SERVICE',
                         'MASTER_CLASSES_ATTENDED',
                         'TOTAL_PHOTOS_VIEWED',
                         'more_than_20_total_order',
                         'Log_AVG_PREP_VID_TIME',
                         'more_than_2_unique_meals',
                         'AVG_MEALS_ORDERED',
                         'MEDIAN_MEAL_RATING',
                         'Avg_Time_Per_Click',
                         'Avg_Photos_Viewed_Per_Order']

# selecting OLS X variables for log revenue 
OLS_Log_Revenue_variables = ['CROSS_SELL_SUCCESS',
                             'TOTAL_MEALS_ORDERED',
                             'CONTACTS_W_CUSTOMER_SERVICE',
                             'AVG_TIME_PER_SITE_VISIT',
                             'AVG_MEALS_ORDERED',
                             'MASTER_CLASSES_ATTENDED',
                             'more_than_20_total_order',
                             'more_than_2_unique_meals',
                             'Class_Attended',
                             'Log_AVG_PREP_VID_TIME',
                             'Log_MEDIAN_MEAL_RATING',
                             'Log_Avg_Photos_Viewed_Per_Order']

# Creating OLS_Revenue dataset
OLS_Revenue = df.loc[:, OLS_Revenue_variables]

# Creating OLS_Log_Revenue dataset
OLS_Log_Revenue = df.loc[:, OLS_Log_Revenue_variables]

# preparing OLS Revenue training and testing sets
X_train_OLS_Revenue, X_test_OLS_Revenue, y_train, y_test = train_test_split(
            OLS_Revenue,
            df_target,
            test_size = 0.25,
            random_state = 219)

# preparing OLS Log Revenue training and testing sets
X_train_OLS_Log_Revenue, X_test_OLS_Log_Revenue, y_train_log, y_test_log = train_test_split(
            OLS_Log_Revenue,
            log_df_target,
            test_size = 0.25,
            random_state = 219)

# preparing FULL training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
            df_data,
            df_target,
            test_size = 0.25,
            random_state = 219)

# preparing FULL Log training and testing sets
X_train, X_test, y_train_log, y_test_log = train_test_split(
            df_data,
            log_df_target,
            test_size = 0.25,
            random_state = 219)

Building a linear regression model

In [12]:
# INSTANTIATING a model
lr = LinearRegression()

# FITTING AND PREDICTING OLS Revenue
lr.fit(X_train_OLS_Revenue, y_train)
OLS_Revenue_pred = lr.predict(X_test_OLS_Revenue)

LR_Revenue_OLS_Train_Score = lr.score(X_train_OLS_Revenue, y_train).round(4)
LR_Revenue_OLS_Test_Score = lr.score(X_test_OLS_Revenue, y_test).round(4)
LR_Revenue_OLS_gap = abs(LR_Revenue_OLS_Train_Score - LR_Revenue_OLS_Test_Score).round(4)

# Coefficients
# zipping each feature name to its coefficient
lr_model_values = zip(X_train_OLS_Revenue.columns,
                      lr.coef_.round(decimals = 2))

# Creating a coefficient list
lr_OLS_Revenue_model_lst = [('intercept', lr.intercept_.round(decimals = 2))]
for val in lr_model_values:
    lr_OLS_Revenue_model_lst.append(val)
    
    
# FITTING AND PREDICTING OLS Log REvenue
lr.fit(X_train_OLS_Log_Revenue, y_train_log)
OLS_Log_Revenue_pred = lr.predict(X_test_OLS_Log_Revenue)

LR_Log_Revenue_OLS_Train_Score = lr.score(X_train_OLS_Log_Revenue, y_train_log).round(4)
LR_Log_Revenue_OLS_Test_Score = lr.score(X_test_OLS_Log_Revenue, y_test_log).round(4)
LR_Log_Revenue_OLS_gap = abs(LR_Log_Revenue_OLS_Train_Score - LR_Log_Revenue_OLS_Test_Score).round(4)

# Coefficients
# zipping each feature name to its coefficient
lr_model_values = zip(X_test_OLS_Log_Revenue.columns,
                      lr.coef_.round(decimals = 2))

# Creating a coefficient list
lr_OLS_Log_Revenue_model_lst = [('intercept', lr.intercept_.round(decimals = 2))]
for val in lr_model_values:
    lr_OLS_Log_Revenue_model_lst.append(val)
    
    
# FITTING and PREDICTING FULL Revenue
lr.fit(X_train, y_train)
lr_Revenue_pred = lr.predict(X_test)

# Saving scores
LR_Revenue_Train_Score = lr.score(X_train, y_train).round(4)
LR_Revenue_Test_Score = lr.score(X_test, y_test).round(4)
LR_Revenue_gap = abs(LR_Revenue_Train_Score - LR_Revenue_Test_Score).round(4)

# Coefficients
# zipping each feature name to its coefficient
lr_model_values = zip(X_train.columns,
                      lr.coef_.round(decimals = 2))

# Creating a coefficient list
lr_Revenue_model_lst = [('intercept', lr.intercept_.round(decimals = 2))]
for val in lr_model_values:
    lr_Revenue_model_lst.append(val)

# Deleting 0 coefficients
for feature, coefficient in lr_Revenue_model_lst:
        
        if coefficient == 0:
            lr_Revenue_model_lst.remove((feature, coefficient))
    

# FITTING and PREDICTING FULL Log_Revenue
lr.fit(X_train, y_train_log)
lr_Log_Revenue_pred = lr.predict(X_test)

# Saving scores
LR_Log_Revenue_Train_Score = lr.score(X_train, y_train_log).round(4)
LR_Log_Revenue_Test_Score = lr.score(X_test, y_test_log).round(4)
LR_Log_Revenue_gap = abs(LR_Log_Revenue_Train_Score - LR_Log_Revenue_Test_Score).round(4)

# Coefficients
# zipping each feature name to its coefficient
lr_model_values = zip(X_train.columns,
                      lr.coef_.round(decimals = 2))

# Creating a coefficient list
lr_Log_Revenue_model_lst = [('intercept', lr.intercept_.round(decimals = 2))]
for val in lr_model_values:
    lr_Log_Revenue_model_lst.append(val)
    
# Deleting 0 coefficients
for feature, coefficient in lr_Log_Revenue_model_lst:
        
        if coefficient == 0:
            lr_Log_Revenue_model_lst.remove((feature, coefficient))

Building a Lasso model

In [13]:
# INSTANTIATING a model object
lasso = Lasso(alpha = 1.0, normalize = True)

# Fitting and Predicting revenue
lasso.fit(X_train, y_train)
lasso_Revenue_pred = lasso.predict(X_test)

# Saving scores
Lasso_Revenue_Train_Score = lasso.score(X_train, y_train).round(4)
Lasso_Revenue_Test_Score = lasso.score(X_test, y_test).round(4)
Lasso_Revenue_gap = abs(Lasso_Revenue_Train_Score - Lasso_Revenue_Test_Score).round(4)

# Coefficients
# zipping each feature name to its coefficient
lasso_model_values = zip(X_train.columns,
                      lasso.coef_.round(decimals = 2))

# Creating a coefficient list
lasso_Revenue_model_lst = [('intercept', lasso.intercept_.round(decimals = 2))]
for val in lasso_model_values:
    lasso_Revenue_model_lst.append(val)

# Deleting 0 coefficients
for feature, coefficient in lasso_Revenue_model_lst:
        
        if coefficient == 0:
            lasso_Revenue_model_lst.remove((feature, coefficient))

Building an ARD Model

In [14]:
# INSTANTIATING a model object
ard = ARDRegression()

# Fitting and Predicting revenue
ard.fit(X_train, y_train)
ard_Revenue_pred = ard.predict(X_test)

# Saving scores
ard_Revenue_Train_Score = ard.score(X_train, y_train).round(4)
ard_Revenue_Test_Score = ard.score(X_test, y_test).round(4)
ard_Revenue_gap = abs(ard_Revenue_Train_Score - ard_Revenue_Test_Score).round(4)

# Coefficients
# zipping each feature name to its coefficient
ard_model_values = zip(X_train.columns,
                      ard.coef_.round(decimals = 2))

# Creating a coefficient list
ard_Revenue_model_lst = [('intercept', ard.intercept_.round(decimals = 2))]
for val in ard_model_values:
    ard_Revenue_model_lst.append(val)

# Deleting 0 coefficients
for feature, coefficient in ard_Revenue_model_lst:
        
        if coefficient == 0:
            ard_Revenue_model_lst.remove((feature, coefficient))


# FITTING and PREDICTING FULL Log_Revenue
ard.fit(X_train, y_train_log)
ard_Log_Revenue_pred = ard.predict(X_test)

# Saving scores
ard_Log_Revenue_Train_Score = ard.score(X_train, y_train_log).round(4)
ard_Log_Revenue_Test_Score = ard.score(X_test, y_test_log).round(4)
ard_Log_Revenue_gap = abs(ard_Log_Revenue_Train_Score - ard_Log_Revenue_Test_Score).round(4)

# Coefficients
# zipping each feature name to its coefficient
ard_Log_model_values = zip(X_train.columns,
                      ard.coef_.round(decimals = 2))

# Creating a coefficient list
ard_Log_Revenue_model_lst = [('intercept', ard.intercept_.round(decimals = 2))]
for val in ard_Log_model_values:
    ard_Log_Revenue_model_lst.append(val)

# Deleting 0 coefficients
for feature, coefficient in ard_Log_Revenue_model_lst:
        
        if coefficient == 0:
            ard_Log_Revenue_model_lst.remove((feature, coefficient))

Building a KNN model

Standardization

In [15]:
# INSTANTIATING a StandardScaler() object
sc = StandardScaler()

# FITTING and TRANSFORMING
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

# converting scaled data into a DataFrame
X_train_df_sc = pd.DataFrame(X_train_sc)
X_test_df_sc = pd.DataFrame(X_test_sc)

# Adding column names
X_train_df_sc.columns = X_train.columns
X_test_df_sc.columns = X_test.columns

Building a model

In [16]:
# INSTANTIATING a KNN model object
knn_reg = KNeighborsRegressor(algorithm = 'auto',
                              n_neighbors = 11)

# Fitting and Predicting revenue
knn_reg.fit(X_train_df_sc, y_train)
knn_Revenue_pred = knn_reg.predict(X_test_df_sc)

# Saving scores
knn_Revenue_Train_Score = knn_reg.score(X_train_df_sc, y_train).round(4)
knn_Revenue_Test_Score = knn_reg.score(X_test_df_sc, y_test).round(4)
knn_Revenue_gap = abs(knn_Revenue_Train_Score - knn_Revenue_Test_Score).round(4)


# FITTING and PREDICTING FULL Log_Revenue
knn_reg.fit(X_train_df_sc, y_train_log)
knn_Log_Revenue_pred = knn_reg.predict(X_test_df_sc)

# Saving scores
knn_Log_Revenue_Train_Score = knn_reg.score(X_train_df_sc, y_train_log).round(4)
knn_Log_Revenue_Test_Score = knn_reg.score(X_test_df_sc, y_test_log).round(4)
knn_Log_Revenue_gap = abs(knn_Log_Revenue_Train_Score - knn_Log_Revenue_Test_Score).round(4)

Comparing Results

In [17]:
# creating a dictionary for model results
model_performance = {
    
    'Model Type'    : ['OLS_Revenue', 'OLS_Log_Revenue', 'LR_Revenue', 'LR_Log_Revenue', 'Lasso', 'ARD_Revenue', 'ARD_Log_Revenue', 'KNN_Revenue', 'KNN_Log_Revenue'],
           
    'Training' : [LR_Revenue_OLS_Train_Score, LR_Log_Revenue_OLS_Train_Score, LR_Revenue_Train_Score, LR_Log_Revenue_Train_Score, Lasso_Revenue_Train_Score, ard_Revenue_Train_Score, ard_Log_Revenue_Train_Score, knn_Revenue_Train_Score, knn_Log_Revenue_Train_Score],
           
    'Testing'  : [LR_Revenue_OLS_Test_Score, LR_Log_Revenue_OLS_Test_Score, LR_Revenue_Test_Score, LR_Log_Revenue_Test_Score, Lasso_Revenue_Test_Score, ard_Revenue_Test_Score, ard_Log_Revenue_Test_Score, knn_Revenue_Test_Score, knn_Log_Revenue_Test_Score],
                    
    'Train-Test Gap' : [LR_Revenue_OLS_gap, LR_Log_Revenue_OLS_gap, LR_Revenue_gap, LR_Log_Revenue_gap, Lasso_Revenue_gap, ard_Revenue_gap, ard_Log_Revenue_gap, knn_Revenue_gap, knn_Log_Revenue_gap],
                    
    'Model Size' : [len(lr_OLS_Revenue_model_lst), len(lr_OLS_Log_Revenue_model_lst), len(lr_Revenue_model_lst), len(lr_Log_Revenue_model_lst), len(lasso_Revenue_model_lst), len(ard_Revenue_model_lst), len(ard_Log_Revenue_model_lst), 'NA', 'NA'],
                    
    'Model' : [lr_OLS_Revenue_model_lst, lr_OLS_Log_Revenue_model_lst, lr_Revenue_model_lst, lr_Log_Revenue_model_lst, lasso_Revenue_model_lst, ard_Revenue_model_lst, ard_Log_Revenue_model_lst, 'NA', 'NA']}


# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)
model_performance

Unnamed: 0,Model Type,Training,Testing,Train-Test Gap,Model Size,Model
0,OLS_Revenue,0.7329,0.7368,0.0039,13.0,"[(intercept, -4423.07), (CROSS_SELL_SUCCESS, -..."
1,OLS_Log_Revenue,0.798,0.789,0.009,13.0,"[(intercept, 1.77), (CROSS_SELL_SUCCESS, -0.01..."
2,LR_Revenue,0.7539,0.7754,0.0215,45.0,"[(intercept, 3083.22), (CROSS_SELL_SUCCESS, -7..."
3,LR_Log_Revenue,0.8143,0.8041,0.0102,31.0,"[(intercept, 2.16), (CROSS_SELL_SUCCESS, -0.01..."
4,Lasso,0.7237,0.7312,0.0075,25.0,"[(intercept, 966.23), (TOTAL_MEALS_ORDERED, 4...."
5,ARD_Revenue,0.7501,0.7733,0.0232,29.0,"[(intercept, 401.59), (CROSS_SELL_SUCCESS, -50..."
6,ARD_Log_Revenue,0.7864,0.777,0.0094,29.0,"[(intercept, 1.51), (CROSS_SELL_SUCCESS, -0.01..."
7,KNN_Revenue,0.741,0.6976,0.0434,,
8,KNN_Log_Revenue,0.7879,0.7395,0.0484,,


Predictions

In [18]:
# creating a prediction table for revenue
prediction_results = pd.DataFrame(data = {
    'Revenue'               : y_test,
    'OLS_Revenue'           : OLS_Revenue_pred.round(2),
    'LR_Revenue'            : lr_Revenue_pred.round(2),
    'Lasso'                 : lasso_Revenue_pred.round(2),
    'ARD_Revenue'           : ard_Revenue_pred.round(2),
    'KNN_Revenue'           : knn_Revenue_pred.round(2)
    })

# additing deviaitons
for col in prediction_results.columns[1:]:
    prediction_results[col + '_Deviations'] = prediction_results[col] - prediction_results.Revenue

# creating a prediction table for inverse log revenue  
prediction_results_log = pd.DataFrame(data = {
    'Log_Revenue'           : 10 ** y_test_log,
    'OLS_Log_Revenue'       : (10 ** OLS_Log_Revenue_pred).round(2),
    'LR_Log_Revenue'        : (10 ** lr_Log_Revenue_pred).round(2),
    'ARD_Log_Revenue'       : (10 ** ard_Log_Revenue_pred).round(2),
    'KNN_Log_Revenue'       : (10 ** knn_Log_Revenue_pred).round(2)
    })

# additing deviaitons
for col in prediction_results_log.columns[1:]:
    prediction_results_log[col + '_Deviations'] = prediction_results_log[col] - prediction_results_log.Log_Revenue

In [19]:
prediction_results.head()

Unnamed: 0,Revenue,OLS_Revenue,LR_Revenue,Lasso,ARD_Revenue,KNN_Revenue,OLS_Revenue_Deviations,LR_Revenue_Deviations,Lasso_Deviations,ARD_Revenue_Deviations,KNN_Revenue_Deviations
1458,2955.0,2145.71,2356.29,2166.41,2300.75,2352.23,-809.29,-598.71,-788.59,-654.25,-602.77
1006,1135.0,781.39,746.73,1282.16,839.55,1400.5,-353.61,-388.27,147.16,-295.45,265.5
1708,2050.0,3354.06,3104.41,3214.42,3144.77,2861.55,1304.06,1054.41,1164.42,1094.77,811.55
1649,1799.0,1785.12,1705.6,1823.11,1705.87,1634.09,-13.88,-93.4,24.11,-93.13,-164.91
55,2775.0,1733.34,1675.31,1708.94,1745.85,1427.36,-1041.66,-1099.69,-1066.06,-1029.15,-1347.64


In [20]:
prediction_results_log.head()

Unnamed: 0,Log_Revenue,OLS_Log_Revenue,LR_Log_Revenue,ARD_Log_Revenue,KNN_Log_Revenue,OLS_Log_Revenue_Deviations,LR_Log_Revenue_Deviations,ARD_Log_Revenue_Deviations,KNN_Log_Revenue_Deviations
1458,2955.0,1963.54,2184.06,2284.74,2265.83,-991.46,-770.94,-670.26,-689.17
1006,1135.0,1198.74,1092.28,1099.86,1361.03,63.74,-42.72,-35.14,226.03
1708,2050.0,2645.82,2596.19,2204.11,2558.99,595.82,546.19,154.11,508.99
1649,1799.0,1755.74,1734.44,1781.86,1609.19,-43.26,-64.56,-17.14,-189.81
55,2775.0,1683.82,1702.24,1663.99,1381.16,-1091.18,-1072.76,-1111.01,-1393.84


Conclusion - Selecting OLS_Log_Revenue
Even though its predicting power on the training set is the second highest with 0.798, the stability of the model is very good with 0.009. Moreover the small model size only using 13 independent variables allows easier and simpler explanation of the model. Therefore, OLS_Log_Revenue is selected