In [1]:
# Basic Packages
import pandas as pd
import numpy as np
import scipy
import itertools
from collections import ChainMap

# OLS and 2SLS
import statsmodels
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.api import add_constant

# Linear Models Packages
import linearmodels.iv as iv
from linearmodels.iv import IV2SLS
from linearmodels.iv import compare
from linearmodels import PanelOLS

# Graph Packages
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

#Import MRMR
from mrmr import mrmr_classif

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Import dataframes
def read_oil_files(oil_list, chosen_lag, data_dir='./quantity/'):
    """Load one CSV per (oil, lag) combination into a dictionary of DataFrames.

    Each file is expected to be named ``<oil>_oil_<lag>m.csv`` inside ``data_dir``.

    Parameters
    ----------
    oil_list : iterable of str
        Oil name prefixes used in the file names, e.g. ``['palm', 'soybean']``.
    chosen_lag : iterable
        Lag values used in the file names (converted with ``str``).
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to ``'./quantity/'``; pass
        another folder (for example a mounted drive) to read from elsewhere
        without editing this function.

    Returns
    -------
    dict
        Maps ``(df_name, oil, lag)`` tuples to the DataFrame read from the
        matching file, where ``df_name`` is ``'<oil>_oil_<lag>m_df'``.
    """
    import os  # local import so the function does not rely on a new top-level import

    store_dict = {}

    for oil_file in oil_list:
        for lag in chosen_lag:
            stem = oil_file + '_oil_' + str(lag) + 'm'
            file_path = os.path.join(data_dir, stem + '.csv')
            df_name = stem + '_df'

            store_dict[(df_name, oil_file, lag)] = pd.read_csv(file_path)
    return store_dict

In [4]:
def extract_variable_values(data_dict, target_var, csv_list):
    """Split each requested DataFrame into predictors and target.

    Parameters
    ----------
    data_dict : dict
        Maps keys to DataFrames.
    target_var : str
        Name of the target column; it is removed from the predictors.
    csv_list : iterable
        Keys of ``data_dict`` to process.

    Returns
    -------
    dict
        Maps each key in ``csv_list`` to a two-element list ``[x, y]`` where
        ``y`` is the target column and ``x`` is the frame without that column.
    """
    def _split(frame):
        # Read the target first, then build the predictor frame without it.
        target = frame[target_var]
        predictors = frame.drop(columns=[target_var])
        return [predictors, target]

    return {key: _split(data_dict[key]) for key in csv_list}

In [5]:
def extract_variable_values_mrmr(data_dict, target_var, csv_list, col_to_keep):
    """Split each requested DataFrame into a chosen predictor subset and the target.

    Parameters
    ----------
    data_dict : dict
        Maps keys to DataFrames.
    target_var : str
        Name of the target column.
    csv_list : sequence
        Keys of ``data_dict`` to process.
    col_to_keep : sequence
        ``col_to_keep[i]`` holds the predictor columns to keep for
        ``csv_list[i]``; the two sequences are matched by position.

    Returns
    -------
    dict
        Maps each key in ``csv_list`` to a two-element list ``[x, y]``.
    """
    var_dict = {}
    for position, key in enumerate(csv_list):
        frame = data_dict[key]
        target = frame[target_var]
        # Index lookup (not zip) so a too-short col_to_keep still raises IndexError.
        predictors = frame[col_to_keep[position]]
        var_dict[key] = [predictors, target]

    return var_dict

In [6]:
def get_mrmr_features(selected_data_X, selected_y, selected_K):
    """Run MRMR feature selection and return its result.

    Thin wrapper: forwards ``selected_data_X``, ``selected_y`` and
    ``selected_K`` positionally to ``mrmr_classif`` and returns whatever it
    returns, unchanged.
    """
    # NOTE(review): mrmr_classif is the classification variant of MRMR, while the
    # callers in this notebook pass 'sales_quantity_on_month' as the target, which
    # looks like a continuous quantity - confirm a classification scorer is intended.
    return mrmr_classif(selected_data_X, selected_y, selected_K)

In [7]:
def run_normal_regression(x, y, oil_name, lag_time):
    """Fit an ordinary least squares model with an added constant.

    Parameters
    ----------
    x : predictors passed to ``sm.add_constant`` before fitting.
    y : response passed to ``sm.OLS``.
    oil_name : str
        Used in the row label and stored in the result row.
    lag_time :
        Used (via ``str``) in the row label and stored in the result row.

    Returns
    -------
    tuple
        ``(result_dict, params)`` where ``result_dict`` has the single key
        ``'<oil_name>_<lag_time>_OLS'`` mapping to
        ``[adjusted R-squared, number of params, oil_name, lag_time, False, 0]``
        (the last two entries are the regularization flag and alpha), and
        ``params`` are the fitted coefficients.
    """
    design = sm.add_constant(x)
    fitted = sm.OLS(y, design).fit()

    label = oil_name + '_' + str(lag_time) + '_OLS'
    # Row layout: adjusted r-squared, no. of params, oil name, lag time, regularization (Y/N), alpha
    summary_row = [fitted.rsquared_adj, len(fitted.params), oil_name, lag_time, False, 0]

    return ({label: summary_row}, fitted.params)

In [8]:
def run_regularized_regression(x, y, oil_name, lag_time, ols_params, alpha_range=(0.2, 0.3, 0.4, 0.5)):
    """Fit regularized OLS models over a set of penalty weights.

    For every ``alpha`` the model is fitted with
    ``fit_regularized(L1_wt=0, alpha=alpha, start_params=ols_params)``. The
    resulting coefficients are wrapped in an ``OLSResults`` object (reusing the
    ``normalized_cov_params`` of the unregularized fit) so that an adjusted
    R-squared can be read from it.

    Parameters
    ----------
    x : predictors passed to ``sm.add_constant`` before fitting.
    y : response passed to ``sm.OLS``.
    oil_name : str
        Used in the row labels and stored in each result row.
    lag_time :
        Used (via ``str``) in the row labels and stored in each result row.
    ols_params :
        Starting coefficients handed to ``fit_regularized``.
    alpha_range : iterable of float, optional
        Penalty weights to try. The default lists the values exactly instead of
        generating them with ``np.arange(0.2, 0.6, 0.1)``, which accumulates
        floating-point error and produced labels such as
        ``regOLS_0.30000000000000004``.

    Returns
    -------
    dict
        Maps ``'<oil_name>_<lag_time>_regOLS_<alpha>'`` to
        ``[adjusted R-squared, number of params, oil_name, lag_time, True, alpha]``.
    """
    result_dict = {}

    x_ols = sm.add_constant(x)
    model = sm.OLS(y, x_ols)
    # Unregularized fit: only its normalized_cov_params are reused below.
    model_fit = model.fit()

    # NOTE(review): a scalar alpha is presumably applied to every coefficient,
    # including the added constant - confirm that penalizing the intercept is intended.
    for alpha in alpha_range:
        row_name = oil_name + '_' + str(lag_time) + '_' + 'regOLS_' + str(alpha)
        results_reg_ols = model.fit_regularized(L1_wt=0, alpha=alpha, start_params=ols_params)
        results_reg_ols_fit = sm.regression.linear_model.OLSResults(model, results_reg_ols.params, model_fit.normalized_cov_params)

        # Row layout: adjusted r-squared, no. of params, oil name, lag time, regularization (Y/N), alpha
        result_dict[row_name] = [results_reg_ols_fit.rsquared_adj, len(results_reg_ols.params), oil_name, lag_time, True, alpha]
    return result_dict

In [9]:
def aggregate_all_fuction(oil_list, time_lag, feature_selection, mrmr_features_no = 42):
    """Run plain and regularized OLS for every (oil, lag) file and collect the results.

    Parameters
    ----------
    oil_list : iterable of str
        Oil name prefixes, forwarded to ``read_oil_files``.
    time_lag : iterable
        Lag values, forwarded to ``read_oil_files``.
    feature_selection : {'Y', 'N'}
        ``'Y'`` restricts the predictors of each file to the features returned
        by ``get_mrmr_features``; ``'N'`` uses every column except the target.
    mrmr_features_no : int, optional
        Number of features requested from ``get_mrmr_features`` (only used
        when ``feature_selection == 'Y'``).

    Returns
    -------
    collections.ChainMap or dict
        Result rows keyed by row label, chained together one file at a time.
        An empty ``dict`` is returned when no files were read.

    Raises
    ------
    ValueError
        If ``feature_selection`` is neither ``'Y'`` nor ``'N'``.
    """
    # Validate before any file is read: previously an unexpected flag skipped both
    # branches and only failed later with an UnboundLocalError on `var`.
    if feature_selection not in ('Y', 'N'):
        raise ValueError("feature_selection must be 'Y' or 'N', got " + repr(feature_selection))

    target_var = 'sales_quantity_on_month'

    data_dict_clean = read_oil_files(oil_list, time_lag)
    csv_list = list(data_dict_clean.keys())

    # Full predictor set: used as-is for 'N' and as the MRMR input for 'Y'.
    var = extract_variable_values(data_dict_clean, target_var, csv_list)

    if feature_selection == 'Y':
        selected_features = [
            get_mrmr_features(var[csv][0], var[csv][1], mrmr_features_no)
            for csv in csv_list
        ]
        var = extract_variable_values_mrmr(data_dict_clean, target_var, csv_list, selected_features)

    final_ols_dict = {}
    for csv in csv_list:
        x, y = var[csv]
        # Keys are (df_name, oil_name, lag) tuples built by read_oil_files.
        oil_name, lag = csv[1], csv[2]
        normal_ols_result = run_normal_regression(x, y, oil_name, lag)
        # The plain OLS coefficients seed the regularized fits.
        reg_ols_result = run_regularized_regression(x, y, oil_name, lag, normal_ols_result[1])
        final_ols_result = ChainMap(normal_ols_result[0], reg_ols_result)
        final_ols_dict = ChainMap(final_ols_dict, final_ols_result)
    return final_ols_dict

In [10]:
def run_final_regression(df, selected_features):
    """Fit OLS of 'sales_quantity_on_month' on the chosen columns and return the summary.

    Parameters
    ----------
    df : DataFrame containing the target column and every name in ``selected_features``.
    selected_features : list of str
        Predictor columns; a constant is added via ``sm.add_constant``.

    Returns
    -------
    The object returned by the fitted model's ``summary()``.
    """
    response = df['sales_quantity_on_month']
    predictors = df[selected_features]

    fitted = sm.OLS(response, sm.add_constant(predictors)).fit()
    return fitted.summary()

In [11]:
def remove_features_regress(df, selected_list, remove_list):
    """Drop unwanted features and rerun the final regression.

    Keeps the entries of ``selected_list`` that do not appear in
    ``remove_list`` (original order preserved) and passes them, together with
    ``df``, to ``run_final_regression``; its result is returned unchanged.
    """
    kept_features = [name for name in selected_list if name not in remove_list]
    return run_final_regression(df, kept_features)

In [12]:
def run_regression_return_df(oil_list, mrmr_features_no = 42):
    """Print the model-comparison tables with and without MRMR feature selection.

    Calls ``aggregate_all_fuction`` for lags 1, 2, 3 and 6, once with feature
    selection ('Y') and once without ('N'), turns each result into a DataFrame
    and prints both.

    Note: despite its name this function returns ``None``; the tables are only
    printed. The 'rsquared' column holds whatever the regression helpers put in
    the first slot of each result row.
    """
    lags = [1, 2, 3, 6]
    column_names = ['rsquared', 'features_len', 'oil_name', 'time_lag', 'regularization', 'alpha']

    def _as_frame(selection_flag):
        # One table per feature-selection mode, indexed by row label.
        results = aggregate_all_fuction(oil_list, lags, selection_flag, mrmr_features_no)
        return pd.DataFrame.from_dict(results, orient='index', columns=column_names)

    # Build both tables before printing anything, as the original did.
    with_selection = _as_frame('Y')
    without_selection = _as_frame('N')

    print(with_selection)
    print(without_selection)

    return None

In [13]:
# Palm oil
# Compare plain and regularized OLS across all lags, keeping 30 MRMR features
run_regression_return_df(['palm'], mrmr_features_no=30)

100%|██████████| 30/30 [00:01<00:00, 23.09it/s]
100%|██████████| 30/30 [00:01<00:00, 26.90it/s]
100%|██████████| 30/30 [00:01<00:00, 26.98it/s]
100%|██████████| 30/30 [00:01<00:00, 27.12it/s]


                                   rsquared  features_len oil_name  time_lag  \
palm_6_regOLS_0.2                  0.519947            31     palm         6   
palm_6_regOLS_0.30000000000000004  0.498552            31     palm         6   
palm_6_regOLS_0.4000000000000001   0.479827            31     palm         6   
palm_6_regOLS_0.5000000000000001   0.463461            31     palm         6   
palm_6_OLS                         0.570913            31     palm         6   
palm_3_regOLS_0.2                  0.549294            31     palm         3   
palm_3_regOLS_0.30000000000000004  0.528935            31     palm         3   
palm_3_regOLS_0.4000000000000001   0.510735            31     palm         3   
palm_3_regOLS_0.5000000000000001   0.494646            31     palm         3   
palm_3_OLS                         0.586137            31     palm         3   
palm_2_regOLS_0.2                  0.537879            31     palm         2   
palm_2_regOLS_0.30000000000000004  0.516

In [14]:
# Palm oil: the 3-month lag performed best with 30 features.
# Load that file and list the 30 features used for the final regression.
df_palm = pd.read_csv('./quantity/palm_oil_3m.csv')
selected_features_palm = [
    'sales_on_month_log',
    'soy_Count(t-3)_sentiment',
    'cattle_Count(t-3)_sentiment',
    'coconut oil(t-3)_sentiment',
    'month',
    'sales_quantity_on_month(t-3)_cust_log',
    'ethanol(t-3)_sentiment',
    'Unnamed: 0',
    'rapeseed oil_Count(t-3)_sentiment',
    'commodity(t-3)_sentiment',
    'sales_frequency(t-3)_cust_log',
    'market(t-3)_sentiment',
    'sales_frequency_over_year(t-3)_cust_log',
    '2m_close(t-3)_oils_log',
    'ethanol_Count(t-3)_sentiment',
    'sunflower oil(t-3)_sentiment',
    'sales_frequency_over_quarter(t-3)_cust_log',
    '2m_openinterest_pctgrowth_2m(t-3)_fx_log',
    'sales_on_month(t-3)_cust_log',
    '2m_openinterest(t-3)_fx',
    's&p_Count(t-3)_sentiment',
    'coconut oil_Count(t-3)_sentiment',
    'cattle(t-3)_sentiment',
    '2m_openinterest_pctgrowth_2m(t-3)_oils',
    'cumulative_sales_till_month(t-3)_cust_log',
    'platinum_Count(t-3)_sentiment',
    'soy(t-3)_sentiment',
    'palm oil(t-3)_sentiment',
    '2m_close(t-3)_crude',
    'sunflower oil_Count(t-3)_sentiment',
]

# Full-model fit. NOTE(review): this is not the last expression of the cell, so
# its summary is not displayed - confirm whether it should be shown.
run_final_regression(df_palm, selected_features_palm)

# Features dropped as insignificant before re-running the final regression.
# NOTE(review): 'Unnamed: 0' is presumably a stray CSV index column - confirm.
remove_list_palm = [
    'sales_on_month_log',
    'soy_Count(t-3)_sentiment',
    'cattle_Count(t-3)_sentiment',
    'coconut oil(t-3)_sentiment',
    'month',
    'ethanol(t-3)_sentiment',
    'Unnamed: 0',
    'rapeseed oil_Count(t-3)_sentiment',
    'sales_frequency(t-3)_cust_log',
    'market(t-3)_sentiment',
    'sales_frequency_over_year(t-3)_cust_log',
    '2m_close(t-3)_oils_log',
    'ethanol_Count(t-3)_sentiment',
    'sunflower oil(t-3)_sentiment',
    'sales_frequency_over_quarter(t-3)_cust_log',
    '2m_openinterest_pctgrowth_2m(t-3)_fx_log',
    '2m_openinterest(t-3)_fx',
    's&p_Count(t-3)_sentiment',
    'coconut oil_Count(t-3)_sentiment',
    'cattle(t-3)_sentiment',
    'palm oil(t-3)_sentiment',
    'sunflower oil_Count(t-3)_sentiment',
    'platinum_Count(t-3)_sentiment',
]

remove_features_regress(df_palm, selected_features_palm, remove_list_palm)

0,1,2,3
Dep. Variable:,sales_quantity_on_month,R-squared:,0.301
Model:,OLS,Adj. R-squared:,0.3
Method:,Least Squares,F-statistic:,349.0
Date:,"Sun, 03 Apr 2022",Prob (F-statistic):,0.0
Time:,13:10:03,Log-Likelihood:,-10809.0
No. Observations:,5677,AIC:,21630.0
Df Residuals:,5669,BIC:,21690.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.8579,0.119,15.630,0.000,1.625,2.091
sales_quantity_on_month(t-3)_cust_log,0.4157,0.019,22.143,0.000,0.379,0.453
commodity(t-3)_sentiment,-1.5592,0.433,-3.603,0.000,-2.408,-0.711
sales_on_month(t-3)_cust_log,-0.6960,0.047,-14.764,0.000,-0.788,-0.604
2m_openinterest_pctgrowth_2m(t-3)_oils,0.1567,0.072,2.191,0.029,0.016,0.297
cumulative_sales_till_month(t-3)_cust_log,0.3647,0.012,29.267,0.000,0.340,0.389
soy(t-3)_sentiment,-0.3069,0.098,-3.134,0.002,-0.499,-0.115
2m_close(t-3)_crude,0.0104,0.002,5.730,0.000,0.007,0.014

0,1,2,3
Omnibus:,26.488,Durbin-Watson:,1.258
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30.585
Skew:,-0.108,Prob(JB):,2.28e-07
Kurtosis:,3.287,Cond. No.,1170.0


In [15]:
# Palm kernel oil
# Compare plain and regularized OLS across all lags, keeping 25 MRMR features
run_regression_return_df(['palm_kernel'], mrmr_features_no=25)

100%|██████████| 25/25 [00:00<00:00, 32.98it/s]
100%|██████████| 25/25 [00:00<00:00, 35.15it/s]
100%|██████████| 25/25 [00:00<00:00, 36.36it/s]
100%|██████████| 25/25 [00:00<00:00, 35.38it/s]


                                          rsquared  features_len     oil_name  \
palm_kernel_6_regOLS_0.2                  0.547898            26  palm_kernel   
palm_kernel_6_regOLS_0.30000000000000004  0.521682            26  palm_kernel   
palm_kernel_6_regOLS_0.4000000000000001   0.500367            26  palm_kernel   
palm_kernel_6_regOLS_0.5000000000000001   0.482797            26  palm_kernel   
palm_kernel_6_OLS                         0.620026            26  palm_kernel   
palm_kernel_3_regOLS_0.2                  0.572213            26  palm_kernel   
palm_kernel_3_regOLS_0.30000000000000004  0.545200            26  palm_kernel   
palm_kernel_3_regOLS_0.4000000000000001   0.523102            26  palm_kernel   
palm_kernel_3_regOLS_0.5000000000000001   0.504764            26  palm_kernel   
palm_kernel_3_OLS                         0.637739            26  palm_kernel   
palm_kernel_2_regOLS_0.2                  0.574708            26  palm_kernel   
palm_kernel_2_regOLS_0.30000

In [16]:
# Palm kernel oil: the 1-month lag performed best with 25 features.
# Load that file and list the 25 features used for the final regression.
df_palm_kernel = pd.read_csv('./quantity/palm_kernel_oil_1m.csv')
selected_features_palm_kernel = [
    'sales_on_month_log',
    'soy_Count(t-1)_sentiment',
    'gas(t-1)_sentiment',
    'rapeseed oil(t-1)_sentiment',
    'Unnamed: 0',
    'corn(t-1)_sentiment',
    'sales_quantity_on_month(t-1)_cust_log',
    '2m_close_pctgrowth_2m(t-1)_crude_log',
    'platinum(t-1)_sentiment',
    'sales_frequency(t-1)_cust_log',
    'silver_Count(t-1)_sentiment',
    'sales_frequency_over_year(t-1)_cust_log',
    's&p(t-1)_sentiment',
    'cattle_Count(t-1)_sentiment',
    'sales_frequency_over_quarter(t-1)_cust_log',
    'rapeseed oil_Count(t-1)_sentiment',
    'coconut oil_Count(t-1)_sentiment',
    'sunflower oil(t-1)_sentiment',
    'cumulative_sales_till_month(t-1)_cust_log',
    'cattle(t-1)_sentiment',
    'sales_on_month(t-1)_cust_log',
    'platinum_Count(t-1)_sentiment',
    'market(t-1)_sentiment',
    'palm oil(t-1)_sentiment',
    'soy(t-1)_sentiment',
]

# Full-model fit. NOTE(review): this is not the last expression of the cell, so
# its summary is not displayed - confirm whether it should be shown.
run_final_regression(df_palm_kernel, selected_features_palm_kernel)

# Features dropped as insignificant before re-running the final regression.
# NOTE(review): 'Unnamed: 0' is presumably a stray CSV index column - confirm.
remove_list_palm_kernel = [
    'sales_on_month_log',
    'soy_Count(t-1)_sentiment',
    'gas(t-1)_sentiment',
    'rapeseed oil(t-1)_sentiment',
    '2m_close_pctgrowth_2m(t-1)_crude_log',
    'platinum(t-1)_sentiment',
    'silver_Count(t-1)_sentiment',
    'sales_frequency_over_year(t-1)_cust_log',
    's&p(t-1)_sentiment',
    'sales_frequency_over_quarter(t-1)_cust_log',
    'rapeseed oil_Count(t-1)_sentiment',
    'coconut oil_Count(t-1)_sentiment',
    'cattle(t-1)_sentiment',
    'platinum_Count(t-1)_sentiment',
    'market(t-1)_sentiment',
    'palm oil(t-1)_sentiment',
    'soy(t-1)_sentiment',
    'Unnamed: 0',
]

remove_features_regress(df_palm_kernel, selected_features_palm_kernel, remove_list_palm_kernel)

0,1,2,3
Dep. Variable:,sales_quantity_on_month,R-squared:,0.41
Model:,OLS,Adj. R-squared:,0.408
Method:,Least Squares,F-statistic:,200.2
Date:,"Sun, 03 Apr 2022",Prob (F-statistic):,9.36e-226
Time:,13:10:10,Log-Likelihood:,-3849.2
No. Observations:,2026,AIC:,7714.0
Df Residuals:,2018,BIC:,7759.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.1009,0.084,13.115,0.000,0.936,1.265
corn(t-1)_sentiment,-0.4138,0.126,-3.283,0.001,-0.661,-0.167
sales_quantity_on_month(t-1)_cust_log,0.5830,0.036,16.327,0.000,0.513,0.653
sales_frequency(t-1)_cust_log,0.1935,0.137,1.410,0.159,-0.076,0.463
cattle_Count(t-1)_sentiment,0.0212,0.010,2.028,0.043,0.001,0.042
sunflower oil(t-1)_sentiment,-0.7852,0.275,-2.852,0.004,-1.325,-0.245
cumulative_sales_till_month(t-1)_cust_log,0.4670,0.046,10.075,0.000,0.376,0.558
sales_on_month(t-1)_cust_log,-1.0985,0.089,-12.377,0.000,-1.273,-0.924

0,1,2,3
Omnibus:,30.841,Durbin-Watson:,1.275
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.893
Skew:,0.252,Prob(JB):,5.32e-07
Kurtosis:,2.704,Cond. No.,44.0


In [17]:
# Soybean oil
# Compare plain and regularized OLS across all lags, keeping 20 MRMR features
run_regression_return_df(['soybean'], mrmr_features_no=20)

100%|██████████| 20/20 [00:00<00:00, 30.20it/s]
100%|██████████| 20/20 [00:00<00:00, 22.35it/s]
100%|██████████| 20/20 [00:00<00:00, 29.40it/s]
100%|██████████| 20/20 [00:00<00:00, 34.48it/s]


                                      rsquared  features_len oil_name  \
soybean_6_regOLS_0.2                  0.361639            21  soybean   
soybean_6_regOLS_0.30000000000000004  0.341153            21  soybean   
soybean_6_regOLS_0.4000000000000001   0.321586            21  soybean   
soybean_6_regOLS_0.5000000000000001   0.303519            21  soybean   
soybean_6_OLS                         0.395056            21  soybean   
soybean_3_regOLS_0.2                  0.363861            21  soybean   
soybean_3_regOLS_0.30000000000000004  0.341156            21  soybean   
soybean_3_regOLS_0.4000000000000001   0.320912            21  soybean   
soybean_3_regOLS_0.5000000000000001   0.302999            21  soybean   
soybean_3_OLS                         0.419073            21  soybean   
soybean_2_regOLS_0.2                  0.366923            21  soybean   
soybean_2_regOLS_0.30000000000000004  0.344490            21  soybean   
soybean_2_regOLS_0.4000000000000001   0.324279     

In [18]:
# Soybean oil: the 1-month lag performed best with 20 features.
# Load that file and list the features used for the final regression.
# NOTE(review): the list below has 19 entries although 20 features were
# requested - confirm whether one name was lost when copying.
df_soybean = pd.read_csv('./quantity/soybean_oil_1m.csv')
selected_features_soybean = [
    'sales_on_month_log',
    '2m_openinterest_pctgrowth_2m(t-1)_crude',
    'gas_Count(t-1)_sentiment',
    'quarter',
    'sales_quantity_on_month(t-1)_cust_log',
    '%sales_on_month_lag_2_log',
    '2m_close_pctgrowth_2m(t-1)_crude_log',
    'rapeseed oil(t-1)_sentiment',
    'gold(t-1)_sentiment',
    'coconut oil_Count(t-1)_sentiment',
    'sales_frequency_over_year(t-1)_cust_log',
    'wheat_Count(t-1)_sentiment',
    'palm oil_Count(t-1)_sentiment',
    'corn(t-1)_sentiment',
    'sales_frequency(t-1)_cust_log',
    'palm oil(t-1)_sentiment',
    '2m_close(t-1)_crude',
    'sales_on_month(t-1)_cust_log',
    '%sales_on_month_log',
]

# Full-model fit. NOTE(review): this is not the last expression of the cell, so
# its summary is not displayed - confirm whether it should be shown.
run_final_regression(df_soybean, selected_features_soybean)

# Features dropped as insignificant before re-running the final regression.
remove_list_soybean = [
    'sales_on_month_log',
    '2m_openinterest_pctgrowth_2m(t-1)_crude',
    'gas_Count(t-1)_sentiment',
    'quarter',
    '%sales_on_month_lag_2_log',
    '2m_close_pctgrowth_2m(t-1)_crude_log',
    'rapeseed oil(t-1)_sentiment',
    'gold(t-1)_sentiment',
    'coconut oil_Count(t-1)_sentiment',
    'sales_frequency_over_year(t-1)_cust_log',
    'wheat_Count(t-1)_sentiment',
    'palm oil_Count(t-1)_sentiment',
    'corn(t-1)_sentiment',
    'palm oil(t-1)_sentiment',
    '2m_close(t-1)_crude',
    '%sales_on_month_log',
]

remove_features_regress(df_soybean, selected_features_soybean, remove_list_soybean)

0,1,2,3
Dep. Variable:,sales_quantity_on_month,R-squared:,0.273
Model:,OLS,Adj. R-squared:,0.272
Method:,Least Squares,F-statistic:,269.6
Date:,"Sun, 03 Apr 2022",Prob (F-statistic):,1.59e-148
Time:,13:10:21,Log-Likelihood:,-3728.5
No. Observations:,2153,AIC:,7465.0
Df Residuals:,2149,BIC:,7488.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.5537,0.052,68.255,0.000,3.452,3.656
sales_quantity_on_month(t-1)_cust_log,0.4171,0.022,18.540,0.000,0.373,0.461
sales_frequency(t-1)_cust_log,0.7220,0.056,12.853,0.000,0.612,0.832
sales_on_month(t-1)_cust_log,-0.8428,0.070,-12.113,0.000,-0.979,-0.706

0,1,2,3
Omnibus:,161.987,Durbin-Watson:,1.101
Prob(Omnibus):,0.0,Jarque-Bera (JB):,236.478
Skew:,0.606,Prob(JB):,4.4599999999999995e-52
Kurtosis:,4.079,Cond. No.,10.2


In [19]:
# Coconut oil
# Compare plain and regularized OLS across all lags, keeping 20 MRMR features
run_regression_return_df(['coconut'], mrmr_features_no=20)

100%|██████████| 20/20 [00:00<00:00, 31.49it/s]
100%|██████████| 20/20 [00:00<00:00, 29.11it/s]
100%|██████████| 20/20 [00:00<00:00, 32.47it/s]
100%|██████████| 20/20 [00:00<00:00, 33.33it/s]


                                      rsquared  features_len oil_name  \
coconut_6_regOLS_0.2                  0.609298            21  coconut   
coconut_6_regOLS_0.30000000000000004  0.601675            21  coconut   
coconut_6_regOLS_0.4000000000000001   0.595995            21  coconut   
coconut_6_regOLS_0.5000000000000001   0.591473            21  coconut   
coconut_6_OLS                         0.634423            21  coconut   
coconut_3_regOLS_0.2                  0.637347            21  coconut   
coconut_3_regOLS_0.30000000000000004  0.630078            21  coconut   
coconut_3_regOLS_0.4000000000000001   0.624797            21  coconut   
coconut_3_regOLS_0.5000000000000001   0.620723            21  coconut   
coconut_3_OLS                         0.661403            21  coconut   
coconut_2_regOLS_0.2                  0.477308            21  coconut   
coconut_2_regOLS_0.30000000000000004  0.477156            21  coconut   
coconut_2_regOLS_0.4000000000000001   0.477018     

In [25]:
# Coconut oil: the 3-month lag performed best with 20 features.
# Load that file and list the 20 features used for the final regression.
# NOTE(review): 'sales_quantity_on_month.1' looks like a duplicated target
# column (pandas appends '.1' to repeated CSV headers) - confirm; it is
# dropped again in the reduced model below.
df_coconut = pd.read_csv('./quantity/coconut_oil_3m.csv')
selected_features_coconut = [
    'sales_quantity_on_month.1',
    'gold_Count(t-3)_sentiment',
    'energy(t-3)_sentiment',
    'sales_on_month',
    'sales_quantity_on_month(t-3)_cust',
    'cattle(t-3)_sentiment',
    'energy_Count(t-3)_sentiment',
    'gas(t-3)_sentiment',
    'sales_on_month_log',
    'gold(t-3)_sentiment',
    'rapeseed oil(t-3)_sentiment',
    'sales_frequency_over_quarter(t-3)_cust',
    'crude oil(t-3)_sentiment',
    'sales_frequency(t-3)_cust',
    'crude oil_Count(t-3)_sentiment',
    'sales_on_month(t-3)_cust',
    'grains(t-3)_sentiment',
    'coconut oil_Count(t-3)_sentiment',
    'sales_frequency_over_year(t-3)_cust',
    'soy(t-3)_sentiment',
]

# Full-model fit. NOTE(review): this is not the last expression of the cell, so
# its summary is not displayed - confirm whether it should be shown.
run_final_regression(df_coconut, selected_features_coconut)

# Features dropped as insignificant before re-running the final regression.
remove_list_coconut = [
    'sales_quantity_on_month.1',
    'sales_on_month',
    'sales_on_month_log',
    'gold_Count(t-3)_sentiment',
    'energy(t-3)_sentiment',
    'cattle(t-3)_sentiment',
    'energy_Count(t-3)_sentiment',
    'gold(t-3)_sentiment',
    'rapeseed oil(t-3)_sentiment',
    'sales_frequency_over_quarter(t-3)_cust',
    'crude oil(t-3)_sentiment',
    'crude oil_Count(t-3)_sentiment',
    'grains(t-3)_sentiment',
    'coconut oil_Count(t-3)_sentiment',
    'sales_frequency_over_year(t-3)_cust',
    'soy(t-3)_sentiment',
]

remove_features_regress(df_coconut, selected_features_coconut, remove_list_coconut)

0,1,2,3
Dep. Variable:,sales_quantity_on_month,R-squared:,0.221
Model:,OLS,Adj. R-squared:,0.22
Method:,Least Squares,F-statistic:,183.0
Date:,"Sun, 03 Apr 2022",Prob (F-statistic):,3.6399999999999996e-138
Time:,13:18:56,Log-Likelihood:,-18655.0
No. Observations:,2582,AIC:,37320.0
Df Residuals:,2577,BIC:,37350.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,79.3419,10.546,7.524,0.000,58.663,100.021
sales_quantity_on_month(t-3)_cust,0.5035,0.026,19.200,0.000,0.452,0.555
gas(t-3)_sentiment,102.6287,34.852,2.945,0.003,34.289,170.969
sales_frequency(t-3)_cust,30.2779,2.036,14.868,0.000,26.285,34.271
sales_on_month(t-3)_cust,-15.1628,1.392,-10.890,0.000,-17.893,-12.433

0,1,2,3
Omnibus:,2303.479,Durbin-Watson:,1.75
Prob(Omnibus):,0.0,Jarque-Bera (JB):,106425.575
Skew:,4.079,Prob(JB):,0.0
Kurtosis:,33.376,Cond. No.,1970.0


In [21]:
# Rapeseed oil
# Compare plain and regularized OLS across all lags, keeping 25 MRMR features
run_regression_return_df(['rape'], mrmr_features_no=25)

100%|██████████| 25/25 [00:00<00:00, 29.01it/s]
100%|██████████| 25/25 [00:00<00:00, 31.69it/s]
100%|██████████| 25/25 [00:00<00:00, 31.87it/s]
100%|██████████| 25/25 [00:01<00:00, 21.06it/s]


                                   rsquared  features_len oil_name  time_lag  \
rape_6_regOLS_0.2                  0.352918            26     rape         6   
rape_6_regOLS_0.30000000000000004  0.338300            26     rape         6   
rape_6_regOLS_0.4000000000000001   0.323940            26     rape         6   
rape_6_regOLS_0.5000000000000001   0.310357            26     rape         6   
rape_6_OLS                         0.378344            26     rape         6   
rape_3_regOLS_0.2                  0.369471            26     rape         3   
rape_3_regOLS_0.30000000000000004  0.354644            26     rape         3   
rape_3_regOLS_0.4000000000000001   0.340203            26     rape         3   
rape_3_regOLS_0.5000000000000001   0.326653            26     rape         3   
rape_3_OLS                         0.393647            26     rape         3   
rape_2_regOLS_0.2                  0.366739            26     rape         2   
rape_2_regOLS_0.30000000000000004  0.353

In [22]:
# Rapeseed oil: the 1-month lag performed best with 25 features.
# Load that file and list the 25 features used for the final regression.
df_rapeseed = pd.read_csv('./quantity/rape_oil_1m.csv')
selected_features_rapeseed = [
    'sales_on_month_log',
    'gold_Count(t-1)_sentiment',
    'energy(t-1)_sentiment',
    'cattle(t-1)_sentiment',
    'sales_quantity_on_month(t-1)_cust_log',
    'gas(t-1)_sentiment',
    'wheat(t-1)_sentiment',
    'soy_Count(t-1)_sentiment',
    'sales_frequency(t-1)_cust_log',
    'rapeseed oil(t-1)_sentiment',
    'gold(t-1)_sentiment',
    'crude oil(t-1)_sentiment',
    'Unnamed: 0',
    'sales_frequency_over_year(t-1)_cust_log',
    '2m_close_pctgrowth_2m(t-1)_crude_log',
    'commodity_Count(t-1)_sentiment',
    'platinum(t-1)_sentiment',
    'sunflower oil(t-1)_sentiment',
    'palm oil(t-1)_sentiment',
    'sales_frequency_over_quarter(t-1)_cust_log',
    'rapeseed oil_Count(t-1)_sentiment',
    'market(t-1)_sentiment',
    'ethanol(t-1)_sentiment',
    '%sales_on_month_lag_2_log',
    'sales_on_month(t-1)_cust_log',
]

# Full-model fit. NOTE(review): this is not the last expression of the cell, so
# its summary is not displayed - confirm whether it should be shown.
run_final_regression(df_rapeseed, selected_features_rapeseed)

# Features dropped as insignificant before re-running the final regression.
# NOTE(review): 'Unnamed: 0' is presumably a stray CSV index column - confirm.
remove_list_rapeseed = [
    'sales_on_month_log',
    'gold_Count(t-1)_sentiment',
    'cattle(t-1)_sentiment',
    'wheat(t-1)_sentiment',
    'soy_Count(t-1)_sentiment',
    'sales_frequency(t-1)_cust_log',
    'rapeseed oil(t-1)_sentiment',
    'gold(t-1)_sentiment',
    'crude oil(t-1)_sentiment',
    'Unnamed: 0',
    '2m_close_pctgrowth_2m(t-1)_crude_log',
    'commodity_Count(t-1)_sentiment',
    'platinum(t-1)_sentiment',
    'sunflower oil(t-1)_sentiment',
    'palm oil(t-1)_sentiment',
    'sales_frequency_over_quarter(t-1)_cust_log',
    'rapeseed oil_Count(t-1)_sentiment',
    'market(t-1)_sentiment',
    'ethanol(t-1)_sentiment',
]

remove_features_regress(df_rapeseed, selected_features_rapeseed, remove_list_rapeseed)

0,1,2,3
Dep. Variable:,sales_quantity_on_month,R-squared:,0.293
Model:,OLS,Adj. R-squared:,0.292
Method:,Least Squares,F-statistic:,293.4
Date:,"Sun, 03 Apr 2022",Prob (F-statistic):,2.53e-315
Time:,13:10:52,Log-Likelihood:,-8112.5
No. Observations:,4257,AIC:,16240.0
Df Residuals:,4250,BIC:,16280.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.9403,0.053,74.617,0.000,3.837,4.044
energy(t-1)_sentiment,0.2390,0.110,2.171,0.030,0.023,0.455
sales_quantity_on_month(t-1)_cust_log,0.5127,0.017,30.886,0.000,0.480,0.545
gas(t-1)_sentiment,0.1757,0.132,1.333,0.183,-0.083,0.434
sales_frequency_over_year(t-1)_cust_log,0.7713,0.038,20.386,0.000,0.697,0.846
%sales_on_month_lag_2_log,-6.1302,0.780,-7.856,0.000,-7.660,-4.600
sales_on_month(t-1)_cust_log,-1.1952,0.053,-22.635,0.000,-1.299,-1.092

0,1,2,3
Omnibus:,62.496,Durbin-Watson:,1.169
Prob(Omnibus):,0.0,Jarque-Bera (JB):,84.75
Skew:,0.19,Prob(JB):,3.95e-19
Kurtosis:,3.577,Cond. No.,135.0


In [23]:
# Sunflower oil
# Compare plain and regularized OLS across all lags, keeping 25 MRMR features
run_regression_return_df(['sunflower'], mrmr_features_no=25)

100%|██████████| 25/25 [00:00<00:00, 28.06it/s]
100%|██████████| 25/25 [00:01<00:00, 14.77it/s]
100%|██████████| 25/25 [00:00<00:00, 29.19it/s]
100%|██████████| 25/25 [00:00<00:00, 27.44it/s]


                                        rsquared  features_len   oil_name  \
sunflower_6_regOLS_0.2                  0.409492            26  sunflower   
sunflower_6_regOLS_0.30000000000000004  0.385129            26  sunflower   
sunflower_6_regOLS_0.4000000000000001   0.364255            26  sunflower   
sunflower_6_regOLS_0.5000000000000001   0.346344            26  sunflower   
sunflower_6_OLS                         0.459986            26  sunflower   
sunflower_3_regOLS_0.2                  0.416003            26  sunflower   
sunflower_3_regOLS_0.30000000000000004  0.391379            26  sunflower   
sunflower_3_regOLS_0.4000000000000001   0.370232            26  sunflower   
sunflower_3_regOLS_0.5000000000000001   0.352092            26  sunflower   
sunflower_3_OLS                         0.464679            26  sunflower   
sunflower_2_regOLS_0.2                  0.431527            26  sunflower   
sunflower_2_regOLS_0.30000000000000004  0.405420            26  sunflower   

In [24]:
# Sunflower oil: the 1-month lag performed best with 25 features.
# Load that file and list the 25 features used for the final regression.
df_sunflower = pd.read_csv('./quantity/sunflower_oil_1m.csv')
selected_features_sunflower = [
    'sales_on_month_log',
    '2m_close(t-1)_crude',
    'ethanol(t-1)_sentiment',
    'platinum_Count(t-1)_sentiment',
    'sales_quantity_on_month(t-1)_cust_log',
    'palm oil(t-1)_sentiment',
    'commodity(t-1)_sentiment',
    'rapeseed oil_Count(t-1)_sentiment',
    'Unnamed: 0',
    'sunflower oil_Count(t-1)_sentiment',
    '%sales_on_month_lag_2_log',
    'market(t-1)_sentiment',
    'sales_frequency(t-1)_cust_log',
    'sunflower oil(t-1)_sentiment',
    'gas(t-1)_sentiment',
    'silver_Count(t-1)_sentiment',
    'sales_frequency_over_quarter(t-1)_cust_log',
    'rapeseed oil(t-1)_sentiment',
    'coconut oil_Count(t-1)_sentiment',
    'sales_on_month(t-1)_cust_log',
    'platinum(t-1)_sentiment',
    'cattle(t-1)_sentiment',
    'sales_frequency_over_year(t-1)_cust_log',
    '2m_close(t-1)_oils_log',
    'soy(t-1)_sentiment',
]

# Full-model fit. NOTE(review): this is not the last expression of the cell, so
# its summary is not displayed - confirm whether it should be shown.
run_final_regression(df_sunflower, selected_features_sunflower)

# Features dropped as insignificant before re-running the final regression.
# NOTE(review): 'Unnamed: 0' is presumably a stray CSV index column - confirm.
remove_list_sunflower = [
    'sales_on_month_log',
    '2m_close(t-1)_crude',
    'ethanol(t-1)_sentiment',
    'platinum_Count(t-1)_sentiment',
    'commodity(t-1)_sentiment',
    'rapeseed oil_Count(t-1)_sentiment',
    'Unnamed: 0',
    'sunflower oil_Count(t-1)_sentiment',
    'market(t-1)_sentiment',
    'sales_frequency(t-1)_cust_log',
    'sunflower oil(t-1)_sentiment',
    'silver_Count(t-1)_sentiment',
    'sales_frequency_over_quarter(t-1)_cust_log',
    'coconut oil_Count(t-1)_sentiment',
    'platinum(t-1)_sentiment',
    'cattle(t-1)_sentiment',
    'sales_frequency_over_year(t-1)_cust_log',
    'palm oil(t-1)_sentiment',
    'soy(t-1)_sentiment',
]

remove_features_regress(df_sunflower, selected_features_sunflower, remove_list_sunflower)

0,1,2,3
Dep. Variable:,sales_quantity_on_month,R-squared:,0.162
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,180.1
Date:,"Sun, 03 Apr 2022",Prob (F-statistic):,3.1100000000000004e-210
Time:,13:11:14,Log-Likelihood:,-11409.0
No. Observations:,5597,AIC:,22830.0
Df Residuals:,5590,BIC:,22880.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.4326,0.797,-1.797,0.072,-2.996,0.131
sales_quantity_on_month(t-1)_cust_log,0.5462,0.019,28.830,0.000,0.509,0.583
%sales_on_month_lag_2_log,-1.9515,0.668,-2.921,0.004,-3.261,-0.642
gas(t-1)_sentiment,0.1596,0.133,1.204,0.229,-0.100,0.419
rapeseed oil(t-1)_sentiment,0.2003,0.137,1.459,0.145,-0.069,0.469
sales_on_month(t-1)_cust_log,-0.8438,0.049,-17.280,0.000,-0.940,-0.748
2m_close(t-1)_oils_log,0.7561,0.118,6.400,0.000,0.524,0.988

0,1,2,3
Omnibus:,34.597,Durbin-Watson:,1.042
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34.487
Skew:,0.179,Prob(JB):,3.25e-08
Kurtosis:,2.861,Cond. No.,232.0
