In [1]:
# Basic Packages
import pandas as pd
import numpy as np
import scipy
import itertools
from collections import ChainMap

# OLS and 2SLS
import statsmodels
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.api import add_constant

# Linear Models Packages
import linearmodels.iv as iv
from linearmodels.iv import IV2SLS
from linearmodels.iv import compare
from linearmodels import PanelOLS

# Graph Packages
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

#Import MRMR
from mrmr import mrmr_classif

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Import dataframes
def read_oil_files(oil_list, chosen_lag):
    store_dict = {}
    
    for oil_file in oil_list:
        for lag in chosen_lag:
#             file_path = '/content/drive/MyDrive/5102:_Cargill_Stuff/Regression/' + oil_file + '_oil_' + str(lag) + 'm.csv'
            file_path = './price/' + oil_file + '_oil_' + str(lag) + 'm.csv' 
            df_name = oil_file + '_oil_' + str(lag) + 'm_df'
            
            store_dict[(df_name, oil_file, lag)] = pd.read_csv(file_path)
    return store_dict

In [4]:
def extract_variable_values(data_dict, target_var, csv_list):
    var_dict = {}
    col_to_drop = [target_var]
    for csv in csv_list:
        df = data_dict[csv]
        y = df[target_var]
        x = df.drop(columns = col_to_drop)
        var_dict[csv] = [x, y]
    
    return var_dict

In [5]:
def extract_variable_values_mrmr(data_dict, target_var, csv_list, col_to_keep):
    var_dict = {}
    for i in range(len(csv_list)):
        df = data_dict[csv_list[i]]
        y = df[target_var]
        x = df[col_to_keep[i]]
        var_dict[csv_list[i]] = [x, y]
    
    return var_dict

In [6]:
def get_mrmr_features(selected_data_X, selected_y, selected_K):
    selected_features = mrmr_classif(selected_data_X, selected_y, selected_K)
    
    return selected_features

In [7]:
def run_normal_regression(x, y, oil_name, lag_time):
    result_dict = {}
    x_ols = sm.add_constant(x)
    model = sm.OLS(y,x_ols)
    results_ols = model.fit()
    row_name = oil_name + '_' + str(lag_time) + '_OLS'

      #result_list in order --> r-squared, no. of params, oil name, lag time, regularization (Y/N), alpha
    result_list = [results_ols.rsquared_adj, len(results_ols.params), oil_name, lag_time, False, 0]
    result_dict[row_name] = result_list
    return (result_dict, results_ols.params)

In [8]:
def run_regularized_regression(x, y, oil_name, lag_time, ols_params):
    result_dict = {}
    count = 0
    alpha_range = np.arange(0.2, 0.6, 0.1).tolist()

    x_ols = sm.add_constant(x)
    model = sm.OLS(y,x_ols)
    model_fit = model.fit()

    for alpha in alpha_range:
        row_name = oil_name + '_' + str(lag_time) + '_' + 'regOLS_' +str(alpha)
        results_reg_ols = model.fit_regularized(L1_wt=0, alpha=alpha, start_params=ols_params)
        results_reg_ols_fit = sm.regression.linear_model.OLSResults(model, results_reg_ols.params, model_fit.normalized_cov_params)
        count += 1

        #result_list in order --> r-squared, no. of params, oil name, lag time, regularization (Y/N), alpha
        result_list = [results_reg_ols_fit.rsquared_adj, len(results_reg_ols.params), oil_name, lag_time, True, alpha]

        result_dict[row_name] = result_list
    return result_dict

In [9]:
def aggregate_all_fuction(oil_list, time_lag, feature_selection, mrmr_features_no = 42):
    final_ols_dict = {}

    data_dict_clean = read_oil_files(oil_list, time_lag)

    #data_dict_clean = read_oil_files(oil_list, time_lag)
    csv_list = list(data_dict_clean.keys())
    
    if feature_selection == 'Y':
        selected_features = []
        var_extracted = extract_variable_values(data_dict_clean, '2m_close(t-0)_oils', csv_list)
        
        for csv in csv_list:
            features = get_mrmr_features(var_extracted[csv][0], var_extracted[csv][1], mrmr_features_no)
            selected_features.append(features)

        var = extract_variable_values_mrmr(data_dict_clean, '2m_close(t-0)_oils', csv_list, selected_features)
        
    elif feature_selection == 'N':
        var = extract_variable_values(data_dict_clean, '2m_close(t-0)_oils', csv_list)

    for csv in csv_list:
        normal_ols_result = run_normal_regression(var[csv][0], var[csv][1], csv[1], csv[2])
        reg_ols_result = run_regularized_regression(var[csv][0], var[csv][1], csv[1], csv[2], normal_ols_result[1])
        final_ols_result = ChainMap(normal_ols_result[0], reg_ols_result)
        final_ols_dict = ChainMap(final_ols_dict, final_ols_result)
    return final_ols_dict

In [10]:
def run_final_regression(df, selected_features):
    y = df['2m_close(t-0)_oils']
    x = df[selected_features]

    x_ols = sm.add_constant(x)
    model = sm.OLS(y,x_ols)
    results_ols = model.fit()

    return (results_ols.summary())

In [11]:
def remove_features_regress(df, selected_list, remove_list):
    final_selected = []
    for features in selected_list:
        if features not in remove_list:
            final_selected.append(features)
    return (run_final_regression(df, final_selected))

In [12]:
def run_regression_return_df(oil_list, mrmr_features_no = 42):
    time_lag = [1, 2, 3, 6]

    final_df_feature_selection = pd.DataFrame.from_dict(aggregate_all_fuction(oil_list, time_lag, 'Y', mrmr_features_no), orient='index', columns=['rsquared', 'features_len', 'oil_name', 'time_lag', 'regularization', 'alpha'])
    final_df_no_feature_selection = pd.DataFrame.from_dict(aggregate_all_fuction(oil_list, time_lag, 'N', mrmr_features_no), orient='index', columns=['rsquared', 'features_len', 'oil_name', 'time_lag', 'regularization', 'alpha'])
    
    print(final_df_feature_selection)
    print(final_df_no_feature_selection)
    
    return None

In [13]:
#Palm Oil
#Running palm oil with 15 MRMR features
run_regression_return_df(['palm'], 15)

100%|██████████| 15/15 [00:00<00:00, 35.48it/s]
100%|██████████| 15/15 [00:00<00:00, 39.13it/s]
100%|██████████| 15/15 [00:00<00:00, 39.03it/s]
100%|██████████| 15/15 [00:00<00:00, 39.82it/s]


                                   rsquared  features_len oil_name  time_lag  \
palm_6_regOLS_0.2                  0.289757            16     palm         6   
palm_6_regOLS_0.30000000000000004  0.267112            16     palm         6   
palm_6_regOLS_0.4000000000000001   0.248114            16     palm         6   
palm_6_regOLS_0.5000000000000001   0.231006            16     palm         6   
palm_6_OLS                         0.820391            16     palm         6   
palm_3_regOLS_0.2                  0.231487            16     palm         3   
palm_3_regOLS_0.30000000000000004  0.208411            16     palm         3   
palm_3_regOLS_0.4000000000000001   0.191852            16     palm         3   
palm_3_regOLS_0.5000000000000001   0.178105            16     palm         3   
palm_3_OLS                         0.942778            16     palm         3   
palm_2_regOLS_0.2                  0.095379            16     palm         2   
palm_2_regOLS_0.30000000000000004 -0.204

In [14]:
#Palm oil 2 months best with 15 features
#Run final regression code
df_palm = pd.read_csv('./price/palm_oil_2m.csv')
selected_features_palm = ['ethanol(t-2)_sentiment', 'cattle(t-2)_sentiment', 'silver(t-2)_sentiment', '2m_close_pctgrowth_2m(t-0)_oils', '2m_close(t-0)_crude', '2m_openinterest(t-0)_crude', '2m_openinterest_pctgrowth_2m(t-0)_crude', 'quarter', '2m_openinterest(t-0)_oils_log', '2m_openinterest_pctgrowth_2m(t-0)_oils_log', '2m_close_pctgrowth_2m(t-0)_crude_log', '2m_close_pctgrowth_2m(t-2)_crude_log', '2m_close(t-2)_oils_log', '2m_close(t-2)_crude', '2m_close_pctgrowth_2m(t-2)_oils']

run_final_regression(df_palm, selected_features_palm)

#Remove insignificant variables and re run final regression
remove_list_palm = ['ethanol(t-2)_sentiment', 'cattle(t-2)_sentiment', 'silver(t-2)_sentiment', '2m_close(t-0)_crude',
                   '2m_openinterest(t-0)_crude', '2m_openinterest(t-0)_oils_log', '2m_openinterest_pctgrowth_2m(t-0)_oils_log',
                   '2m_close_pctgrowth_2m(t-0)_crude_log', '2m_close_pctgrowth_2m(t-2)_crude_log', '2m_close(t-2)_crude', 'quarter', '2m_close_pctgrowth_2m(t-2)_oils']

remove_features_regress(df_palm, selected_features_palm, remove_list_palm)

0,1,2,3
Dep. Variable:,2m_close(t-0)_oils,R-squared:,0.98
Model:,OLS,Adj. R-squared:,0.979
Method:,Least Squares,F-statistic:,659.9
Date:,"Sat, 02 Apr 2022",Prob (F-statistic):,4.39e-34
Time:,02:39:08,Log-Likelihood:,85.016
No. Observations:,44,AIC:,-162.0
Df Residuals:,40,BIC:,-154.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6753,0.178,3.797,0.000,0.316,1.035
2m_close_pctgrowth_2m(t-0)_oils,0.9720,0.059,16.356,0.000,0.852,1.092
2m_openinterest_pctgrowth_2m(t-0)_crude,-0.0227,0.010,-2.254,0.030,-0.043,-0.002
2m_close(t-2)_oils_log,0.8988,0.028,31.801,0.000,0.842,0.956

0,1,2,3
Omnibus:,55.128,Durbin-Watson:,1.759
Prob(Omnibus):,0.0,Jarque-Bera (JB):,401.591
Skew:,2.951,Prob(JB):,6.25e-88
Kurtosis:,16.572,Cond. No.,217.0


In [15]:
#Palm Kernel Oil
#Running palm oil with 42 MRMR features
run_regression_return_df(['palm_kernel'], 10)

100%|██████████| 10/10 [00:00<00:00, 40.18it/s]
100%|██████████| 10/10 [00:00<00:00, 39.55it/s]
100%|██████████| 10/10 [00:00<00:00, 30.48it/s]
100%|██████████| 10/10 [00:00<00:00, 40.54it/s]


                                           rsquared  features_len  \
palm_kernel_6_regOLS_0.2                 -15.369086            11   
palm_kernel_6_regOLS_0.30000000000000004 -17.783734            11   
palm_kernel_6_regOLS_0.4000000000000001  -19.298835            11   
palm_kernel_6_regOLS_0.5000000000000001  -20.353690            11   
palm_kernel_6_OLS                          0.371494            11   
palm_kernel_3_regOLS_0.2                 -13.039511            11   
palm_kernel_3_regOLS_0.30000000000000004 -14.749300            11   
palm_kernel_3_regOLS_0.4000000000000001  -15.759221            11   
palm_kernel_3_regOLS_0.5000000000000001  -16.427745            11   
palm_kernel_3_OLS                          0.657194            11   
palm_kernel_2_regOLS_0.2                 -13.318783            11   
palm_kernel_2_regOLS_0.30000000000000004 -14.998130            11   
palm_kernel_2_regOLS_0.4000000000000001  -15.963157            11   
palm_kernel_2_regOLS_0.50000000000

In [16]:
#Palm Kernel oil 1 months best with 10 features
#Run final regression code
df_palm_kernel = pd.read_csv('./price/palm_kernel_oil_1m.csv')
selected_features_palm_kernel = ['ethanol(t-1)_sentiment', 'silver(t-1)_sentiment', 'silver_Count(t-1)_sentiment', 'quarter', '2m_close(t-1)_oils_log', '2m_close(t-0)_crude', 'rapeseed oil_Count(t-1)_sentiment', 'cattle(t-1)_sentiment', '2m_close(t-1)_crude', 'coconut oil_Count(t-1)_sentiment']

run_final_regression(df_palm_kernel, selected_features_palm_kernel)

#Remove insignificant variables and re run final regression
remove_list_palm_kernel = ['ethanol(t-1)_sentiment', 'silver(t-1)_sentiment', 'silver_Count(t-1)_sentiment',
                          'quarter', 'rapeseed oil_Count(t-1)_sentiment', 'cattle(t-1)_sentiment', 'coconut oil_Count(t-1)_sentiment']

remove_features_regress(df_palm_kernel, selected_features_palm_kernel, remove_list_palm_kernel)

0,1,2,3
Dep. Variable:,2m_close(t-0)_oils,R-squared:,0.902
Model:,OLS,Adj. R-squared:,0.895
Method:,Least Squares,F-statistic:,125.7
Date:,"Sat, 02 Apr 2022",Prob (F-statistic):,1.04e-20
Time:,02:39:10,Log-Likelihood:,36.383
No. Observations:,45,AIC:,-64.77
Df Residuals:,41,BIC:,-57.54
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0261,0.456,-0.057,0.955,-0.947,0.895
2m_close(t-1)_oils_log,1.0037,0.060,16.682,0.000,0.882,1.125
2m_close(t-0)_crude,0.0071,0.003,2.398,0.021,0.001,0.013
2m_close(t-1)_crude,-0.0069,0.003,-2.334,0.025,-0.013,-0.001

0,1,2,3
Omnibus:,43.333,Durbin-Watson:,1.549
Prob(Omnibus):,0.0,Jarque-Bera (JB):,175.422
Skew:,2.393,Prob(JB):,8.08e-39
Kurtosis:,11.406,Cond. No.,2290.0


In [17]:
#Soybean Oil
#Running palm oil with 10 MRMR features
run_regression_return_df(['soybean'], 10)

100%|██████████| 10/10 [00:00<00:00, 34.81it/s]
100%|██████████| 10/10 [00:00<00:00, 23.90it/s]
100%|██████████| 10/10 [00:00<00:00, 30.99it/s]
100%|██████████| 10/10 [00:00<00:00, 36.10it/s]


                                      rsquared  features_len oil_name  \
soybean_6_regOLS_0.2                  0.015335            11  soybean   
soybean_6_regOLS_0.30000000000000004 -0.339477            11  soybean   
soybean_6_regOLS_0.4000000000000001  -0.670054            11  soybean   
soybean_6_regOLS_0.5000000000000001  -0.969136            11  soybean   
soybean_6_OLS                         0.821752            11  soybean   
soybean_3_regOLS_0.2                  0.236112            11  soybean   
soybean_3_regOLS_0.30000000000000004 -0.130580            11  soybean   
soybean_3_regOLS_0.4000000000000001  -0.468756            11  soybean   
soybean_3_regOLS_0.5000000000000001  -0.772582            11  soybean   
soybean_3_OLS                         0.965792            11  soybean   
soybean_2_regOLS_0.2                  0.290601            11  soybean   
soybean_2_regOLS_0.30000000000000004 -0.105933            11  soybean   
soybean_2_regOLS_0.4000000000000001  -0.466733     

In [18]:
#Soybean oil 2 months best with 10 features
#Run final regression code
df_soybean = pd.read_csv('./price/soybean_oil_2m.csv')
selected_features_soybean = ['ethanol(t-2)_sentiment', '2m_openinterest(t-0)_oils', '2m_close_pctgrowth_2m(t-0)_oils', '2m_openinterest_pctgrowth_2m(t-0)_oils', '2m_close(t-2)_oils_log', 'month', '2m_close_pctgrowth_2m(t-2)_crude_log', '2m_close(t-0)_crude', '2m_close(t-2)_crude']

run_final_regression(df_soybean, selected_features_soybean)

#Remove insignificant variables and re run final regression
remove_list_soybean = ['ethanol(t-2)_sentiment', '2m_openinterest(t-0)_oils', '2m_openinterest_pctgrowth_2m(t-0)_oils',
                      'month', '2m_close_pctgrowth_2m(t-2)_crude_log', '2m_close(t-0)_crude', '2m_close(t-2)_crude']

remove_features_regress(df_soybean, selected_features_soybean, remove_list_soybean)

0,1,2,3
Dep. Variable:,2m_close(t-0)_oils,R-squared:,0.995
Model:,OLS,Adj. R-squared:,0.994
Method:,Least Squares,F-statistic:,3805.0
Date:,"Sat, 02 Apr 2022",Prob (F-statistic):,2.79e-47
Time:,02:39:12,Log-Likelihood:,106.22
No. Observations:,44,AIC:,-206.4
Df Residuals:,41,BIC:,-201.1
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2432,0.045,5.362,0.000,0.152,0.335
2m_close_pctgrowth_2m(t-0)_oils,0.9240,0.035,26.037,0.000,0.852,0.996
2m_close(t-2)_oils_log,0.9302,0.013,72.908,0.000,0.904,0.956

0,1,2,3
Omnibus:,7.334,Durbin-Watson:,1.953
Prob(Omnibus):,0.026,Jarque-Bera (JB):,6.218
Skew:,0.759,Prob(JB):,0.0447
Kurtosis:,4.044,Cond. No.,53.6


In [19]:
#Rapeseed Oil
#Running rapeseed oil with 15 MRMR features
run_regression_return_df(['rape'], 10)

100%|██████████| 10/10 [00:00<00:00, 34.80it/s]
100%|██████████| 10/10 [00:00<00:00, 37.63it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


                                      rsquared  features_len oil_name  \
rape_6_regOLS_0.2                         -inf             1     rape   
rape_6_regOLS_0.30000000000000004         -inf             1     rape   
rape_6_regOLS_0.4000000000000001          -inf             1     rape   
rape_6_regOLS_0.5000000000000001          -inf             1     rape   
rape_6_OLS                                -inf             1     rape   
rape_3_regOLS_0.2                         -inf             1     rape   
rape_3_regOLS_0.30000000000000004         -inf             1     rape   
rape_3_regOLS_0.4000000000000001          -inf             1     rape   
rape_3_regOLS_0.5000000000000001          -inf             1     rape   
rape_3_OLS                                -inf             1     rape   
rape_2_regOLS_0.2                  -359.079460            11     rape   
rape_2_regOLS_0.30000000000000004  -761.258197            11     rape   
rape_2_regOLS_0.4000000000000001  -1296.281536     

In [20]:
#Rapeseed oil 1 month best with 10 features
#Run final regression code
df_rapeseed = pd.read_csv('./price/rape_oil_1m.csv')
selected_features_rapeseed = ['2m_close(t-1)_oils_log', 'coconut oil(t-1)_sentiment', 'ethanol_Count(t-1)_sentiment', 'energy(t-1)_sentiment', 'rapeseed oil(t-1)_sentiment', 'corn(t-1)_sentiment', '2m_close(t-1)_fx', 'palm oil(t-1)_sentiment', 'market(t-1)_sentiment', 'sunflower oil_Count(t-1)_sentiment']

run_final_regression(df_rapeseed, selected_features_rapeseed)

#Remove insignificant variables and re run final regression
remove_list_rapeseed = ['coconut oil(t-1)_sentiment', 'energy(t-1)_sentiment', 'corn(t-1)_sentiment', '2m_close(t-1)_fx',
                       'market(t-1)_sentiment', 'sunflower oil_Count(t-1)_sentiment']

remove_features_regress(df_rapeseed, selected_features_rapeseed, remove_list_rapeseed)

0,1,2,3
Dep. Variable:,2m_close(t-0)_oils,R-squared:,0.753
Model:,OLS,Adj. R-squared:,0.727
Method:,Least Squares,F-statistic:,29.7
Date:,"Sat, 02 Apr 2022",Prob (F-statistic):,2.29e-11
Time:,02:39:13,Log-Likelihood:,194.37
No. Observations:,44,AIC:,-378.7
Df Residuals:,39,BIC:,-369.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.6394,0.449,5.883,0.000,1.732,3.547
2m_close(t-1)_oils_log,0.5964,0.069,8.704,0.000,0.458,0.735
ethanol_Count(t-1)_sentiment,0.0005,0.000,3.067,0.004,0.000,0.001
rapeseed oil(t-1)_sentiment,0.0062,0.003,2.370,0.023,0.001,0.011
palm oil(t-1)_sentiment,-0.0081,0.003,-2.541,0.015,-0.015,-0.002

0,1,2,3
Omnibus:,33.907,Durbin-Watson:,2.057
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148.476
Skew:,-1.672,Prob(JB):,5.74e-33
Kurtosis:,11.355,Cond. No.,10000.0


In [21]:
#Sunflower Oil
#Running sunflower oil with 15 MRMR features
run_regression_return_df(['sunflower'], 20)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


                                         rsquared  features_len   oil_name  \
sunflower_6_regOLS_0.2                 -22.930794             1  sunflower   
sunflower_6_regOLS_0.30000000000000004 -43.961995             1  sunflower   
sunflower_6_regOLS_0.4000000000000001  -67.388455             1  sunflower   
sunflower_6_regOLS_0.5000000000000001  -91.723174             1  sunflower   
sunflower_6_OLS                          0.000000             1  sunflower   
sunflower_3_regOLS_0.2                 -23.323193             1  sunflower   
sunflower_3_regOLS_0.30000000000000004 -44.714287             1  sunflower   
sunflower_3_regOLS_0.4000000000000001  -68.541628             1  sunflower   
sunflower_3_regOLS_0.5000000000000001  -93.292772             1  sunflower   
sunflower_3_OLS                          0.000000             1  sunflower   
sunflower_2_regOLS_0.2                 -23.304936             1  sunflower   
sunflower_2_regOLS_0.30000000000000004 -44.679285             1 