# Running the Sales Prediction Models

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

import create_prediction_df as prep
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel

from sklearn import set_config
set_config(display="diagram")

import pickle

In [3]:
def change_dtypes(df):
    """Return a copy of ``df`` with the ``month`` and ``quarter`` columns cast to ``str``.

    The input frame is left untouched (the previous version assigned the
    converted columns back onto the caller's frame), so re-running a cell
    that calls this function cannot silently alter an upstream DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``month`` and ``quarter`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame; every other column is carried over unchanged.

    Raises
    ------
    KeyError
        If ``month`` or ``quarter`` is missing from ``df``.
    """
    return df.astype({'month': str, 'quarter': str})

## 1. Palm Oil

In [None]:
# ---- Palm oil: run configuration ----

# Seed used by the train/test split
random_state = 42

# Commodity name and its futures price file
oil = 'PALM OIL'
oil_file = 'data/palmoil_futures.csv'

# Scenario whose columns are removed before modelling:
# remove purchase when predicting sales, remove sales when predicting purchase.
remove_scenario = 'purchase'

# Date bounds handed to the prep helpers
# NOTE(review): presumably start / end of the data window - confirm against create_prediction_df.
first_date = '2018-01-01'
last_date = '2022-01-01'  # last date at which customer data will cut off

# Futures feature settings; each list currently holds the single value 2
ema_lookback = [2]
pct_lookback = [2]
adv_months_list = [2]  # which futures contracts to look at, in months ahead

# Lag settings: every variable group shares the same lag (2 = prediction for 2 months).
# NOTE(review): get_train_test_split rebuilds lags_by_variable from its own `lag`
# argument, so these two module-level values are not what the split uses.
lag = 2
lags_by_variable = {
    group: [lag]
    for group in ('cust_lag', 'oil_lag', 'fx_lag', 'sentiment_lag')
}

# Sentiment columns to keep
sentiment_cols = ['crude oil', 'palm oil', 'ethanol', 'grains']

In [None]:
def get_train_test_split(lag, oil, oil_file, first_date, last_date, ema_lookback, pct_lookback, adv_months_list, cols_to_keep):
    """Build the lagged feature table for one oil and split it 80/20 into train and test sets.

    Customer, futures/FX and sentiment frames are produced by the ``prep``
    helpers, merged, cleaned, and passed to ``train_test_split``.

    Parameters
    ----------
    lag : int
        Lag applied to all four variable groups (customer, oil, fx, sentiment).
    oil : str
        Oil name forwarded to ``prep.get_lagged_customer_df``.
    oil_file : str
        Futures file path forwarded to ``prep.get_lagged_futures_df``.
    first_date, last_date : str
        Date bounds forwarded to the ``prep`` helpers.
    ema_lookback, pct_lookback, adv_months_list : list
        Futures feature settings forwarded to ``prep.get_lagged_futures_df``.
    cols_to_keep : list of str
        Sentiment columns forwarded to ``prep.get_sentiment_df``. This argument
        was previously ignored in favour of the global ``sentiment_cols``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the target is ``sale`` cast to
        int and the features exclude ``sale`` and ``sales_on_month``.

    Notes
    -----
    Still reads the notebook-level globals ``remove_scenario`` and
    ``random_state``; both must be defined before the function is called.
    The notebook redefines this function in later oil sections; this variant
    drops ``volume`` and ``ema`` columns only.
    """
    # Each variable group gets a single-element lag list
    lags_by_variable = {'cust_lag': [lag],
                        'oil_lag': [lag],
                        'fx_lag': [lag],
                        'sentiment_lag': [lag]}

    # Customer features
    all_cust_monthly_df = prep.get_lagged_customer_df(oil=oil,
                                                      first_date=first_date,
                                                      last_date=last_date,
                                                      lags_by_variable=lags_by_variable,
                                                      verbose=False)

    # Oil, crude and FX futures features
    oils_df, crude_df, fx_df = prep.get_lagged_futures_df(oil_file=oil_file,
                                                          first_date=first_date,
                                                          last_date=last_date,
                                                          lags_by_variable=lags_by_variable,
                                                          ema_lookback=ema_lookback,
                                                          pct_lookback=pct_lookback,
                                                          adv_months_list=adv_months_list,
                                                          verbose=False)

    # Monthly sentiment features, restricted to the columns the caller asked for
    sentiment = prep.get_sentiment_df(lags_by_variable=lags_by_variable,
                                      cols_to_keep=cols_to_keep,
                                      verbose=False)

    # Merge everything into one frame (remove_scenario comes from the notebook globals)
    combined_df = prep.merge_dfs(all_cust_monthly_df=all_cust_monthly_df,
                                 remove_scenario=remove_scenario,
                                 oils_df=oils_df,
                                 crude_df=crude_df,
                                 fx_df=fx_df,
                                 sentiment_df=sentiment,
                                 verbose=True)

    # Drop the date column plus every volume / ema column
    unused_cols = ['date'] + [c for c in combined_df.columns if 'volume' in c or 'ema' in c]
    pred = combined_df.drop(columns=unused_cols)

    # Replace +/-inf with 0 so those cells are not treated as missing below
    pred = pred.replace([np.inf, -np.inf], 0)

    print('Length before dropping na (lag rows): {}'.format(len(pred)))
    pred = pred.dropna()  # remove rows left empty by lagging
    print('Length after dropping na (lag rows): {}'.format(len(pred)))

    # Cast month / quarter to str
    pred = change_dtypes(pred)

    # Target and features; the sales columns are removed from the features
    # because they are highly correlated with the target
    Y = pred['sale'].astype(int)
    X = pred.drop(columns=['sale', 'sales_on_month'])

    # NOTE(review): this is a shuffled random split over monthly rows, not a
    # chronological one - confirm that is intended for lagged time-series features.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the palm oil split. `lag` is passed as a literal here (not the config
# value); the model file loaded further down is named *_1M.
X_train, X_test, y_train, y_test = get_train_test_split(
    lag=1,  # change this
    oil=oil,
    oil_file=oil_file,
    first_date=first_date,
    last_date=last_date,
    ema_lookback=ema_lookback,
    pct_lookback=pct_lookback,
    adv_months_list=adv_months_list,
    cols_to_keep=sentiment_cols,
)

Generating Customer DF...
Length of Customer DF: 49920
Lagging customer variables...
Lagged customer df of len 49920 has been created containing 1040 unique customers.
Generating FUTURES DFs...
Lagging variables...
Created oils_df of length 48, crude_df of length 48 and fx_df of length 48.
You are getting sentiment for: ['crude oil', 'palm oil', 'ethanol', 'grains']
Created sentiment df of length 51
Remove customer columns: ['%purchase_on_month', '%purchase_on_month_lag_2', 'purchase(t-1)_cust', 'purchases_on_month(t-1)_cust', 'purchase_quantity_on_month(t-1)_cust', 'cumulative_purchases_till_month(t-1)_cust', 'purchase_frequency(t-1)_cust', 'purchase_frequency_over_year(t-1)_cust', 'purchase_frequency_over_quarter(t-1)_cust', 'months_since_purchase(t-1)_cust', 'purchase', 'purchases_on_month', 'counter_party_code', 'commodity']
Created combined_df for predection of len 49920 and first date 2018-01-31 00:00:00
Length before dropping na (lag rows): 49920
Length after dropping na (lag ro

In [None]:
# Load pickle model
# NOTE(security): pickle.load can execute arbitrary code - only load model
# files that come from a trusted source.
filename = 'best_model/palm_lgbm/lgbm_palm_1M.sav'
with open(filename, 'rb') as model_file:  # ensures the file handle is closed
    best_model = pickle.load(model_file)

# Make prediction and score (F1 on the positive class, label 1)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print(test_f1)

0.5517241379310346


## 2. Sunflower Oil

In [None]:
# ---- Sunflower oil: run configuration ----

# Seed used by the train/test split
random_state = 42

# Commodity name and its futures price file
oil = 'SUNFLOWER OIL'
oil_file = 'data/sunfloweroil_futures.csv'

# Scenario whose columns are removed before modelling:
# remove purchase when predicting sales, remove sales when predicting purchase.
remove_scenario = 'purchase'

# Date bounds handed to the prep helpers
# NOTE(review): presumably start / end of the data window - confirm against create_prediction_df.
first_date = '2018-01-01'
last_date = '2022-01-01'  # last date at which customer data will cut off

# Futures feature settings; each list currently holds the single value 2
ema_lookback = [2]
pct_lookback = [2]
adv_months_list = [2]  # which futures contracts to look at, in months ahead

# Lag settings: every variable group shares the same lag (2 = prediction for 2 months).
# NOTE(review): get_train_test_split rebuilds lags_by_variable from its own `lag`
# argument, so these two module-level values are not what the split uses.
lag = 2
lags_by_variable = {
    group: [lag]
    for group in ('cust_lag', 'oil_lag', 'fx_lag', 'sentiment_lag')
}

# Sentiment columns to keep
sentiment_cols = ['crude oil', 'crude oil_Count', 'sunflower oil', 'sunflower oil_Count']

In [None]:
def get_train_test_split(lag, oil, oil_file, first_date, last_date, ema_lookback, pct_lookback, adv_months_list, cols_to_keep):
    """Build the lagged feature table for one oil and split it 80/20 into train and test sets.

    Customer, futures/FX and sentiment frames are produced by the ``prep``
    helpers, merged, cleaned, and passed to ``train_test_split``.

    Parameters
    ----------
    lag : int
        Lag applied to all four variable groups (customer, oil, fx, sentiment).
    oil : str
        Oil name forwarded to ``prep.get_lagged_customer_df``.
    oil_file : str
        Futures file path forwarded to ``prep.get_lagged_futures_df``.
    first_date, last_date : str
        Date bounds forwarded to the ``prep`` helpers.
    ema_lookback, pct_lookback, adv_months_list : list
        Futures feature settings forwarded to ``prep.get_lagged_futures_df``.
    cols_to_keep : list of str
        Sentiment columns forwarded to ``prep.get_sentiment_df``. This argument
        was previously ignored in favour of the global ``sentiment_cols``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the target is ``sale`` cast to
        int and the features exclude ``sale`` and ``sales_on_month``.

    Notes
    -----
    Still reads the notebook-level globals ``remove_scenario`` and
    ``random_state``; both must be defined before the function is called.
    This variant drops ``volume``, ``ema`` and ``openinterest`` columns.
    """
    # Each variable group gets a single-element lag list
    lags_by_variable = {'cust_lag': [lag],
                        'oil_lag': [lag],
                        'fx_lag': [lag],
                        'sentiment_lag': [lag]}

    # Customer features
    all_cust_monthly_df = prep.get_lagged_customer_df(oil=oil,
                                                      first_date=first_date,
                                                      last_date=last_date,
                                                      lags_by_variable=lags_by_variable,
                                                      verbose=False)

    # Oil, crude and FX futures features
    oils_df, crude_df, fx_df = prep.get_lagged_futures_df(oil_file=oil_file,
                                                          first_date=first_date,
                                                          last_date=last_date,
                                                          lags_by_variable=lags_by_variable,
                                                          ema_lookback=ema_lookback,
                                                          pct_lookback=pct_lookback,
                                                          adv_months_list=adv_months_list,
                                                          verbose=False)

    # Monthly sentiment features, restricted to the columns the caller asked for
    sentiment = prep.get_sentiment_df(lags_by_variable=lags_by_variable,
                                      cols_to_keep=cols_to_keep,
                                      verbose=False)

    # Merge everything into one frame (remove_scenario comes from the notebook globals)
    combined_df = prep.merge_dfs(all_cust_monthly_df=all_cust_monthly_df,
                                 remove_scenario=remove_scenario,
                                 oils_df=oils_df,
                                 crude_df=crude_df,
                                 fx_df=fx_df,
                                 sentiment_df=sentiment,
                                 verbose=True)

    # Drop the date column plus every volume / ema / openinterest column
    unused_markers = ('volume', 'ema', 'openinterest')
    unused_cols = ['date'] + [c for c in combined_df.columns
                              if any(marker in c for marker in unused_markers)]
    pred = combined_df.drop(columns=unused_cols)

    # Replace +/-inf with 0 so those cells are not treated as missing below
    pred = pred.replace([np.inf, -np.inf], 0)

    print('Length before dropping na (lag rows): {}'.format(len(pred)))
    pred = pred.dropna()  # remove rows left empty by lagging
    print('Length after dropping na (lag rows): {}'.format(len(pred)))

    # Cast month / quarter to str
    pred = change_dtypes(pred)

    # Target and features; the sales columns are removed from the features
    # because they are highly correlated with the target
    Y = pred['sale'].astype(int)
    X = pred.drop(columns=['sale', 'sales_on_month'])

    # NOTE(review): this is a shuffled random split over monthly rows, not a
    # chronological one - confirm that is intended for lagged time-series features.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the sunflower oil split. `lag` is passed as a literal here; the model
# file loaded further down is named *_2M.
X_train, X_test, y_train, y_test = get_train_test_split(
    lag=2,  # change this
    oil=oil,
    oil_file=oil_file,
    first_date=first_date,
    last_date=last_date,
    ema_lookback=ema_lookback,
    pct_lookback=pct_lookback,
    adv_months_list=adv_months_list,
    cols_to_keep=sentiment_cols,
)

Generating Customer DF...
Length of Customer DF: 53232
Lagging customer variables...
Lagged customer df of len 53232 has been created containing 1109 unique customers.
Generating FUTURES DFs...
Lagging variables...
Created oils_df of length 48, crude_df of length 48 and fx_df of length 48.
You are getting sentiment for: ['crude oil', 'crude oil_Count', 'sunflower oil', 'sunflower oil_Count']
Created sentiment df of length 51
Remove customer columns: ['%purchase_on_month', '%purchase_on_month_lag_2', 'purchase(t-2)_cust', 'purchases_on_month(t-2)_cust', 'purchase_quantity_on_month(t-2)_cust', 'cumulative_purchases_till_month(t-2)_cust', 'purchase_frequency(t-2)_cust', 'purchase_frequency_over_year(t-2)_cust', 'purchase_frequency_over_quarter(t-2)_cust', 'months_since_purchase(t-2)_cust', 'purchase', 'purchases_on_month', 'counter_party_code', 'commodity']
Created combined_df for predection of len 53232 and first date 2018-01-31 00:00:00
Length before dropping na (lag rows): 53232
Length

In [None]:
# Load pickle model
# NOTE(security): pickle.load can execute arbitrary code - only load model
# files that come from a trusted source.
filename = 'best_model/sunflower_lgbm/lgbm_sunflower_2M.sav'
with open(filename, 'rb') as model_file:  # ensures the file handle is closed
    best_model = pickle.load(model_file)

# Make prediction and score (F1 on the positive class, label 1)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print(test_f1)

0.46587658357172046


## 3. Rapeseed Oil

In [None]:
# ---- Rapeseed oil: run configuration ----

# Seed used by the train/test split
random_state = 42

# Commodity name and its futures price file
oil = 'RAPE OIL'
oil_file = 'data/rapeseedoil_futures.csv'

# Scenario whose columns are removed before modelling:
# remove purchase when predicting sales, remove sales when predicting purchase.
remove_scenario = 'purchase'

# Date bounds handed to the prep helpers
# NOTE(review): presumably start / end of the data window - confirm against create_prediction_df.
first_date = '2018-01-01'
last_date = '2022-01-01'  # last date at which customer data will cut off

# Futures feature settings; each list currently holds the single value 2
ema_lookback = [2]
pct_lookback = [2]
adv_months_list = [2]  # which futures contracts to look at, in months ahead

# Lag settings: every variable group shares the same lag (2 = prediction for 2 months).
# NOTE(review): get_train_test_split rebuilds lags_by_variable from its own `lag`
# argument, so these two module-level values are not what the split uses.
lag = 2
lags_by_variable = {
    group: [lag]
    for group in ('cust_lag', 'oil_lag', 'fx_lag', 'sentiment_lag')
}

# Sentiment columns to keep
sentiment_cols = ['crude oil', 'crude oil_Count', 'rapeseed oil', 'rapeseed oil_Count']

In [None]:
def get_train_test_split(lag, oil, oil_file, first_date, last_date, ema_lookback, pct_lookback, adv_months_list, cols_to_keep):
    """Build the lagged feature table for one oil and split it 80/20 into train and test sets.

    Customer, futures/FX and sentiment frames are produced by the ``prep``
    helpers, merged, cleaned, and passed to ``train_test_split``.

    Parameters
    ----------
    lag : int
        Lag applied to all four variable groups (customer, oil, fx, sentiment).
    oil : str
        Oil name forwarded to ``prep.get_lagged_customer_df``.
    oil_file : str
        Futures file path forwarded to ``prep.get_lagged_futures_df``.
    first_date, last_date : str
        Date bounds forwarded to the ``prep`` helpers.
    ema_lookback, pct_lookback, adv_months_list : list
        Futures feature settings forwarded to ``prep.get_lagged_futures_df``.
    cols_to_keep : list of str
        Sentiment columns forwarded to ``prep.get_sentiment_df``. This argument
        was previously ignored in favour of the global ``sentiment_cols``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the target is ``sale`` cast to
        int and the features exclude ``sale`` and ``sales_on_month``.

    Notes
    -----
    Still reads the notebook-level globals ``remove_scenario`` and
    ``random_state``; both must be defined before the function is called.
    This variant drops ``volume``, ``ema`` and ``openinterest`` columns.
    """
    # Each variable group gets a single-element lag list
    lags_by_variable = {'cust_lag': [lag],
                        'oil_lag': [lag],
                        'fx_lag': [lag],
                        'sentiment_lag': [lag]}

    # Customer features
    all_cust_monthly_df = prep.get_lagged_customer_df(oil=oil,
                                                      first_date=first_date,
                                                      last_date=last_date,
                                                      lags_by_variable=lags_by_variable,
                                                      verbose=False)

    # Oil, crude and FX futures features
    oils_df, crude_df, fx_df = prep.get_lagged_futures_df(oil_file=oil_file,
                                                          first_date=first_date,
                                                          last_date=last_date,
                                                          lags_by_variable=lags_by_variable,
                                                          ema_lookback=ema_lookback,
                                                          pct_lookback=pct_lookback,
                                                          adv_months_list=adv_months_list,
                                                          verbose=False)

    # Monthly sentiment features, restricted to the columns the caller asked for
    sentiment = prep.get_sentiment_df(lags_by_variable=lags_by_variable,
                                      cols_to_keep=cols_to_keep,
                                      verbose=False)

    # Merge everything into one frame (remove_scenario comes from the notebook globals)
    combined_df = prep.merge_dfs(all_cust_monthly_df=all_cust_monthly_df,
                                 remove_scenario=remove_scenario,
                                 oils_df=oils_df,
                                 crude_df=crude_df,
                                 fx_df=fx_df,
                                 sentiment_df=sentiment,
                                 verbose=True)

    # Drop the date column plus every volume / ema / openinterest column
    unused_markers = ('volume', 'ema', 'openinterest')
    unused_cols = ['date'] + [c for c in combined_df.columns
                              if any(marker in c for marker in unused_markers)]
    pred = combined_df.drop(columns=unused_cols)

    # Replace +/-inf with 0 so those cells are not treated as missing below
    pred = pred.replace([np.inf, -np.inf], 0)

    print('Length before dropping na (lag rows): {}'.format(len(pred)))
    pred = pred.dropna()  # remove rows left empty by lagging
    print('Length after dropping na (lag rows): {}'.format(len(pred)))

    # Cast month / quarter to str
    pred = change_dtypes(pred)

    # Target and features; the sales columns are removed from the features
    # because they are highly correlated with the target
    Y = pred['sale'].astype(int)
    X = pred.drop(columns=['sale', 'sales_on_month'])

    # NOTE(review): this is a shuffled random split over monthly rows, not a
    # chronological one - confirm that is intended for lagged time-series features.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the rapeseed oil split. `lag` is passed as a literal here; the model
# file loaded further down is named *_2M.
X_train, X_test, y_train, y_test = get_train_test_split(
    lag=2,  # change this
    oil=oil,
    oil_file=oil_file,
    first_date=first_date,
    last_date=last_date,
    ema_lookback=ema_lookback,
    pct_lookback=pct_lookback,
    adv_months_list=adv_months_list,
    cols_to_keep=sentiment_cols,
)

Generating Customer DF...
Length of Customer DF: 31824
Lagging customer variables...
Lagged customer df of len 31824 has been created containing 663 unique customers.
Generating FUTURES DFs...
Lagging variables...
Created oils_df of length 48, crude_df of length 48 and fx_df of length 48.
You are getting sentiment for: ['crude oil', 'crude oil_Count', 'rapeseed oil', 'rapeseed oil_Count']
Created sentiment df of length 51
Remove customer columns: ['%purchase_on_month', '%purchase_on_month_lag_2', 'purchase(t-2)_cust', 'purchases_on_month(t-2)_cust', 'purchase_quantity_on_month(t-2)_cust', 'cumulative_purchases_till_month(t-2)_cust', 'purchase_frequency(t-2)_cust', 'purchase_frequency_over_year(t-2)_cust', 'purchase_frequency_over_quarter(t-2)_cust', 'months_since_purchase(t-2)_cust', 'purchase', 'purchases_on_month', 'counter_party_code', 'commodity']
Created combined_df for predection of len 31824 and first date 2018-01-31 00:00:00
Length before dropping na (lag rows): 31824
Length af

In [None]:
# Load pickle model
# NOTE(security): pickle.load can execute arbitrary code - only load model
# files that come from a trusted source.
filename = 'best_model/rapeseed_lgbm/lgbm_rapeseed_2M.sav'
with open(filename, 'rb') as model_file:  # ensures the file handle is closed
    best_model = pickle.load(model_file)

# Make prediction and score (F1 on the positive class, label 1)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print(test_f1)

0.5371655104063429


## 4. Soybean Oil

In [4]:
def get_train_test_split(lag, oil, oil_file, first_date, last_date, ema_lookback, pct_lookback, adv_months_list, cols_to_keep):
    """Build the lagged feature table for one oil and split it 80/20 into train and test sets.

    Customer, futures/FX and sentiment frames are produced by the ``prep``
    helpers, merged, cleaned, and passed to ``train_test_split``.

    Parameters
    ----------
    lag : int
        Lag applied to all four variable groups (customer, oil, fx, sentiment).
    oil : str
        Oil name forwarded to ``prep.get_lagged_customer_df``.
    oil_file : str
        Futures file path forwarded to ``prep.get_lagged_futures_df``.
    first_date, last_date : str
        Date bounds forwarded to the ``prep`` helpers.
    ema_lookback, pct_lookback, adv_months_list : list
        Futures feature settings forwarded to ``prep.get_lagged_futures_df``.
    cols_to_keep : list of str
        Sentiment columns forwarded to ``prep.get_sentiment_df``. This argument
        was previously ignored in favour of the global ``sentiment_cols``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the target is ``sale`` cast to
        int and the features exclude ``sale`` and ``sales_on_month``.

    Notes
    -----
    Still reads the notebook-level globals ``remove_scenario`` and
    ``random_state``; both must be defined before the function is called.
    This variant drops ``volume`` and ``ema`` columns only.
    """
    # Each variable group gets a single-element lag list
    lags_by_variable = {'cust_lag': [lag],
                        'oil_lag': [lag],
                        'fx_lag': [lag],
                        'sentiment_lag': [lag]}

    # Customer features
    all_cust_monthly_df = prep.get_lagged_customer_df(oil=oil,
                                                      first_date=first_date,
                                                      last_date=last_date,
                                                      lags_by_variable=lags_by_variable,
                                                      verbose=False)

    # Oil, crude and FX futures features
    oils_df, crude_df, fx_df = prep.get_lagged_futures_df(oil_file=oil_file,
                                                          first_date=first_date,
                                                          last_date=last_date,
                                                          lags_by_variable=lags_by_variable,
                                                          ema_lookback=ema_lookback,
                                                          pct_lookback=pct_lookback,
                                                          adv_months_list=adv_months_list,
                                                          verbose=False)

    # Monthly sentiment features, restricted to the columns the caller asked for
    sentiment = prep.get_sentiment_df(lags_by_variable=lags_by_variable,
                                      cols_to_keep=cols_to_keep,
                                      verbose=False)

    # Merge everything into one frame (remove_scenario comes from the notebook globals)
    combined_df = prep.merge_dfs(all_cust_monthly_df=all_cust_monthly_df,
                                 remove_scenario=remove_scenario,
                                 oils_df=oils_df,
                                 crude_df=crude_df,
                                 fx_df=fx_df,
                                 sentiment_df=sentiment,
                                 verbose=True)

    # Drop the date column plus every volume / ema column
    unused_cols = ['date'] + [c for c in combined_df.columns if 'volume' in c or 'ema' in c]
    pred = combined_df.drop(columns=unused_cols)

    # Replace +/-inf with 0 so those cells are not treated as missing below
    pred = pred.replace([np.inf, -np.inf], 0)

    print('Length before dropping na (lag rows): {}'.format(len(pred)))
    pred = pred.dropna()  # remove rows left empty by lagging
    print('Length after dropping na (lag rows): {}'.format(len(pred)))

    # Cast month / quarter to str
    pred = change_dtypes(pred)

    # Target and features; the sales columns are removed from the features
    # because they are highly correlated with the target
    Y = pred['sale'].astype(int)
    X = pred.drop(columns=['sale', 'sales_on_month'])

    # NOTE(review): this is a shuffled random split over monthly rows, not a
    # chronological one - confirm that is intended for lagged time-series features.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [5]:
# ---- Soybean oil: run configuration ----

# Seed used by the train/test split
random_state = 42

# Commodity name and its futures price file
oil = 'SOYBEAN OIL'
oil_file = 'data/soyoil_futures.csv'

# Scenario whose columns are removed before modelling:
# remove purchase when predicting sales, remove sales when predicting purchase.
remove_scenario = 'purchase'

# Date bounds handed to the prep helpers
# NOTE(review): presumably start / end of the data window - confirm against create_prediction_df.
first_date = '2018-01-01'
last_date = '2022-01-01'  # last date at which customer data will cut off

# Futures feature settings; each list currently holds the single value 2
ema_lookback = [2]
pct_lookback = [2]
adv_months_list = [2]  # which futures contracts to look at, in months ahead

# Lag settings: every variable group shares the same lag (2 = prediction for 2 months).
# NOTE(review): get_train_test_split rebuilds lags_by_variable from its own `lag`
# argument, so these two module-level values are not what the split uses.
lag = 2
lags_by_variable = {
    group: [lag]
    for group in ('cust_lag', 'oil_lag', 'fx_lag', 'sentiment_lag')
}

# Sentiment columns to keep
sentiment_cols = ['crude oil','crude oil_Count', 'soy', 'soy_Count', 'grains', 'ethanol']

In [6]:
# Build the soybean oil split. `lag` is passed as a literal here (not the config
# value); the model file loaded further down is named *_1M.
X_train, X_test, y_train, y_test = get_train_test_split(
    lag=1,  # change this
    oil=oil,
    oil_file=oil_file,
    first_date=first_date,
    last_date=last_date,
    ema_lookback=ema_lookback,
    pct_lookback=pct_lookback,
    adv_months_list=adv_months_list,
    cols_to_keep=sentiment_cols,
)

Generating Customer DF...
Length of Customer DF: 21696
Lagging customer variables...
Lagged customer df of len 21696 has been created containing 452 unique customers.
Generating FUTURES DFs...
Lagging variables...
Created oils_df of length 48, crude_df of length 48 and fx_df of length 48.
You are getting sentiment for: ['crude oil', 'crude oil_Count', 'soy', 'soy_Count', 'grains', 'ethanol']
Created sentiment df of length 51
Remove customer columns: ['%purchase_on_month', '%purchase_on_month_lag_2', 'purchase(t-1)_cust', 'purchases_on_month(t-1)_cust', 'purchase_quantity_on_month(t-1)_cust', 'cumulative_purchases_till_month(t-1)_cust', 'purchase_frequency(t-1)_cust', 'purchase_frequency_over_year(t-1)_cust', 'purchase_frequency_over_quarter(t-1)_cust', 'months_since_purchase(t-1)_cust', 'purchase', 'purchases_on_month', 'counter_party_code', 'commodity']
Created combined_df for predection of len 21696 and first date 2018-01-31 00:00:00
Length before dropping na (lag rows): 21696
Length

In [7]:
# Load pickle model
# NOTE(security): pickle.load can execute arbitrary code - only load model
# files that come from a trusted source.
filename = 'best_model/soy_lgbm/lgbm_soy_1M.sav'
with open(filename, 'rb') as model_file:  # ensures the file handle is closed
    best_model = pickle.load(model_file)

# Make prediction and score (F1 on the positive class, label 1)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print(test_f1)

0.5563380281690141


## 5. Coconut Oil

In [None]:
# ---- Coconut oil: run configuration (no futures file is set in this section) ----

# Seed used by the train/test split
random_state = 42

# Commodity name
oil = 'COCONUT OIL'

# Scenario whose columns are removed before modelling:
# remove purchase when predicting sales, remove sales when predicting purchase.
remove_scenario = 'purchase'

# Date bounds handed to the prep helpers
# NOTE(review): presumably start / end of the data window - confirm against create_prediction_df.
first_date = '2018-01-01'
last_date = '2022-01-01'  # last date at which customer data will cut off

# Lag settings: both variable groups share the same lag (2 = prediction for 2 months).
# NOTE(review): get_train_test_split rebuilds lags_by_variable from its own `lag`
# argument, so these two module-level values are not what the split uses.
lag = 2
lags_by_variable = {
    group: [lag]
    for group in ('cust_lag', 'sentiment_lag')
}

# Sentiment columns to keep
sentiment_cols = ['coconut oil', 'coconut oil_Count', 'crude oil', 'crude oil_Count']

In [None]:
def get_train_test_split(lag, oil, first_date, last_date, remove_scenario, cols_to_keep):
    """Build the customer + sentiment feature table for one oil and split it 80/20.

    This variant uses no futures data: the lagged customer frame is
    left-joined to the monthly sentiment frame on ``date``.

    Parameters
    ----------
    lag : int
        Lag applied to every variable group.
    oil : str
        Oil name forwarded to ``prep.get_lagged_customer_df``.
    first_date, last_date : str
        Date bounds forwarded to ``prep.get_lagged_customer_df``.
    remove_scenario : str
        Customer columns whose name contains this substring are dropped.
    cols_to_keep : list of str
        Sentiment columns forwarded to ``prep.get_sentiment_df``. This argument
        was previously ignored in favour of the global ``sentiment_cols``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the target is ``sale`` cast to
        int and the features exclude ``sale`` and ``sales_on_month``.

    Notes
    -----
    Still reads the notebook-level global ``random_state``, which must be
    defined before the function is called.
    """
    # The oil / fx keys are kept as in the original even though no futures
    # frame is built in this variant.
    lags_by_variable = {'cust_lag': [lag],
                        'oil_lag': [lag],
                        'fx_lag': [lag],
                        'sentiment_lag': [lag]}

    # Customer features
    all_cust_monthly_df = prep.get_lagged_customer_df(oil=oil,
                                                      first_date=first_date,
                                                      last_date=last_date,
                                                      lags_by_variable=lags_by_variable,
                                                      verbose=False)

    # Monthly sentiment features, restricted to the columns the caller asked for
    sentiment = prep.get_sentiment_df(lags_by_variable=lags_by_variable,
                                      cols_to_keep=cols_to_keep,
                                      verbose=False)

    # Remove either purchase or sales columns, plus the identifier columns
    scenario_cols = [v for v in all_cust_monthly_df.columns if remove_scenario in v] + ['counter_party_code', 'commodity']
    print('Remove customer columns: {}'.format(scenario_cols))

    all_cust_monthly_df_subset = all_cust_monthly_df.drop(scenario_cols, axis=1)
    # Attach sentiment to each customer-month row
    combined_df = all_cust_monthly_df_subset.merge(sentiment, how='left', on='date')

    # Drop the date column plus every volume / ema column
    unused_cols = ['date'] + [c for c in combined_df.columns if 'volume' in c or 'ema' in c]
    pred = combined_df.drop(columns=unused_cols)

    # Replace +/-inf with 0 so those cells are not treated as missing below
    pred = pred.replace([np.inf, -np.inf], 0)

    print('Length before dropping na (lag rows): {}'.format(len(pred)))
    pred = pred.dropna()  # remove rows left empty by lagging
    print('Length after dropping na (lag rows): {}'.format(len(pred)))

    # Cast month / quarter to str
    pred = change_dtypes(pred)

    # Target and features; the sales columns are removed from the features
    # because they are highly correlated with the target
    Y = pred['sale'].astype(int)
    X = pred.drop(columns=['sale', 'sales_on_month'])

    # NOTE(review): this is a shuffled random split over monthly rows, not a
    # chronological one - confirm that is intended for lagged time-series features.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the coconut oil split (customer + sentiment features only).
X_train, X_test, y_train, y_test = get_train_test_split(
    lag=2,  # for prediction 2 months in advance
    oil=oil,
    first_date=first_date,
    last_date=last_date,
    remove_scenario=remove_scenario,
    cols_to_keep=sentiment_cols,
)

Generating Customer DF...
Length of Customer DF: 13872
Lagging customer variables...
Lagged customer df of len 13872 has been created containing 289 unique customers.
You are getting sentiment for: ['coconut oil', 'coconut oil_Count', 'crude oil', 'crude oil_Count']
Created sentiment df of length 51
Remove customer columns: ['%purchase_on_month', '%purchase_on_month_lag_2', 'purchase(t-2)_cust', 'purchases_on_month(t-2)_cust', 'purchase_quantity_on_month(t-2)_cust', 'cumulative_purchases_till_month(t-2)_cust', 'purchase_frequency(t-2)_cust', 'purchase_frequency_over_year(t-2)_cust', 'purchase_frequency_over_quarter(t-2)_cust', 'months_since_purchase(t-2)_cust', 'purchase', 'purchases_on_month', 'counter_party_code', 'commodity']
Length before dropping na (lag rows): 13872
Length after dropping na (lag rows): 13294


In [None]:
# Load pickle model
# NOTE(security): pickle.load can execute arbitrary code - only load model
# files that come from a trusted source.
filename = 'best_model/coconut_lgbm/lgbm_coconut_2M.sav'
with open(filename, 'rb') as model_file:  # ensures the file handle is closed
    best_model = pickle.load(model_file)

# Make prediction and score (F1 on the positive class, label 1)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print(test_f1)

0.5624543462381301


## 6. Palm Kernel Oil

In [8]:
def get_train_test_split(lag, oil, oil_file, first_date, last_date, ema_lookback, pct_lookback, adv_months_list, cols_to_keep):
    """Build the lagged prediction frame for one oil and split it 80/20 into train and test sets.

    The customer, futures/FX and sentiment frames are generated by the ``prep``
    helpers, merged with ``prep.merge_dfs``, stripped of columns that are not
    used as features, cleaned of inf/NaN values and finally split with
    ``train_test_split``.

    Parameters
    ----------
    lag
        Lag applied to every variable group (``cust_lag``, ``oil_lag``,
        ``fx_lag``, ``sentiment_lag``).
    oil
        Oil name forwarded to ``prep.get_lagged_customer_df``.
    oil_file
        Futures file path forwarded to ``prep.get_lagged_futures_df``.
    first_date, last_date
        Date bounds forwarded to the customer and futures helpers.
    ema_lookback, pct_lookback, adv_months_list
        Futures settings forwarded to ``prep.get_lagged_futures_df``.
    cols_to_keep
        Sentiment columns forwarded to ``prep.get_sentiment_df``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames (all columns except ``sale`` and ``sales_on_month``)
        and the integer ``sale`` target, split with ``test_size=0.20``.

    Notes
    -----
    ``remove_scenario`` and ``random_state`` are not parameters: they are read
    from the notebook's global scope, so the parameter cell must have been run
    before this function is called.
    """
    # Set lag variable params: every variable group uses the same single lag
    lags_by_variable = {'cust_lag': [lag],
                        'oil_lag': [lag],
                        'fx_lag': [lag],
                        'sentiment_lag': [lag]} # all lags must be greater than initial lag period

    # Generate customer df
    all_cust_monthly_df = prep.get_lagged_customer_df(oil = oil,
                                                      first_date = first_date,
                                                      last_date = last_date,
                                                      lags_by_variable = lags_by_variable,
                                                      verbose = False)

    # Generate oils and fx dfs
    oils_df, crude_df, fx_df = prep.get_lagged_futures_df(oil_file = oil_file,
                                                          first_date = first_date,
                                                          last_date = last_date,
                                                          lags_by_variable = lags_by_variable,
                                                          ema_lookback = ema_lookback,
                                                          pct_lookback = pct_lookback,
                                                          adv_months_list = adv_months_list,
                                                          verbose = False)

    # Generate monthly sentiment df.
    # Use the cols_to_keep argument; previously the global sentiment_cols was
    # passed here, which silently ignored whatever the caller supplied.
    sentiment = prep.get_sentiment_df(lags_by_variable = lags_by_variable,
                                      cols_to_keep = cols_to_keep,
                                      verbose = False)

    # Merge all dfs (remove_scenario comes from the notebook's global scope)
    combined_df = prep.merge_dfs(all_cust_monthly_df = all_cust_monthly_df,
                                 remove_scenario = remove_scenario,
                                 oils_df = oils_df,
                                 crude_df = crude_df,
                                 fx_df = fx_df,
                                 sentiment_df = sentiment,
                                 verbose = True)

    # Remove columns unrelated to prediction: the date plus any
    # volume / ema / open-interest column
    excluded_tags = ('volume', 'ema', 'openinterest')
    remove = [col for col in combined_df.columns if any(tag in col for tag in excluded_tags)]
    pred = combined_df.drop(['date'] + remove, axis = 1)

    # Replace infinite values with 0 before the NaN rows are dropped
    pred = pred.replace([np.inf, -np.inf], 0)

    print('Length before dropping na (lag rows): {}'.format(len(pred)))

    pred = pred.dropna() # remove lag empty rows
    print('Length after dropping na (lag rows): {}'.format(len(pred)))

    # Change data types (month / quarter are cast to str by change_dtypes)
    pred = change_dtypes(pred)

    # Define Train-Test split
    Y = pred['sale'].astype(int)
    cols_to_drop = ['sale', 'sales_on_month'] # remove highly correlated features manually
    X = pred.drop(cols_to_drop, axis = 1)
    # random_state comes from the notebook's global scope
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [9]:
# PARAMETERS FOR THE PALM KERNEL OIL RUN - edit values here only

# Seed for the train/test split
random_state = 42

# Oil selection and data window
oil = 'PALM KERNEL OIL'
oil_file = 'data/palmkernaloil_futures.csv'
# Scenario whose columns are removed: 'purchase' when predicting sales,
# the sales scenario when predicting purchases.
remove_scenario = 'purchase'
# Date bounds handed to the data-prep helpers (start and cut-off of the data window)
first_date = '2018-01-01'
last_date = '2022-01-01'

# Futures settings - each list currently holds a single 2-month entry
ema_lookback = [2]
pct_lookback = [2]
adv_months_list = [2]  # futures contract horizon(s), in months ahead

# Lag settings: 2 = predict 2 months ahead; every variable group shares this lag
# (all lags must be at least the initial lag period)
lag = 2
lags_by_variable = {
    group: [lag]
    for group in ('cust_lag', 'oil_lag', 'fx_lag', 'sentiment_lag')
}

# Sentiment columns to keep
sentiment_cols = ['crude oil', 'crude oil_Count', 'palm oil', 'palm oil_Count']

In [10]:
# Build the Palm Kernel Oil train/test split from the values in the parameter cell.
X_train, X_test, y_train, y_test = get_train_test_split(lag = lag, # taken from the parameter cell instead of a second hard-coded 2
                                                        oil = oil,
                                                        oil_file = oil_file,
                                                        first_date = first_date,
                                                        last_date = last_date,
                                                        ema_lookback = ema_lookback,
                                                        pct_lookback = pct_lookback,
                                                        adv_months_list = adv_months_list,
                                                        cols_to_keep = sentiment_cols)

Generating Customer DF...
Length of Customer DF: 19200
Lagging customer variables...
Lagged customer df of len 19200 has been created containing 400 unique customers.
Generating FUTURES DFs...
Lagging variables...
Created oils_df of length 48, crude_df of length 48 and fx_df of length 48.
You are getting sentiment for: ['crude oil', 'crude oil_Count', 'palm oil', 'palm oil_Count']
Created sentiment df of length 51
Remove customer columns: ['%purchase_on_month', '%purchase_on_month_lag_2', 'purchase(t-2)_cust', 'purchases_on_month(t-2)_cust', 'purchase_quantity_on_month(t-2)_cust', 'cumulative_purchases_till_month(t-2)_cust', 'purchase_frequency(t-2)_cust', 'purchase_frequency_over_year(t-2)_cust', 'purchase_frequency_over_quarter(t-2)_cust', 'months_since_purchase(t-2)_cust', 'purchase', 'purchases_on_month', 'counter_party_code', 'commodity']
Created combined_df for predection of len 19200 and first date 2018-01-31 00:00:00
Length before dropping na (lag rows): 19200
Length after drop

In [11]:
# Load pickle model
# NOTE(review): unpickling can execute arbitrary code - only load model files from a trusted source.
filename = 'best_model/palmkernel_lgbm/lgbm_palmkernel_2M.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle after loading
    best_model = pickle.load(model_file)

# Make prediction and score (F1 on the positive class)
y_pred = best_model.predict(X_test)
test_f1 = f1_score(y_test, y_pred, pos_label=1)
print(test_f1)

0.4718562874251497
