## Linear Regression vs Lasso and Ridge Regression for Arbitrum Prices

In [293]:
# Lasso regression and ridge regression are regularization techniques.
# Regularization helps avoid overfitting by adding a penalty term to the best fit derived from the trained data,
# to achieve a lower variance with the tested data and restrict the influence of predictor variables over the output variable
# Shrinkage is where the data values are shrunk toward a central point as the mean.
# Lasso and ridge regression encourage simple, sparse models (fewer parameters)
# This type of regression is well-suited for models showing high levels of multicollinearity
# or when you want to automate certain parts of model selection, like variable selection/parameter elimination

# In Lasso regression, the cost function adds a penalty that is the absolute value of the magnitude of the coefficients

# In Ridge regression, the cost function adds a penalty that is equivalent to the square of the magnitude of the coefficients

In [294]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [312]:
# Import price data for 1-minute intervals for Arbitrum, Bitcoin, Ethereum, Optimism, and Polygon
# See if Arbitrum's price action is driven by the others
imported_data = pd.read_csv("../Desktop/crypto_data_python/arb_and_comp_042823_1min.csv")
# print(imported_data.head())

In [313]:
imported_data.rename(columns={"open": "arb_open", "high": "arb_high", "low": "arb_low", "close": "arb_close"}, inplace=True)
imported_data.rename(columns={"BTCUSD, BINANCE: Open": "btc_open",  "BTCUSD, BINANCE: High": "btc_high", "BTCUSD, BINANCE: Low": "btc_low", "BTCUSD, BINANCE: Close": "btc_close"}, inplace=True)
imported_data.rename(columns={"ETHUSD, BINANCE: Open": "eth_open", "ETHUSD, BINANCE: High": "eth_high", "ETHUSD, BINANCE: Low": "eth_low", "ETHUSD, BINANCE: Close": "eth_close"}, inplace=True)
imported_data.rename(columns={"OPUSD, BINANCE: Open": "op_open", "OPUSD, BINANCE: High": "op_high", "OPUSD, BINANCE: Low": "op_low", "OPUSD, BINANCE: Close": "op_close"}, inplace=True)
imported_data.rename(columns={"MATICUSD, BINANCE: Open": "matic_open", "MATICUSD, BINANCE: High": "matic_high", "MATICUSD, BINANCE: Low": "matic_low", "MATICUSD, BINANCE: Close": "matic_close"}, inplace=True)
imported_data.drop(columns=["RSI", "RSI-based MA", "Upper Bollinger Band", "Lower Bollinger Band"], inplace=True)
imported_data["time"] = pd.to_datetime(imported_data["time"])
# print(imported_data.head())

In [314]:
# Use only the closing data 
# Drop all OP data because there are a lot of missing data points from Binance (and also other exchanges)
columns_to_drop =  ["time", "eth_open", "eth_high", "eth_low", "op_open", "op_high", "op_low", "op_close", "btc_open", "btc_high", "btc_low", \
    "arb_open", "arb_high", "arb_low", "matic_open", "matic_high", "matic_low", "MA", "Smoothing Line", "EMA", "Smoothing Line.1", "Volume", "Volume MA"]
imported_data = imported_data.drop(columns_to_drop, axis=1)
# print(imported_data.head())

In [315]:
imported_data["matic_close_shifted"] = imported_data["matic_close"].shift(1)
imported_data["btc_close_shifted"] = imported_data["btc_close"].shift(1)
imported_data["eth_close_shifted"] = imported_data["eth_close"].shift(1)
imported_data = imported_data.dropna()
# print(imported_data.head())
# print(imported_data.tail())

In [316]:
# Separate the predictor variables from the response variable
y = imported_data["arb_close"]
X = pd.DataFrame(imported_data, columns=["btc_close", "eth_close", "matic_close"])
X_shifted = pd.DataFrame(imported_data, columns=["btc_close_shifted", "eth_close_shifted", "matic_close_shifted"])
# print(imported_data.head())

In [317]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_shifted_train, X_shifted_test, y_train, y_test = train_test_split(X_shifted, y, test_size=0.2, random_state=42)

### Linear Regression

In [349]:
# Create a Linear regression
from sklearn.linear_model import LinearRegression
lin_reg_model = LinearRegression()
train_score = lin_reg_model.fit(X_train, y_train).score(X_train, y_train)
test_score = lin_reg_model.fit(X_test, y_test).score(X_test, y_test)
y_pred = lin_reg_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Linear regression - unshifted: ")
print("train_score: ", train_score)
print("test_score: ", test_score)
print("mse: ", mse)
lin_reg_model = LinearRegression()
train_shifted_score = lin_reg_model.fit(X_shifted_train, y_train).score(X_shifted_train, y_train)
test_shifted_score = lin_reg_model.fit(X_shifted_test, y_test).score(X_shifted_test, y_test)
y_shifted_pred = lin_reg_model.predict(X_shifted_test)
mse_shifted = mean_squared_error(y_test, y_shifted_pred)
print("Linear regression - shifted: ")
print("train_score: ", train_shifted_score)
print("test_score: ", test_shifted_score)
print("mse_shifted: ", mse_shifted)


Linear regression - unshifted: 
train_score:  0.8093944338091594
test_score:  0.8784764159422679
mse:  1.4297694926804144e-05
Linear regression - shifted: 
train_score:  0.8020999571463565
test_score:  0.850093554720059
mse_shifted:  1.7637042544400532e-05


##### -> The linear regression performs better without the 1-minute shift on Bitcoin, Ethereum, and Polygon

### Lasso Regression

In [325]:
# Create a Linear regression
from sklearn.linear_model import LinearRegression
alphas = [0.0001, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.10, 0.12, 0.15, 0.20]
for alpha_value in alphas:
    lasso_model = Lasso(alpha=alpha_value)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    train_score = lasso_model.score(X_train, y_train)
    test_score = lasso_model.score(X_test, y_test)
    print("alpha: ", alpha_value, ", mse: ", mse, ", train_score: ", train_score, ", test_score: ", test_score)

alpha:  0.0001 , mse:  2.8059886497948458e-05 , train_score:  0.7118744059628961 , test_score:  0.761504354866938
alpha:  0.02 , mse:  8.306628950829137e-05 , train_score:  0.2727087261889618 , test_score:  0.29397617818097455
alpha:  0.03 , mse:  0.00011277932598640406 , train_score:  0.03414345872919422 , test_score:  0.04142954709507196
alpha:  0.04 , mse:  0.00011300181647846996 , train_score:  0.03359844345001983 , test_score:  0.039538484084353454
alpha:  0.05 , mse:  0.00011323771771251175 , train_score:  0.032897709519653096 , test_score:  0.037533436166408474
alpha:  0.06 , mse:  0.00011348702968852926 , train_score:  0.03204125693809445 , test_score:  0.035414403341238576
alpha:  0.08 , mse:  0.00011402588586649005 , train_score:  0.02986119582140212 , test_score:  0.030834382969235574
alpha:  0.1 , mse:  0.00011461838501235287 , train_score:  0.027058260099937503 , test_score:  0.025798422968340007
alpha:  0.12 , mse:  0.00011526452712611798 , train_score:  0.023632449773701

##### -> Lasso does not work here as the train and test scores are worse than that of linear regression, MSE is higher,
##### and the best train and test scores come as alpha approaches 0, negating the penalty introduced by Lasso

### Ridge Regression

In [324]:
from sklearn.linear_model import Ridge
alphas = [0.0001, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.10, 0.12, 0.15, 0.20]
for alpha_value in alphas:
    ridge_model = Ridge(alpha=alpha_value)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    train_score = ridge_model.score(X_train, y_train)
    test_score = ridge_model.score(X_test, y_test)
    print("alpha: ", alpha_value, ", mse: ", mse, ", train_score: ", train_score, ", test_score: ", test_score)

alpha:  0.0001 , mse:  1.4885088325767738e-05 , train_score:  0.8082108632406028 , test_score:  0.8734838523535675
alpha:  0.02 , mse:  2.707537613304483e-05 , train_score:  0.7193083213147495 , test_score:  0.7698722231630213
alpha:  0.03 , mse:  2.7384527104422265e-05 , train_score:  0.7169311143294905 , test_score:  0.7672445874322936
alpha:  0.04 , mse:  2.7543614354417413e-05 , train_score:  0.7157070689013425 , test_score:  0.7658924217233264
alpha:  0.05 , mse:  2.7640559496596625e-05 , train_score:  0.7149609384767721 , test_score:  0.7650684342767543
alpha:  0.06 , mse:  2.770582494644377e-05 , train_score:  0.7144585553337044 , test_score:  0.764513709097544
alpha:  0.08 , mse:  2.7788136500320703e-05 , train_score:  0.7138249186054201 , test_score:  0.7638141001684338
alpha:  0.1 , mse:  2.7837922665278793e-05 , train_score:  0.7134416867223095 , test_score:  0.7633909415241089
alpha:  0.12 , mse:  2.7871286201263824e-05 , train_score:  0.7131849166125541 , test_score:  0.76

##### -> Ridge regression works here, but not as well as that of linear regression. The MSE is closer to that of linear reg but still higher,
##### Still, the best train and test scores come as alpha approaches 0, negating the penalty introduced by Ridge regression

### Develop the Linear Regresion Model

In [347]:
# import statsmodels.api as sm
# X_constant = sm.add_constant(X_train)

In [342]:
train_score = lin_reg_model.fit(X_train, y_train)
test_score = lin_reg_model.fit(X_test, y_test)

In [345]:
coefficients = lin_reg_model.coef_
intercept = lin_reg_model.intercept_

In [346]:
# Coefficients are: "btc_close", "eth_close", "matic_close"
print(coefficients)
print(intercept)

[ 1.79733951e-05  1.75434902e-03 -1.99618472e+00]
-0.4788748553383557


##### arb_price = -0.478875 + (1.797339e-05 * btc_price) + (1.754349e-03 * eth_price) - (1.996184 * matic_price)