# TRAINING DIFFERENT MODELS ON FINAL PREPROCESSED DATASET

### IMPORT PACKAGES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score




warnings.filterwarnings("ignore")
pd.pandas.set_option("display.max_columns", None)
# Create Dataframe
df = pd.read_csv("processed_crypto_with_target.csv")
# Print shape of dataset
print(df.shape)

(72946, 74)


### split x and y

In [2]:
# Split the data into features and target
X = df.drop(columns=['Volatility_7_target'])
y = df['Volatility_7_target']


### SELECTION OF BEST MODEL


In [3]:
from  sklearn.linear_model import LinearRegression,ElasticNet,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor

models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, n_jobs=-1),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "XGBRegressor": XGBRegressor(n_estimators=200, verbosity=0, n_jobs=-1),
    "CatBoostRegressor": CatBoostRegressor(n_estimators=200, verbose=0),
    "LGBMRegressor": LGBMRegressor(n_estimators=200, verbose=0),
    "SVR": SVR(),
    "KNeighborsRegressor": KNeighborsRegressor(),

}



In [4]:


def evaluate_models_time_series(X, y, models, test_size=0.2):
    """
    Evaluates regression models using time-series split.
    """
    # Determine split index
    split_index = int(len(X) * (1 - test_size))
    
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    model_names, rmse_scores, r2_scores = [], [], []

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print(f"{name} → RMSE: {rmse:.4f}, R2: {r2:.4f}")

        model_names.append(name)
        rmse_scores.append(rmse)
        r2_scores.append(r2)

    return pd.DataFrame({
        "Model": model_names,
        "RMSE": rmse_scores,
        "R2": r2_scores
    }).sort_values(by="RMSE")


In [5]:
report = evaluate_models_time_series(X, y, models)


LinearRegression → RMSE: 32.3658, R2: 0.2984
Ridge → RMSE: 32.3472, R2: 0.2992
Lasso → RMSE: 32.6487, R2: 0.2860
ElasticNet → RMSE: 33.2521, R2: 0.2594
DecisionTreeRegressor → RMSE: 39.4679, R2: -0.0434
RandomForestRegressor → RMSE: 27.7280, R2: 0.4850
GradientBoostingRegressor → RMSE: 27.7141, R2: 0.4855
AdaBoostRegressor → RMSE: 37.3831, R2: 0.0640
XGBRegressor → RMSE: 28.6328, R2: 0.4509
CatBoostRegressor → RMSE: 27.3257, R2: 0.4999
LGBMRegressor → RMSE: 27.0389, R2: 0.5103
SVR → RMSE: 39.5679, R2: -0.0486
KNeighborsRegressor → RMSE: 39.6628, R2: -0.0537


In [6]:
report.sort_values(by="RMSE")

Unnamed: 0,Model,RMSE,R2
10,LGBMRegressor,27.038944,0.510309
9,CatBoostRegressor,27.325675,0.499869
6,GradientBoostingRegressor,27.7141,0.485549
5,RandomForestRegressor,27.728025,0.485032
8,XGBRegressor,28.632783,0.450877
1,Ridge,32.347196,0.299165
0,LinearRegression,32.365848,0.298357
2,Lasso,32.648699,0.28604
3,ElasticNet,33.252123,0.259405
7,AdaBoostRegressor,37.383136,0.063962


### from our report lightgbm perfrom better now do its hyperparameter tuning using grid search cv

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
lgb_model = LGBMRegressor(random_state=42)

# Define hyperparameter grid
param_grid = {
    'num_leaves': [31, 50, 70],          # number of leaves in one tree
    'learning_rate': [0.1, 0.05, 0.01], # step size shrinkage
    'n_estimators': [100, 200, 500],     # number of boosting iterations
    'max_depth': [-1, 10, 20],           # max depth of trees (-1 means no limit)
    'min_child_samples': [20, 50, 100]  # min data in one leaf
}

# Set up time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',  # Optimize for RMSE

    verbose=2,
    return_train_score=True
)

# Fit GridSearchCV
grid_search.fit(X, y)

# Print best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE (Cross-Validation):", -grid_search.best_score_)



# Feature importance
importances = pd.Series(best_model.feature_importances_,)
print("\nFeature Importance:")
print(importances.sort_values(ascending=False))

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=31; total time=   0.0s
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=31; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=31; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=31; total time=   0.2s
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=31; total time=   0.3s
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=50; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=50; total time=   0.1s
[CV] END learning_rate=0.1, max_depth=-1, min_child_samples=20, n_estimators=100, num_leaves=50; total time=   0.2s
[CV] END

TypeError: got an unexpected keyword argument 'squared'

In [15]:
# Train final model with best parameters on full training data
best_model = LGBMRegressor(**grid_search.best_params_, random_state=42)
best_model.fit(X, y)

# Optional: Evaluate on a test set (if you have a separate test set)
# For time-series, split chronologically
train_size = int(0.8 * len(X))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Calculate RMSE and R²
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {np.sqrt(rmse)}")
print(f"Test R²: {r2}")

Test RMSE: 27.030114956069855
Test R²: 0.5106292354884812


In [16]:
import pickle as pd
# Save the model
with open("best_model.pkl", "wb") as f:
    pd.dump(best_model, f)
    

# final report LIGHTGBM PERFROM BETTER 

In [17]:
df.columns

Index(['open', 'high', 'low', 'close', 'volume', 'marketCap', 'Log_Return',
       'Volatility_14', 'Volatility_30', 'Liquidity_Ratio', 'Bollinger_Width',
       'TR', 'ATR_14', 'crypto_name_Aave', 'crypto_name_Algorand',
       'crypto_name_ApeCoin', 'crypto_name_Aptos', 'crypto_name_Avalanche',
       'crypto_name_BNB', 'crypto_name_Basic Attention Token',
       'crypto_name_Binance USD', 'crypto_name_Bitcoin',
       'crypto_name_Bitcoin Cash', 'crypto_name_Cardano', 'crypto_name_Casper',
       'crypto_name_Chain', 'crypto_name_Chainlink', 'crypto_name_Chiliz',
       'crypto_name_Cosmos', 'crypto_name_Cronos', 'crypto_name_Dai',
       'crypto_name_Decentraland', 'crypto_name_Dogecoin', 'crypto_name_EOS',
       'crypto_name_Elrond', 'crypto_name_Ethereum',
       'crypto_name_Ethereum Classic', 'crypto_name_FTX Token',
       'crypto_name_Filecoin', 'crypto_name_Flow', 'crypto_name_Hedera',
       'crypto_name_Huobi Token', 'crypto_name_Internet Computer',
       'crypto_name_Li