## De-BTCing

In [None]:
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import seaborn as sns

warnings.filterwarnings('ignore')

# Load the raw price history; only the coin symbol, the timestamp and the
# open/close prices are used by the rest of the notebook.
df = pd.read_csv('history.csv')
df = df[['Coin', 'dateTime', 'open', 'close']]

# Rolling window, in rows, used to smooth the percentage changes
# (one week if the history is daily -- TODO confirm the sampling interval).
window_size = 7

# Collect one frame per coin and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously processed row.
coin_frames = []
for coin in tqdm(df['Coin'].unique(), desc="Processing coins"):
    # Explicit copy so the new columns are not written to a slice of `df`.
    temp_df = df[df.Coin == coin].copy()

    # Row-over-row percentage change, rounded to two decimals.
    temp_df['open_pct'] = np.round(temp_df['open'].pct_change() * 100, 2)
    temp_df['close_pct'] = np.round(temp_df['close'].pct_change() * 100, 2)

    # Moving-average smoothing of the percentage changes.
    temp_df['open_pct_smooth'] = temp_df['open_pct'].rolling(window=window_size, min_periods=1).mean()
    temp_df['close_pct_smooth'] = temp_df['close_pct'].rolling(window=window_size, min_periods=1).mean()

    # Where the smoothed value is NaN, fall back to the unsmoothed value.
    temp_df['open_pct_smooth'] = temp_df['open_pct_smooth'].fillna(temp_df['open_pct'])
    temp_df['close_pct_smooth'] = temp_df['close_pct_smooth'].fillna(temp_df['close_pct'])

    coin_frames.append(temp_df)

# pd.concat raises on an empty list, so keep the empty-frame result for a
# history file that contains no coins.
final_df = pd.concat(coin_frames) if coin_frames else pd.DataFrame()

# High Correlation Coins

In [None]:
from datetime import timedelta

# Look-back window in days, ending at the most recent timestamp in final_df.
n_days = 60
# Reference coins that every other coin is correlated against.
independent = ['BTCUSDT', 'ETHUSDT', 'BNBUSDT', 'XRPUSDT']
# A coin is kept only if the magnitude of its correlation with at least one
# reference coin reaches this value.
correlation_threshold = 0.8

final_df['dateTime'] = pd.to_datetime(final_df['dateTime'], format='%Y-%m-%d %H:%M:%S')
max_date = final_df.dateTime.max()
min_date = max_date - timedelta(days=n_days)

# Wide table of close_pct (one column per coin) inside the window, then the
# correlation of every coin with each reference coin.
correlation_df = final_df[['Coin', 'dateTime', 'close_pct']].dropna()
correlation_df = correlation_df[correlation_df.dateTime > min_date]
correlation_df = correlation_df.pivot_table(index='dateTime', columns=['Coin'], values='close_pct')
correlation_df = correlation_df.corr()[independent]

# Strongest BTC correlation first; coins with an undefined correlation are dropped.
correlation_df = correlation_df.sort_values('BTCUSDT', ascending=False).dropna()

is_highly_correlated = (correlation_df.abs() >= correlation_threshold).any(axis=1)
correlation_df = correlation_df[is_highly_correlated]
correlation_df

In [None]:
def clean_and_prepare_data(analysis_df):
    """Return an independent copy of ``analysis_df`` without any row that contains NaN."""
    return analysis_df.dropna().copy()

def extract_btc_data(analysis_df):
    """Return the BTCUSDT rows of ``analysis_df`` without the ``Coin``, ``open`` and ``close`` columns.

    The input frame is left untouched; a new frame is returned.
    """
    is_btc = analysis_df.Coin == 'BTCUSDT'
    # Non-inplace drop: an inplace drop on the filtered selection is the
    # chained-assignment pattern pandas warns about.
    return analysis_df[is_btc].drop(columns=['Coin', 'open', 'close'])

def extract_nonbtc_data(analysis_df):
    """Return every row whose ``Coin`` is not BTCUSDT, without the ``open`` and ``close`` columns."""
    not_btc = analysis_df.Coin != 'BTCUSDT'
    return analysis_df[not_btc].drop(columns=['open', 'close'])

def merge_btc_nonbtc_data(nonbtc, btc):
    """Inner-join ``nonbtc`` with ``btc`` on ``dateTime``.

    Columns present in both frames keep their name for the non-BTC side and
    get a ``_btc`` suffix for the BTC side.
    """
    return pd.merge(nonbtc, btc, how='inner', on='dateTime', suffixes=('', '_btc'))

def calculate_debtcfied_movements(joined_df):
    """Add BTC-adjusted open/close changes to ``joined_df`` and return it.

    For each of ``open`` and ``close`` the BTC percentage change
    (``<price>_pct_btc``) is subtracted from the coin's own percentage change
    (``<price>_pct``) and stored as ``de_btcfied_<price>``. The frame is
    modified in place; the same object is returned.
    """
    for price in ('open', 'close'):
        own_change = joined_df[f'{price}_pct']
        btc_change = joined_df[f'{price}_pct_btc']
        joined_df[f'de_btcfied_{price}'] = own_change - btc_change
    return joined_df


def plot_debtcfied_comparison(joined_df, selected_coins, n_days):
    """Plot the BTC-adjusted close change of several coins on one set of axes.

    Parameters
    ----------
    joined_df : DataFrame
        Must contain ``Coin``, ``dateTime`` and ``de_btcfied_close`` columns.
    selected_coins : iterable
        Coin symbols to draw, one line each.
    n_days : int
        Number of trailing rows per coin to plot (``DataFrame.tail``).

    Returns
    -------
    None
        The figure is created through pyplot and left open for the caller
        (or the notebook) to display. Note that the 'seaborn-v0_8-darkgrid'
        style is applied globally via ``plt.style.use``.
    """
    plt.style.use('seaborn-v0_8-darkgrid')
    fig, ax = plt.subplots(figsize=(10, 6))

    for coin in selected_coins:
        plotdata = joined_df[joined_df.Coin == coin].tail(n_days)

        # Convert into a local series rather than writing back into the
        # `.tail()` slice, which would be a chained assignment.
        dates = plotdata['dateTime']
        if not pd.api.types.is_datetime64_any_dtype(dates):
            dates = pd.to_datetime(dates)

        ax.plot(dates, plotdata['de_btcfied_close'], linewidth=2, label=coin)

    ax.set_title('BTC-Independent Price Movement Comparison', fontsize=14)
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('De-BTCfied Close (%)', fontsize=12)

    # Date formatting and styling go through the explicit axes/figure objects
    # instead of pyplot's implicit "current axes" state.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.tick_params(axis='x', labelrotation=45)

    ax.legend(title='Coins', loc='best')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()

# Main execution: build the BTC-adjusted data set from final_df.
n_days = 60

# Drop incomplete rows, then split into the BTC series and everything else.
analysis = clean_and_prepare_data(final_df)
btc = extract_btc_data(analysis)
nonbtc = extract_nonbtc_data(analysis)

# Pair every non-BTC row with the BTC row of the same timestamp and subtract
# the BTC percentage change.
joined = calculate_debtcfied_movements(merge_btc_nonbtc_data(nonbtc, btc))
joined.head()


In [None]:

def filter_coins_by_performance_and_data(joined_df, n_days, min_date, min_datapoints=10):
    """Summarise the BTC-adjusted close change of every coin with enough recent data.

    A coin qualifies when it has more than ``min_datapoints`` rows with
    ``dateTime >= min_date`` (which also implies more than ``min_datapoints``
    rows overall). For each qualifying coin the statistics are computed over
    its last ``n_days`` rows (``DataFrame.tail``).

    Parameters
    ----------
    joined_df : DataFrame
        Must contain ``Coin``, ``dateTime`` and ``de_btcfied_close`` columns.
    n_days : int
        Number of trailing rows per coin used for the statistics.
    min_date : timestamp-like
        Lower bound (inclusive) of the window used for the data-count check.
    min_datapoints : number, default 10
        A coin needs strictly more rows than this inside the window.

    Returns
    -------
    DataFrame
        One row per qualifying coin, in order of first appearance, with the
        columns ``coin, coin_mean, coin_sd, coin_n, min_date, max_date``.
        The columns are present even when no coin qualifies.
    """
    columns = ['coin', 'coin_mean', 'coin_sd', 'coin_n', 'min_date', 'max_date']

    # Count the in-window rows of every coin in one pass instead of
    # re-filtering the whole frame once per coin.
    recent_counts = joined_df.loc[joined_df.dateTime >= min_date, 'Coin'].value_counts()

    stats = []
    # sort=False keeps the coins in order of first appearance.
    for coin, coin_rows in joined_df.groupby('Coin', sort=False):
        if recent_counts.get(coin, 0) <= min_datapoints:
            continue
        coin_data = coin_rows.tail(n_days)
        stats.append({
            'coin': coin,
            'coin_mean': coin_data['de_btcfied_close'].mean(),
            'coin_sd': coin_data['de_btcfied_close'].std(),
            'coin_n': len(coin_data),
            'min_date': coin_data.dateTime.min(),
            'max_date': coin_data.dateTime.max(),
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(stats, columns=columns)

def conduct_hypothesis_test(stats_df, threshold=0, sided=1, alpha=0.05):
    """
    Run a one-sample t-test of each coin's mean change against a threshold.

    The test is computed from summary statistics: the t-statistic is
    ``(mean - threshold) / (sd / sqrt(n))`` with ``n - 1`` degrees of freedom.
    A summary of the significant coins is printed.

    Parameters:
    - stats_df: DataFrame with coin statistics (coin, coin_mean, coin_sd, coin_n)
    - threshold: The threshold value to test against (default 0)
    - sided: 1 for one-sided test (mean > threshold); any other value runs
      the two-sided test
    - alpha: Significance level (default 0.05)

    Returns:
    - DataFrame with one row per coin and the columns coin, mean, threshold,
      n, t_statistic, p_value, significant, test_type, alpha. The columns are
      present even when stats_df has no rows.
    """
    from scipy import stats as scipy_stats

    # Decided once up front so the summary header can be printed even when
    # stats_df is empty (the label used to be assigned only inside the loop).
    one_sided = sided == 1
    test_type = "one-sided (mean > threshold)" if one_sided else "two-sided"

    result_columns = ['coin', 'mean', 'threshold', 'n', 't_statistic',
                      'p_value', 'significant', 'test_type', 'alpha']
    results = []

    for _, row in stats_df.iterrows():
        mean = row['coin_mean']
        n = row['coin_n']
        t_stat = (mean - threshold) / (row['coin_sd'] / np.sqrt(n))

        # The survival function sf(t) equals 1 - cdf(t) but keeps precision
        # for very small p-values.
        if one_sided:
            # H0: mean <= threshold, H1: mean > threshold
            p_value = scipy_stats.t.sf(t_stat, df=n - 1)
        else:
            # H0: mean = threshold, H1: mean != threshold
            p_value = 2 * scipy_stats.t.sf(abs(t_stat), df=n - 1)

        results.append({
            'coin': row['coin'],
            'mean': mean,
            'threshold': threshold,
            'n': n,
            't_statistic': t_stat,
            'p_value': p_value,
            'significant': p_value < alpha,
            'test_type': test_type,
            'alpha': alpha,
        })

    results_df = pd.DataFrame(results, columns=result_columns)

    # Print summary
    print(f"\nHypothesis Test Results ({test_type}):")
    print(f"Threshold: {threshold}")
    print(f"Significance level: {alpha}")
    print("-" * 60)

    # Only significant coins are listed, each flagged with "***".
    significant_results = [res for res in results if res['significant']]
    for res in significant_results:
        print(f"{res['coin']:10} | Mean: {res['mean']:7.3f} | t-stat: {res['t_statistic']:7.3f} | p-value: {res['p_value']:7.4f} ***")

    significant_coins = [res['coin'] for res in significant_results]
    if significant_coins:
        print(f"\nCoins with significant evidence (p < {alpha}): {', '.join(significant_coins)}")
    else:
        print(f"\nNo coins show significant evidence at α = {alpha}")

    return results_df
from datetime import timedelta

# Parse the timestamps and derive the look-back window that ends at the most
# recent row of `joined`.
joined['dateTime'] = pd.to_datetime(joined['dateTime'], format='%Y-%m-%d %H:%M:%S')
max_date = joined.dateTime.max()
min_date = max_date - timedelta(days=n_days)

# Summary statistics for coins with more than 0.40 * n_days rows inside the
# window, followed by a one-sided test of "mean > 0" at alpha = 0.15.
stats = filter_coins_by_performance_and_data(
    joined,
    n_days,
    min_date=min_date,
    min_datapoints=0.40 * n_days,
)
statistics = conduct_hypothesis_test(stats, threshold=0, sided=1, alpha=0.15)
statistics.head()
 

In [None]:
# Coins whose p-value from the hypothesis test is below 0.11.
selected_coins = statistics.loc[statistics.p_value < 0.11, 'coin'].values

# Figure 1: raw close percentage change of the selected coins, with BTC drawn
# as a thick, fully opaque reference line.
plt.figure(figsize=(12, 6))
for coin in [*selected_coins, 'BTCUSDT']:
    coin_data = final_df[final_df.Coin == coin].tail(n_days)
    line_kwargs = {'alpha': 1, 'linewidth': 3} if coin == 'BTCUSDT' else {'alpha': 0.7}
    plt.plot(coin_data['dateTime'], coin_data['close_pct'], label=f'{coin}', **line_kwargs)

plt.title('Close Percentage for Selected Coins', fontsize=14)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Close Percentage (%)', fontsize=12)
plt.legend(loc='best')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Figure 2: the same coins after subtracting the BTC percentage change.
plt.figure(figsize=(12, 6))
for coin in selected_coins:
    coin_data = joined[joined.Coin == coin].tail(n_days)
    plt.plot(coin_data['dateTime'], coin_data['de_btcfied_close'], label=f'{coin}', alpha=0.7)

plt.title('De-BTCfied Close for Selected Coins', fontsize=14)
plt.xlabel('Date', fontsize=12)
plt.ylabel('De-BTCfied Close (%)', fontsize=12)
plt.legend(loc='best')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Underlying rows for the selected coins.
joined[joined.Coin.isin(selected_coins)]

In [None]:
# Overlaid density histograms of the BTC-adjusted close change, one per
# selected coin.
plt.figure(figsize=(10, 6))

for coin in selected_coins:
    # Last n_days non-missing observations of this coin.
    coin_data = joined.loc[joined.Coin == coin, 'de_btcfied_close'].dropna().tail(n_days)
    plt.hist(coin_data, bins=8, alpha=0.5, label=coin, density=True)

# Titles and axis labels
plt.title('Distribution of BTC-Independent Price Movements', fontsize=14)
plt.xlabel('De-BTCfied Close (%)', fontsize=12)
plt.ylabel('Density', fontsize=12)

# Legend, grid and layout
plt.legend(title='Coins', loc='best')
plt.grid(True, alpha=0.3)
plt.tight_layout()

plt.show()


# Regression Approach

In [None]:
import statsmodels.api as sm
from scipy.stats import ttest_1samp
from tqdm import tqdm

# independent = ['BTCUSDT','ETHUSDT','BNBUSDT']
# Parse final_df's OWN timestamps. Assigning joined['dateTime'] here would
# align the two frames on their row index, and `joined` (the result of a
# merge) is indexed independently of final_df, so rows would receive the
# timestamps of unrelated rows or NaT.
final_df['dateTime'] = pd.to_datetime(final_df['dateTime'], format='%Y-%m-%d %H:%M:%S')

# Step 1: Prepare the data for regression analysis
final_df.dropna(inplace=True)

# Market factors: one close_pct column per reference coin, inner-joined on
# the timestamp so that only timestamps present for every factor remain.
independent_variables = pd.DataFrame()
for coin_symbol in independent:
    temp_independent = (
        final_df.loc[final_df.Coin == coin_symbol, ['dateTime', 'close_pct']]
        .rename(columns={'close_pct': coin_symbol})
    )
    if independent_variables.empty:
        independent_variables = temp_independent
    else:
        independent_variables = pd.merge(independent_variables, temp_independent, on='dateTime', how='inner')

def filter_coins_by_data(joined_df, n_days, min_date, min_datapoints=10):
    """Return the coins that have enough recent rows.

    A coin is kept when it has more than ``min_datapoints`` rows with
    ``dateTime >= min_date``; that also implies more than ``min_datapoints``
    rows overall, so no separate total-count check is needed.

    Parameters
    ----------
    joined_df : DataFrame
        Must contain ``Coin`` and ``dateTime`` columns.
    n_days : int
        Accepted for interface compatibility; not used by the filter.
    min_date : timestamp-like
        Lower bound (inclusive) of the window.
    min_datapoints : number, default 10
        A coin needs strictly more in-window rows than this.

    Returns
    -------
    list
        Qualifying coin symbols in order of first appearance in ``joined_df``.
    """
    # One pass over the frame instead of two full scans per coin.
    recent_counts = joined_df.loc[joined_df.dateTime >= min_date, 'Coin'].value_counts()
    return [
        coin
        for coin in joined_df.Coin.unique()
        if recent_counts.get(coin, 0) > min_datapoints
    ]


from datetime import timedelta

max_date = joined.dateTime.max()
min_date = max_date - timedelta(days=n_days)
filtered_coins = filter_coins_by_data(
    final_df[~final_df.Coin.isin(independent)],
    n_days=n_days,
    min_date=min_date,
    min_datapoints=10,
)

# Per-coin pieces are collected in lists and assembled once after the loop;
# concatenating inside the loop re-copies all earlier rows on every coin.
residual_frames = []
result_rows = []
for coin in tqdm(filtered_coins):
    target_coin_data = final_df[final_df.Coin == coin][['dateTime', 'close_pct']].rename(columns={'close_pct': coin})
    # Keep only timestamps for which the coin and every market factor have data.
    regression_data = pd.merge(target_coin_data, independent_variables, on='dateTime', how='inner')

    # A fit needs more observations than regressors; with fewer (or none)
    # OLS either fails or fits perfectly, so such coins are skipped.
    if len(regression_data) <= len(independent):
        continue

    # Target: the coin's own close_pct. Regressors: the market factors.
    y = regression_data[coin]
    X = regression_data[independent]
    # X = sm.add_constant(X)  # Adds intercept

    # No constant is added, so the regression is fitted without an intercept
    # and the residual mean is not forced to zero.
    model = sm.OLS(y, X).fit()

    # Residuals are the part of the coin's change not explained by the
    # factors; test whether their mean is greater than zero.
    residuals = model.resid
    t_stat, p_value = ttest_1samp(residuals, 0, alternative='greater')

    residual_frames.append(pd.DataFrame({
        'dateTime': regression_data['dateTime'],
        'coin': coin,
        'residual': residuals
    }))

    result_row = {
        'coin': coin,
        'r_squared': model.rsquared,
        'adj_r_squared': model.rsquared_adj,
        'f_statistic': model.fvalue,
        'f_pvalue': model.f_pvalue,
        'aic': model.aic,
        'bic': model.bic,
        'residual_mean': residuals.mean(),
        'residual_std': residuals.std(),
        't_statistic': t_stat,
        'p_value': p_value,
    }
    # Coefficient and p-value of every market factor.
    for var in independent:
        result_row[f'{var}_coeff'] = model.params[var]
        result_row[f'{var}_pvalue'] = model.pvalues[var]
    result_rows.append(result_row)

residuals_df = pd.concat(residual_frames, ignore_index=True) if residual_frames else pd.DataFrame()
results = pd.DataFrame(result_rows)
    
    

In [None]:
# Side-by-side close_pct of ORCAUSDT and BTCUSDT (one column per coin),
# limited to timestamps after min_date.
comparison = (
    final_df.loc[final_df.Coin.isin(['ORCAUSDT', 'BTCUSDT']), ['Coin', 'dateTime', 'close_pct']]
    .pivot_table(index='dateTime', columns='Coin', values='close_pct')
)
comparison = comparison.loc[comparison.index > min_date]

# Correlation matrix of the two series, then both series over time.
print(comparison.corr())
comparison.plot(figsize=(15, 5))

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Coins to draw and one colour per coin.
unique_coins = selected_coins
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_coins)))

# One figure with two panels side by side. plt.subplots creates its own
# figure, so no separate plt.figure() call is made beforehand (that would
# leave an extra, empty figure behind).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

for coin, color in zip(unique_coins, colors):
    coin_data = residuals_df[residuals_df['coin'] == coin]
    # Left panel: residuals over time.
    ax1.scatter(coin_data['dateTime'], coin_data['residual'],
                color=color, label=coin, alpha=0.7, s=20)
    # Right panel: density histogram of the same residuals.
    ax2.hist(coin_data['residual'], bins=30, alpha=0.5,
             color=color, label=coin, density=True)

ax1.set_xlabel('Date Time')
ax1.set_ylabel('Residuals')
ax1.set_title('Residuals by Coin Over Time')
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.tick_params(axis='x', rotation=45)

ax2.set_xlabel('Residual Value')
ax2.set_ylabel('Density')
ax2.set_title('Distribution of Residuals by Coin')
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import ttest_1samp
import scipy.stats as scipy_stats
import numpy as np

# Per-coin mean and standard deviation of the regression residuals, plus the
# number of distinct timestamps.
result = (
    residuals_df
    .groupby('coin')
    .agg({
        'residual': ['mean', 'std'],
        'dateTime': 'nunique'
    })
)
# Flatten the two-level column index, e.g. ('residual', 'mean') -> 'residual_mean'.
result.columns = ['_'.join(col).strip() for col in result.columns.values]

# One-sided, one-sample t-test per coin (H0: mean <= 0, H1: mean > 0).
# ttest_1samp computes the same statistic as (mean - 0) / (sd / sqrt(n))
# with n - 1 degrees of freedom, without the precision loss of `1 - cdf`.
residuals_by_coin = residuals_df.groupby('coin')['residual']
p_values = [
    ttest_1samp(residuals_by_coin.get_group(coin), 0, alternative='greater').pvalue
    for coin in result.index
]

result_with_pvalues = result.copy()
result_with_pvalues['p_value'] = p_values
result_with_pvalues

In [None]:

# Full summary table of the most recently fitted `model`.
summary = model.summary()
print(summary)

# Headline fit statistics, one per line.
for label, value in [
    ("R-squared:", model.rsquared),
    ("Adjusted R-squared:", model.rsquared_adj),
    ("F-statistic:", model.fvalue),
    ("F-statistic p-value:", model.f_pvalue),
    ("AIC:", model.aic),
    ("BIC:", model.bic),
]:
    print(label, value)

# Coefficient tables, each preceded by a blank line and a heading.
for heading, table in [
    ("\nCoefficients:", model.params),
    ("\nP-values:", model.pvalues),
    ("\nStandard errors:", model.bse),
    ("\nConfidence intervals:", model.conf_int()),
]:
    print(heading)
    print(table)

# Residual statistics
print("\nResidual statistics:")
print("Mean of residuals:", model.resid.mean())
print("Standard deviation of residuals:", model.resid.std())

# BTC LAGGED ANALYSIS

In [None]:
# Reference coins that actually have rows in final_df. Computed straight from
# final_df because `prediction_vars` is only assigned in a later cell, so
# referencing it here fails when the notebook is run top to bottom.
final_df[final_df.Coin.isin(independent)].Coin.unique()

In [None]:

# close_pct history restricted to the reference ("independent") coins.
prediction_vars = final_df.loc[final_df.Coin.isin(independent), ['dateTime', 'Coin', 'close_pct']]

def make_lagged_columns(df, lag_from = 1, lag_to = 5):
    """Build lagged ``close_pct`` features for every coin in ``df``.

    The data is first reshaped to one row per ``dateTime`` and one column per
    coin, so every lag is aligned on the timestamp. Coins that are missing a
    timestamp get NaN there instead of raising a length error or shifting the
    remaining values onto the wrong rows.

    Parameters
    ----------
    df : DataFrame
        Must contain ``dateTime``, ``Coin`` and ``close_pct`` columns. Rows
        are assumed to appear in chronological order; the output keeps the
        timestamps in order of first appearance.
    lag_from, lag_to : int
        Inclusive range of lags (in rows) to generate.

    Returns
    -------
    DataFrame
        A ``dateTime`` column followed by ``lag_<coin>_<i>`` columns, coin by
        coin in order of first appearance and by increasing lag.
    """
    date_order = pd.Index(df['dateTime'].unique(), name='dateTime')

    # One row per timestamp, one column per coin. If a (dateTime, Coin) pair
    # occurs more than once, only its first row is used.
    wide = (
        df.drop_duplicates(subset=['dateTime', 'Coin'])
        .pivot(index='dateTime', columns='Coin', values='close_pct')
        .reindex(date_order)
    )

    # Build all lag columns first and assemble them in a single concat;
    # inserting hundreds of columns one at a time fragments the frame.
    lagged = {
        f'lag_{coin}_{lag}': wide[coin].shift(lag)
        for coin in df['Coin'].unique()
        for lag in range(lag_from, lag_to + 1)
    }
    if not lagged:
        return pd.DataFrame({'dateTime': date_order.to_numpy()})

    return pd.concat(lagged, axis=1).reset_index()
# Lags 1 through 100 of close_pct for every reference coin.
lagged_features = make_lagged_columns(prediction_vars, lag_from=1, lag_to=100)
lagged_features

In [None]:

# Attach the lagged reference-coin features to every coin's close_pct row.
# Missing values (unmatched timestamps, leading lags) are filled with 0.
joined_df = final_df[['Coin', 'dateTime', 'close_pct']].merge(lagged_features, on='dateTime', how='right').fillna(0)

# For every (coin, reference coin) pair: correlation of the coin's close_pct
# with each lag of the reference coin. Frames are collected in a list and
# concatenated once instead of growing `correlations` inside the loops.
correlation_frames = []
for coin in tqdm(joined_df.Coin.unique()):
    coin_rows = joined_df[joined_df.Coin == coin]
    for corr in prediction_vars.Coin.unique():
        # Exact prefix match so one symbol cannot pick up the lag columns of
        # another symbol that merely contains it.
        lag_columns = [col for col in coin_rows.columns if col.startswith(f'lag_{corr}_')]
        result = coin_rows[['close_pct'] + lag_columns].corr()['close_pct'].reset_index()

        # Drop the self-correlation row by name and any undefined (NaN)
        # correlation. Filtering on `close_pct < 1` would leave the
        # 'close_pct' row in place whenever its value is not exactly 1.0,
        # and the lag parsing below would then fail on it.
        keep = (result['index'] != 'close_pct') & result['close_pct'].notna()
        result = result[keep].copy()

        # 'lag_<coin>_<i>' -> '<i>' (kept as a string).
        result['index'] = result['index'].apply(lambda x: x.split('_')[2])
        result['Coin'] = coin
        result['Corr'] = corr
        correlation_frames.append(result)

correlations = pd.concat(correlation_frames) if correlation_frames else pd.DataFrame()
correlations


In [None]:
import matplotlib.pyplot as plt

# 2x2 grid of panels, one per reference coin.
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
fig.suptitle('Time Series Correlation Analysis', fontsize=16)

# Flat view of the grid so the panels can be addressed by a single index.
axes = axes.flatten()

for idx, corr in enumerate(['BTCUSDT', 'ETHUSDT', 'BNBUSDT', 'XRPUSDT']):
    panel = axes[idx]
    corr_data = correlations[correlations.Corr == corr]

    # Scatter each coin's lag correlations, skipping coins whose mean
    # correlation magnitude is not above 0.01.
    for coin in corr_data.Coin.unique():
        coin_data = corr_data[corr_data.Coin == coin]
        if not abs(coin_data.close_pct.mean()) > 0.01:
            continue
        panel.scatter(coin_data['index'], coin_data['close_pct'],
                      label=coin, alpha=0.1)

    panel.set_title(f'Correlation with {corr}')
    panel.set_xlabel('Lag')
    panel.set_ylabel('Correlation')
    panel.grid(True, alpha=0.3)
    # panel.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Keep every 5th x tick only.
    x_ticks = panel.get_xticks()
    panel.set_xticks(x_ticks[::5])

plt.tight_layout()
plt.show()

In [None]:
# Target coin for the prediction experiment.
coin = 'PENDLEUSDT'
coin_df = final_df[final_df.Coin == coin][['dateTime', 'close', 'close_pct']]

# Right join keeps one row per timestamp of the lagged features. Missing
# values are deliberately left as NaN: filling them with 0 would invent zero
# targets and zero lags, and the NaN mask applied before the train/test split
# would no longer remove the incomplete rows.
feature_df = coin_df.merge(lagged_features, on='dateTime', how='right')


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
from sklearn.preprocessing import StandardScaler
import numpy as np

# Select (or create) the MLflow experiment for this coin.
mlflow.set_experiment(f"Crypto_Price_Prediction_{coin}")

# Candidate estimators and the hyper-parameter grid searched for each one.
# Estimators that accept n_jobs are pinned to a single thread.
models_and_params = dict(
    OLS=dict(
        model=LinearRegression(),
        params={},
    ),
    DecisionTree=dict(
        model=DecisionTreeRegressor(random_state=42),
        params=dict(
            max_depth=[5, 10, 15],
            min_samples_split=[2, 5],
            min_samples_leaf=[1, 2],
        ),
    ),
    RandomForest=dict(
        model=RandomForestRegressor(random_state=42, n_jobs=1),
        params=dict(
            n_estimators=[50, 100],
            max_depth=[5, 10],
            min_samples_split=[2, 5],
        ),
    ),
    XGBoost=dict(
        model=xgb.XGBRegressor(random_state=42, verbosity=0, n_jobs=1),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[3, 6],
            learning_rate=[0.01, 0.1],
        ),
    ),
    LightGBM=dict(
        model=lgb.LGBMRegressor(random_state=42, verbosity=-1, n_jobs=1),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[3, 6],
            learning_rate=[0.01, 0.1],
        ),
    ),
)

# Target: the coin's close_pct. Features: every column whose name contains 'lag'.
y = feature_df['close_pct']
X = feature_df[[col for col in feature_df.columns if 'lag' in col]]

# Keep only rows where the target and every feature are present.
mask = ~(y.isna() | X.isna().any(axis=1))
y_clean = y[mask]
X_clean = X[mask]

# 80/20 split without shuffling: the first 80% of rows train, the last 20%
# test. (random_state has no effect when shuffle=False, so it is omitted.)
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, shuffle=False)

# Scaled copy of the features for models that need it. The scaler is fitted
# on the training rows only, so the scaling parameters carry no information
# from the test rows; X_scaled still covers all of X_clean.
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_clean)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Number of features: {X_train.shape[1]}")

from sklearn.model_selection import TimeSeriesSplit

# One summary dict per trained model.
results = []

# The rows are kept in their original order (the train/test split is not
# shuffled), so hyper-parameters are tuned with forward-chaining folds: each
# validation fold comes after the rows it was trained on. Plain K-fold would
# validate on rows that precede part of the training data.
cv_splitter = TimeSeriesSplit(n_splits=3)

# Grid search for each model
for model_name, model_config in models_and_params.items():
    print(f"\nTraining {model_name}...")

    with mlflow.start_run(run_name=f"{coin}_{model_name}"):
        # Log basic info
        mlflow.log_param("coin", coin)
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("n_features", X_train.shape[1])
        mlflow.log_param("train_samples", len(X_train))
        mlflow.log_param("test_samples", len(X_test))

        if model_config['params']:  # There are parameters to tune
            grid_search = GridSearchCV(
                model_config['model'],
                model_config['params'],
                cv=cv_splitter,
                scoring='neg_mean_squared_error',
                n_jobs=1,  # Single thread
                verbose=0
            )
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Log best parameters
            for param, value in grid_search.best_params_.items():
                mlflow.log_param(f"best_{param}", value)

            # best_score_ is a negated MSE; log it as a positive MSE.
            mlflow.log_metric("cv_score", -grid_search.best_score_)

        else:  # For OLS (no parameters to tune)
            best_model = model_config['model']
            best_model.fit(X_train, y_train)

        # Make predictions
        y_pred_train = best_model.predict(X_train)
        y_pred_test = best_model.predict(X_test)

        # Calculate metrics
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)
        train_mse = mean_squared_error(y_train, y_pred_train)
        test_mse = mean_squared_error(y_test, y_pred_test)
        train_mae = mean_absolute_error(y_train, y_pred_train)
        test_mae = mean_absolute_error(y_test, y_pred_test)

        # Log metrics
        logged_metrics = {
            "train_r2": train_r2,
            "test_r2": test_r2,
            "train_mse": train_mse,
            "test_mse": test_mse,
            "train_mae": train_mae,
            "test_mae": test_mae,
            "train_rmse": np.sqrt(train_mse),
            "test_rmse": np.sqrt(test_mse),
        }
        for metric_name, metric_value in logged_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Log model with input example to avoid warnings
        input_example = X_train.iloc[:5] if hasattr(X_train, 'iloc') else X_train[:5]

        if model_name == 'XGBoost':
            mlflow.xgboost.log_model(best_model, "model", input_example=input_example)
        elif model_name == 'LightGBM':
            mlflow.lightgbm.log_model(best_model, "model", input_example=input_example)
        else:
            mlflow.sklearn.log_model(best_model, "model", input_example=input_example)

        # Store results
        results.append({
            'model': model_name,
            'train_r2': train_r2,
            'test_r2': test_r2,
            'train_mse': train_mse,
            'test_mse': test_mse,
            'train_mae': train_mae,
            'test_mae': test_mae,
            'best_model': best_model
        })

        print(f"{model_name} - Test R²: {test_r2:.4f}, Test MSE: {test_mse:.4f}, Test MAE: {test_mae:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Kernel density estimate of the residuals of the current `model`.
model.resid
sns.kdeplot(model.resid)
plt.xlabel('Residuals')
plt.ylabel('Density')
plt.title('Kernel Density Plot of Residuals')
plt.show()