In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Passed straight through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_path)

def clean_data(data):
    """Fill missing values and drop rows that contain numeric outliers.

    Missing values are forward-filled first; numeric gaps that survive the
    forward fill (e.g. a NaN in the first row) are then replaced by the
    column median. Afterwards every row in which at least one numeric column
    has an absolute z-score of 3 or more is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified in place; every step below
        produces a new frame.

    Returns
    -------
    pandas.DataFrame
        The filled frame restricted to non-outlier rows (original index
        labels are kept).
    """
    # Check for missing values
    if data.isnull().values.any():
        # DataFrame.ffill() is the supported spelling; fillna(method='ffill')
        # is deprecated and emits a FutureWarning.
        data = data.ffill()  # Forward fill as an example, adjust based on domain knowledge
        data = data.fillna(data.median(numeric_only=True))
        print("Missing values filled.")

    numeric_data = data.select_dtypes(include=[np.number])

    # Check for outliers - using z-score for example
    z_scores = np.abs((numeric_data - numeric_data.mean()) / numeric_data.std())

    # A column with zero (or undefined) standard deviation yields NaN
    # z-scores. Testing `z < 3` would treat those NaNs as failures and drop
    # every row, so flag only rows that have a real z-score >= 3 instead.
    is_outlier = (z_scores >= 3).any(axis=1)
    return data[~is_outlier]

In [3]:
def train_model(data):
    """Fit a baseline random-forest regressor and print its hold-out errors.

    The four "last 7 days" columns are used as features and the two
    "next 5 days" columns as a joint (two-output) target. The rows are split
    80/20 with a fixed seed, a 100-tree forest is fitted on the training
    part, and MSE / MAE on the held-out part are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the six columns named below.

    Returns
    -------
    tuple
        ``(fitted_model, X_train, y_train)`` - the test split is not returned.
    """
    feature_columns = [
        'Days_Since_High_Last_7_Days',
        '%_Diff_From_High_Last_7_Days',
        'Days_Since_Low_Last_7_Days',
        '%_Diff_From_Low_Last_7_Days',
    ]
    target_columns = [
        '%_Diff_From_High_Next_5_Days',
        '%_Diff_From_Low_Next_5_Days',
    ]
    features = data[feature_columns]
    targets = data[target_columns]

    # Fixed random_state keeps the split (and the forest) repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    # Baseline model, default hyperparameters apart from the tree count.
    baseline = RandomForestRegressor(n_estimators=100, random_state=42)
    baseline.fit(X_train, y_train)

    # Score on the held-out rows; compute both metrics before reporting.
    held_out_pred = baseline.predict(X_test)
    squared_error = mean_squared_error(y_test, held_out_pred)
    absolute_error = mean_absolute_error(y_test, held_out_pred)

    print(f"Initial Model - Mean Squared Error: {squared_error}")
    print(f"Initial Model - Mean Absolute Error: {absolute_error}")

    return baseline, X_train, y_train

In [4]:
def tune_model(X_train, y_train):
    """Grid-search random-forest hyperparameters and return the best estimator.

    The grid below spans 3 * 4 * 3 * 3 * 2 = 216 combinations, each scored
    with 3-fold cross-validation on negative mean squared error, using all
    available cores (``n_jobs=-1``).

    Parameters
    ----------
    X_train, y_train
        Training features and targets handed to ``GridSearchCV.fit``.

    Returns
    -------
    The search's ``best_estimator_``.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=['sqrt', 'log2'],
    )

    searcher = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=3,
        verbose=2,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    print(f"Best Hyperparameters: {searcher.best_params_}")
    # The scorer is *negative* MSE, so flip the sign to report a plain MSE.
    print(f"Best Model Score: {-searcher.best_score_}")

    return searcher.best_estimator_

In [5]:
def predict_outcomes(model, feature_values):
    """Predict the outputs for a single observation.

    Parameters
    ----------
    model
        Any fitted object exposing ``predict``.
    feature_values : sequence
        The feature values of one sample, in the column order the model was
        trained with.

    Returns
    -------
    The first (and only) row of ``model.predict``'s result.
    """
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None:
        # The model recorded column names at fit time. Supplying them again
        # avoids scikit-learn's "X does not have valid feature names" warning
        # that a bare nested list would trigger.
        sample = pd.DataFrame([feature_values], columns=list(feature_names))
    else:
        sample = [feature_values]
    return model.predict(sample)[0]

In [6]:
# Read the dataset from disk and run it through the cleaning step
file_path = './crypto_data_with_metrics.csv'
data = clean_data(load_data(file_path))

# Fit the baseline model, then grid-search a tuned one on the same training split
model, X_train, y_train = train_model(data)
best_model = tune_model(X_train, y_train)

# Predict both targets for one hand-entered sample. Values follow the
# feature order used in train_model:
#   Days_Since_High_Last_7_Days, %_Diff_From_High_Last_7_Days,
#   Days_Since_Low_Last_7_Days,  %_Diff_From_Low_Last_7_Days
example_features = [1, -0.96, 2, 3.04]  # replace with real inputs as needed
prediction = predict_outcomes(best_model, example_features)
print(f"Predicted values: {prediction}")

  data = data.fillna(method='ffill')  # Forward fill as an example, adjust based on domain knowledge


Missing values filled.
Initial Model - Mean Squared Error: 20.68507292721813
Initial Model - Mean Absolute Error: 3.4823547571038667
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best Model Score: 21.85991998800142
Predicted values: [-7.25567124  3.99861909]


  _data = np.array(data, dtype=dtype, copy=copy,
