In [1]:
# Imports
import os
import numpy as np
import pandas as pd

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import bokeh
import bokeh.plotting
import bokeh.models
import bokeh.layouts
# NOTE(review): `bokeh.io` is used here (and by later cells) but is never
# imported explicitly; it presumably becomes available as a side effect of
# importing `bokeh.plotting` -- consider adding `import bokeh.io` to be safe.
# Configure Bokeh to display its output in the notebook.
bokeh.io.output_notebook()

# Modelling
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from scipy.stats import randint

In [2]:
# Directory that holds the cleaned input files
data_path = "../data/clean/"

# Read the CSV and discard its 'label' column in a single chained step
relative_known_function_file = os.path.join(data_path, "relative_OG_with_metadata.csv")
OG_relative_df = pd.read_csv(relative_known_function_file).drop(columns='label')

In [16]:
# Seed shared by the train/validation split and the forest so that a fresh
# "Restart & Run All" reproduces the same metrics and plots.
RANDOM_SEED = 42

# Column layout of OG_relative_df as used below: the predictors are the
# columns before FEATURE_END, and every column from TARGET_START onwards is
# modelled in turn as a regression target.
# NOTE(review): columns 8392-8396 are used as neither predictor nor target --
# presumably non-numeric metadata; confirm against the CSV.
FEATURE_END = 8392
TARGET_START = 8397

# Per-target validation metrics, in the same order as the target columns.
r_squareds = []
mean_absolute_errors = []
# NOTE: despite its name, this list receives the *root* mean squared error
# (see the append below). The name is kept unchanged in case other cells
# reference it.
mean_squared_errors = []
# One observed-vs-predicted scatter figure per target.
plots = []
for variable in OG_relative_df.iloc[:, TARGET_START:].columns:
    # Only train on samples where we have data for this target
    df = OG_relative_df[OG_relative_df[variable].notna()]
    y = df[variable]
    X = df.iloc[:, :FEATURE_END]
    # Split into training (80%) and validation (20%) sets
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED
    )
    # Fit RandomForestRegressor
    regr = RandomForestRegressor(
        n_estimators=500, n_jobs=-1, random_state=RANDOM_SEED
    )
    regr.fit(X_train, y_train)
    # Predict the target variable on the validation set
    y_pred = regr.predict(X_val)
    # Save metrics
    r2 = metrics.r2_score(y_val, y_pred)
    mae = metrics.mean_absolute_error(y_val, y_pred)
    mse = metrics.mean_squared_error(y_val, y_pred)
    rmse = mse**0.5
    r_squareds.append(r2)
    mean_absolute_errors.append(mae)
    mean_squared_errors.append(rmse)

    # Build plot: observed (x) against predicted (y) validation values
    p = bokeh.plotting.figure(
        frame_height=200,
        frame_width=200,
        x_axis_label='Observed',
        y_axis_label='Predicted',
        title=f"{variable}, R² = {r2:.3f}"
    )
    # `scatter` draws circle markers by default; the bare `circle(x, y)` call
    # is deprecated in recent Bokeh releases.
    p.scatter(y_val, y_pred)
    # Identity line (y = x): points on it are perfect predictions
    slope = bokeh.models.Slope(gradient=1, y_intercept=0,
                  line_color='darkorange', line_width=2)

    p.add_layout(slope)
    plots.append(p)

In [17]:
# Lay the figures collected in `plots` out three per row, then display the grid
grid = bokeh.layouts.gridplot(children=plots, ncols=3)
bokeh.io.show(grid)

In [18]:
# Set path to export plot
# NOTE: this rebinds `data_path`, which earlier pointed at the data directory;
# from here on it refers to the plots directory.
data_path = "../plots"
# Create the output directory if it is missing so the exports below do not
# fail on a fresh checkout.
os.makedirs(data_path, exist_ok=True)

# Vector (SVG) copy of the grid
plot_file = os.path.join(data_path, "OG_regression.svg")
bokeh.io.export_svg(grid, filename=plot_file)

# Raster (PNG) copy of the grid; as the last expression of the cell, the
# call's return value is what the notebook displays
plot_file = os.path.join(data_path, "OG_regression.png")
bokeh.io.export_png(grid, filename=plot_file)

'/Users/victoriac/git/20440_TARA/plots/OG_regression.png'