In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

def preprocess(df, split_column, drop_first=True):
    '''
    Does the usual preprocessing steps on a pandas dataframe and splits off the target.

    Steps, in the order they are applied:
    1. One-hot encoding via pd.get_dummies
    2. Standardization of every resulting column with StandardScaler
       (split_column is still part of the frame at this point, so the
       returned target is standardized as well)
    3. Drop every row that contains an N/A value
    4. Split the data into x & y along split_column

    Parameters:
        df: pandas dataframe holding features and target; a new frame is
            built from it, df itself is not reassigned for the caller.
        split_column: label of the target column. It has to exist after
            one-hot encoding, i.e. it must not be a column that
            get_dummies replaces by indicator columns.
        drop_first: forwarded to pd.get_dummies (default True).

    Returns:
        (x, y) where x is the frame without split_column and y is the
        split_column series. Both come from the same cleaned frame, so
        they share one index and are aligned row by row.

    Raises:
        KeyError: if split_column is not a column of the encoded frame.
    '''
    encoded = pd.get_dummies(df, drop_first=drop_first)
    if split_column not in encoded.columns:
        # Fail before the (potentially slow) scaling step and with a message
        # that points at the most likely cause.
        raise KeyError(
            f'{split_column!r} is not a column after one-hot encoding; '
            'it is either missing from the dataframe or was replaced by dummy columns'
        )

    stds = StandardScaler().set_output(transform='pandas')
    cleaned = stds.fit_transform(encoded).dropna()

    x = cleaned.drop(split_column, axis=1)
    # Take the target straight from the cleaned frame. Re-selecting it by
    # x.index is redundant for unique labels and duplicates rows when the
    # index contains repeated labels, which would make y longer than x.
    y = cleaned[split_column]
    return (x, y)

def eliminate_outliers(X, y):
    '''
    Removes outliers using the IQR method, judged on the target only.

    A row is kept when its y value lies inside the inclusive range
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], where Q1 / Q3 are the 25% / 75%
    quantiles of y. The same boolean mask is applied to X and y, so the
    two stay aligned. Values of X play no part in the decision.

    Returns (X, y) restricted to the rows that were kept.
    '''
    first_quartile, third_quartile = y.quantile(0.25), y.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    # between() is inclusive on both ends, i.e. (y >= low) & (y <= high)
    keep = y.between(first_quartile - whisker, third_quartile + whisker)
    return (X[keep], y[keep])

def great_big_preprocessing_chain(df, split_column, test_size=0.2, random_state=42, n_components=None):
    '''
    Takes care of all the preprocessing stuff where it can get garbage collected.

    Runs, in order: preprocess() -> eliminate_outliers() -> train_test_split()
    -> PCA fitted on the training split and applied to both splits. All
    intermediate frames are locals of this function, so nothing but the
    four returned objects stays referenced afterwards.

    Note: preprocess() and eliminate_outliers() run on the full data set,
    i.e. before the train/test split, so rows that later end up in the test
    split take part in the scaling and in the outlier bounds.

    Parameters:
        df: pandas dataframe with features and target.
        split_column: label of the target column, forwarded to preprocess().
        test_size: forwarded to train_test_split (default 0.2).
        random_state: forwarded to train_test_split (default 42).
        n_components: forwarded to PCA (default None, which is what a bare
            PCA() uses).

    Returns:
        (X_train, X_test, y_train, y_test) where the X parts are the outputs
        of pca.transform and the y parts come straight from the split.
    '''
    (X, y) = preprocess(df, split_column)
    (X, y) = eliminate_outliers(X, y)
    (X_train, X_test, y_train, y_test) = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The projection is learned from the training rows only and then reused
    # unchanged for the test rows.
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    return (X_train, X_test, y_train, y_test)

In [2]:
import numpy as np
import sqlite3

In [3]:
# Read the CSV and push it through the whole preprocessing chain in one
# expression: the raw dataframe is never bound to a notebook-level name, only
# the four split results are. 'goals' is the target column.
(X_train, X_test, y_train, y_test) = great_big_preprocessing_chain(
    pd.read_csv('resources/final_data.csv'),
    'goals',
)

In [4]:
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyper-parameter candidates for the forest: 2 sizes x 3 depth limits.
grid = {
    'n_estimators': [10, 100],
    'max_depth': [3, 4, 5],
}

# Grid search over a seeded random forest, parallelised with n_jobs=-1.
rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=grid,
    return_train_score=True,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# NOTE(review): rf is the GridSearchCV object itself, so cross_validate
# repeats the complete grid search inside every outer fold (nested CV).
# That is considerably more expensive than the single search above.
cv_results = cross_validate(rf, X_train, y_train, return_train_score=True)
R2_trainCV = cv_results['train_score'].mean()
R2_valid = cv_results['test_score'].mean()

# Hold-out predictions from the search fitted on the full training split.
predictions = rf.predict(X_test)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print('train R2 (CV) =', R2_trainCV, '  valid R2 =', R2_valid)
print()

# Scores of the fitted search object on the training and hold-out splits.
R2_train = rf.score(X_train, y_train)
R2_test = rf.score(X_test, y_test)
print('     train R2 =', R2_train, '    test R2 =', R2_test)
print('mse = ', mse, sep='')
print('mae = ', mae, sep='')

Exception ignored in: <function ResourceTracker.__del__ at 0x7f41115a3880>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7e2f370a7880>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/usr/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7f8381aab880>
Traceback (most recent call last):
  File "/usr/lib/python3.13/multiprocessing/reso

train R2 (CV) = 0.540061364581745   valid R2 = 0.44185433455741185

     train R2 = 0.5254639794546847     test R2 = -0.1701987770410558
mse = 0.256282397474278
mae = 0.3914499708983176
