In [1]:
import openml
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Fetch the OpenML dataset with id 665.
dataset = openml.datasets.get_dataset(665)

# get_data returns a 4-tuple; only the feature frame and the target series
# are kept, the remaining two entries are discarded.
target_name = dataset.default_target_attribute
X, y, _, _ = dataset.get_data(target=target_name, dataset_format="dataframe")

X.head()

Unnamed: 0,FM,LC,BK,SS,AG,YR
0,0,1,1,0,59,39
1,0,0,0,0,61,42
2,0,0,1,0,59,36
3,1,1,1,0,60,38
4,1,0,1,0,60,36


In [3]:
# Columns that get one-hot encoded; the other columns pass through unchanged.
categorical_cols = ['FM', 'LC', 'BK', 'SS']

X = (
    # One-hot encode, dropping the first level of each encoded column.
    pd.get_dummies(X, columns=categorical_cols, drop_first=True)
    # Force every column to a numeric dtype; unparseable values become NaN.
    .apply(pd.to_numeric, errors='coerce')
    # Z-score each column: subtract the column mean, divide by the column
    # standard deviation (pandas' default, i.e. the sample std).
    .pipe(lambda frame: (frame - frame.mean()) / frame.std())
)

In [4]:
# Preview the first rows of the target series before it is rescaled.
y.head()

0    20
1    12
2    15
3    15
4    25
Name: CD, dtype: uint8

In [5]:
# Coerce the target to a numeric dtype with a single vectorised call instead
# of invoking pd.to_numeric once per element through Series.apply; values
# that cannot be parsed become NaN (errors='coerce').
y = pd.to_numeric(y, errors='coerce')

# Standardise the target to zero mean and unit (sample) standard deviation.
# NOTE(review): the mean/std are computed over all rows, i.e. before the
# train/test split performed later in the notebook, so test rows influence
# the scaling -- consider computing them on the training portion only.
y = (y - y.mean()) / y.std()

In [6]:
# Seed the genetic search so that Restart & Run All reproduces the same
# engineered features; 42 matches the seed used for the train/test split.
# NOTE(review): the transformer is fitted on every row, i.e. before the
# train/test split, so the engineered features have seen the test targets --
# consider fitting on the training portion only.
transformer = SymbolicTransformer(random_state=42)
transformer.fit(X, y)

In [7]:
# Evaluate the evolved programs on the scaled features.
X_transformed = transformer.transform(X)

# Append the engineered columns to the right of the existing ones. Going
# through a plain array discards the column labels, so the resulting frame
# is indexed 0..n-1 along both axes.
# NOTE(review): X is overwritten here, so this cell is not safe to re-run
# without re-running the preprocessing cells first.
X = pd.DataFrame(np.concatenate([X, X_transformed], axis=1))
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.27678,0.79779,-0.567554,1.409395,1.088994,-0.661948,0.855991,0.855991,0.855991,0.855991,0.855991,0.855991,0.855991,0.855991,0.855991,0.855991
1,0.548931,1.012448,-0.567554,-0.704698,-0.912032,-0.661948,1.326218,1.326218,1.326218,1.326218,1.326218,1.326218,1.326218,1.326218,1.326218,1.326218
2,0.27678,0.583131,-0.567554,-0.704698,1.088994,-0.661948,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233
3,0.412855,0.726237,1.74996,1.409395,1.088994,-0.661948,0.736728,0.736728,0.736728,0.736728,0.736728,0.736728,0.736728,0.736728,0.736728,0.736728
4,0.412855,0.583131,1.74996,-0.704698,1.088994,-0.661948,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233,0.541233


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Seed the genetic search so the fitted program -- and therefore the reported
# score -- is reproducible; 42 matches the seed used for the train/test split.
est_gp = SymbolicRegressor(random_state=42)
est_gp.fit(X_train, y_train)

In [10]:
# Coefficient of determination of the fitted program on the held-out rows.
test_r2 = r2_score(y_test, est_gp.predict(X_test))
# Label typo fixed ("whit" -> "with"); output format is otherwise unchanged.
print("R2 Score of test samples with the model: ", test_r2)

R2 Score of test samples whit the model:  0.4252962901595214
