In [1]:
import openml
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [2]:
dataset = openml.datasets.get_dataset(706)

X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)

X.head()

Unnamed: 0,bsal,sal77,fsex,senior,age,educ
0,5040,12420.0,0,96,329,15
1,6300,12060.0,0,82,357,15
2,6000,15120.0,0,67,315,15
3,6000,16320.001,0,97,354,12
4,6000,12300.0,0,66,351,12


In [3]:
categorical_cols = ['fsex', 'educ']
X_processed = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
numerical_cols = ['bsal', 'sal77', 'senior', 'age']
scaler = StandardScaler()
X_processed[numerical_cols] = scaler.fit_transform(X_processed[numerical_cols])
X = X_processed.values.astype(np.float64)

In [4]:
y.head()

0    14.0
1    72.0
2    35.5
3    24.0
4    56.0
Name: exper, dtype: float64

In [5]:
y = y.apply(pd.to_numeric, errors='coerce')
y = (y - y.mean()) / y.std()

In [6]:
transformer = SymbolicTransformer()
transformer.fit(X, y)

In [7]:
X_transformed = transformer.transform(X)
X = np.hstack((X, X_transformed))
X = pd.DataFrame(X)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-0.538882,1.138823,1.345209,-1.042618,0.0,0.0,0.0,1.0,0.0,-3.127853,3.127853,3.127853,3.127853,-3.127853,3.127853,3.127853,-3.127853,2.085235,-2.085235
1,1.246422,0.936575,-0.02741,-0.841835,0.0,0.0,0.0,1.0,0.0,-2.525506,2.525506,2.525506,2.525506,-2.525506,2.525506,2.525506,-2.525506,1.683671,-1.683671
2,0.82135,2.655682,-1.498074,-1.143009,0.0,0.0,0.0,1.0,0.0,-3.429026,3.429026,3.429026,3.429026,-3.429026,3.429026,3.429026,-3.429026,2.286017,-2.286017
3,0.82135,3.329843,1.443253,-0.863348,0.0,0.0,1.0,0.0,0.0,-2.590043,2.590043,2.590043,2.590043,-2.590043,2.590043,2.590043,-2.590043,1.726696,-1.726696
4,0.82135,1.071407,-1.596118,-0.88486,0.0,0.0,1.0,0.0,0.0,-2.654581,2.654581,2.654581,2.654581,-2.654581,2.654581,2.654581,-2.654581,1.76972,-1.76972


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
est_gp = SymbolicRegressor()
est_gp.fit(X_train, y_train)

In [10]:
print("R2 Score of test samples whit the model: ", r2_score(y_test, est_gp.predict(X_test)))

R2 Score of test samples whit the model:  0.6999982175996392
