In [42]:
import pandas as pd
import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score, mean_absolute_error

In [51]:
data = load_boston()
X_train = data['data'][:300]
y_train = data['target'][:300]

X_test = data['data'][300:]
y_test = data['target'][300:]

print(data['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [52]:
gp = SymbolicRegressor(verbose=1, 
                       function_set=('add', 'sub', 'mul', 'div', 'log', 'sqrt'), 
                       population_size=1000, 
                       generations=80,
                       max_samples=0.8)
gp.fit(X_train, y_train)



    |    Population Average   |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    14.58 14730121.599472078       46 6.062240575799747 5.954076169096148     39.80s
   1    12.09 221.54897262395883       66 5.460517535579132 5.790779137856263      1.29m
   2    18.02 103.91112805905072       66 5.137327762011428 4.349376415896654      1.52m
   3    45.38 448.45985855204043       66 4.741948331223796 5.941650802962503      2.04m
   4    60.62 664.1622239312183       66 4.815957952943973 5.634855652166478      2.44m
   5    68.74 55.86687663724198       61 4.602560741361316 6.366205255944542      2.81m
   6    69.58 58.64847265289478       64 4.617593202172254 6.31316098210795      2.93m
   7    71.35 58.15641876910499       73 4.527209827094738 6.7623243272059375      3.03m
   8    73.99 55.40338600444069       68 4.5557

SymbolicRegressor(const_range=(-1.0, 1.0),
         function_set=('add', 'sub', 'mul', 'div', 'log', 'sqrt'),
         generations=80, init_depth=(2, 6), init_method='half and half',
         max_samples=0.8, metric='mean absolute error', n_jobs=1,
         p_crossover=0.9, p_hoist_mutation=0.01, p_point_mutation=0.01,
         p_point_replace=0.05, p_subtree_mutation=0.01,
         parsimony_coefficient=0.001, population_size=1000,
         random_state=None, stopping_criteria=0.0, tournament_size=20,
         verbose=1, warm_start=False)

In [53]:
pred = gp.predict(X_test)

In [54]:
print(r2_score(pred, y_test),mean_absolute_error(pred, y_test))

0.28796629763218495 6.214035792502178


In [56]:
pd.DataFrame({'price': y_test, 'prediction': pred})

Unnamed: 0,price,prediction
0,24.8,34.279742
1,22.0,25.683725
2,26.4,25.004537
3,33.1,31.449201
4,36.1,26.325049
5,28.4,23.975589
6,33.4,28.586330
7,28.2,26.016218
8,22.8,34.184760
9,20.3,21.384526
