In [1]:
from gplearn import genetic, functions, fitness
from gplearn.genetic import SymbolicRegressor, SymbolicTransformer

import numpy as np
import pandas as pd
import sympy

import warnings 
# Suppress every RuntimeWarning for the rest of the session.
# NOTE(review): presumably these come from numeric operations during the gplearn fit — confirm.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
# NOTE(review): machine-specific absolute path; point this at your local copy of train.csv.
csv_file = '/Users/shiyili/projects/kaggle/train.csv'

# Build the working frame as a single chain so that every step returns a new
# object. Assigning columns onto a boolean-filtered frame (the previous
# approach) raises SettingWithCopyWarning as soon as a row is actually dropped.
# NOTE(review): the other price columns shown by df.head() sit around 1.0, so
# filling missing far/near prices with 0 is a strong sentinel — confirm intent.
df = (
    pd.read_csv(csv_file, nrows=10000)
    .dropna(subset=['target'])  # remove rows with no target
    .fillna({'far_price': 0, 'near_price': 0})
)

# check if there is any null value in each col
# print("Null values", df.isnull().sum())

# Every column except the label and the row identifier is used as a feature.
x_train = df.drop(columns=['target', 'row_id'], errors='ignore').values
y_train = df['target'].values

df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,0.0,0.0,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,0.0,0.0,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,0.0,0.0,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,0.0,0.0,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,0.0,0.0,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [3]:
# Show the dimensions of the feature matrix and of the target vector.
x_train.shape, y_train.shape

((10000, 15), (10000,))

In [4]:
# Function names handed to gplearn as building blocks for candidate programs.
func = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max', 'min']

st_gplearn = SymbolicTransformer(
    generations=20,
    population_size=2000,
    hall_of_fame=100,
    n_components=10,
    function_set=func,
    parsimony_coefficient=0.0005,
    max_samples=0.9,
    verbose=1,
    random_state=0,
    n_jobs=4
    )

st_gplearn.fit(x_train, y_train)
best_programs = st_gplearn._best_programs

# One summary record per retained program, keyed 'alpha_1', 'alpha_2', ...
# enumerate() supplies the position directly instead of searching the list
# again for every element.
best_prog_dict = {
    f'alpha_{position}': {
        'fitness': bp.fitness_,
        'expression': str(bp),
        'length': bp.length_,
        'depth': bp.depth_,
    }
    for position, bp in enumerate(best_programs, start=1)
}

# orient='index' makes each record a row while letting every column keep its
# own dtype (transposing a frame of mixed records turns all columns into object).
df_programs = (
    pd.DataFrame.from_dict(best_prog_dict, orient='index')
    # Stable sort: programs with equal fitness keep their original order, so
    # which of two identical expressions survives below is deterministic.
    .sort_values(by='fitness', ascending=False, kind='mergesort')
    .drop_duplicates(subset=['expression'], keep='first')
)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    10.79        0.0246564        6         0.200025         0.166857      1.80m
   1     8.08        0.0590649        8         0.200311         0.165203     42.39s
   2     6.60        0.0849335        8         0.211747           0.1898     41.48s
   3     6.91        0.0971736        8         0.215621          0.15506     41.18s
   4     6.59        0.0996789       10         0.218169          0.13404     38.75s
   5     6.44         0.101717        9         0.223296        0.0823432     32.30s
   6     7.37         0.111985       11         0.223778        0.0855637     30.71s
   7     9.16         0.125516       14         0.226036        0.0891304     28.49s
   8    10.41         0.130991       15         0.227853        0.0746904  

In [5]:
df_programs

Unnamed: 0,fitness,expression,length,depth
alpha_1,0.242626,"add(X5, sub(div(sub(sub(X11, X13), X13), abs(X...",12,5
alpha_2,0.239212,"add(abs(X5), sub(add(abs(X5), sub(div(sub(sub(...",18,7
alpha_3,0.236697,"div(sub(sub(add(X5, sub(X11, X13)), X13), X13)...",12,5
alpha_5,0.22775,"add(abs(X5), sub(add(X5, sub(div(sub(sub(X11, ...",17,7
alpha_10,0.226116,"div(sub(sub(add(abs(X5), sub(X11, X13)), X13),...",18,5
alpha_9,0.22479,"add(X5, sub(div(sub(add(add(X5, sub(X11, X13))...",21,7
alpha_6,0.223931,"add(X5, sub(add(abs(abs(X5)), sub(add(abs(X9),...",24,9
alpha_7,0.22344,"add(X5, sub(div(sub(sub(add(abs(X5), sub(X11, ...",24,8
alpha_8,0.223356,"add(X5, sub(div(sub(sub(add(sub(sub(X11, X13),...",24,9


In [48]:
# Show the full, untruncated program string stored for alpha_6.
df_programs.loc['alpha_6', 'expression']

'add(X5, sub(add(abs(abs(X5)), sub(add(abs(X9), sub(div(sub(max(X11, X13), abs(X5)), abs(X9)), X13)), X13)), X13))'

In [21]:
# Maps each function name that can appear in a gplearn program string to the
# callable sympy.sympify should apply when it parses that string.
#
# 'max'/'min' must be the symbolic sympy.Max/sympy.Min: Python's builtin
# max()/min() evaluate `x > y`, which raises TypeError for sympy Symbols.
# 'inv', 'sqrt' and 'log' are included because they are part of the function
# set used for the fit. Per the gplearn documentation sqrt and log operate on
# the absolute value of their argument, hence the abs() below.
# NOTE(review): gplearn's div/inv/log also special-case near-zero inputs; the
# symbolic forms here do not model those guards.
converter = {
    'sub': lambda x, y : x - y,
    'div': lambda x, y : x/y,
    'mul': lambda x, y : x*y,
    'add': lambda x, y : x + y,
    'neg': lambda x    : -x,
    'pow': lambda x, y : x**y,
    'abs': lambda x    : abs(x),
    'inv': lambda x    : 1/x,
    'sqrt': lambda x   : sympy.sqrt(abs(x)),
    'log': lambda x    : sympy.log(abs(x)),
    'max': lambda x, y : sympy.Max(x, y),
    'min': lambda x, y : sympy.Min(x, y),
}

In [20]:
# Names of the feature columns, in frame order: every column of df except
# the target and the row identifier.
is_feature = ~df.columns.isin(['target', 'row_id'])
x_feature_labels = df.columns[is_feature]

x_feature_labels

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'time_id'],
      dtype='object')

In [None]:
# Print every program in df_programs with its X<n> placeholders replaced by
# the feature name at position n of x_feature_labels.
for alpha_name, program in df_programs.iterrows():
    expr = sympy.sympify(program['expression'], locals=converter)
    # Build explicit Symbols for the replacements: handing subs() a plain
    # string makes sympy parse it, so a column called e.g. 'E' or 'I' would
    # turn into a sympy constant instead of a variable.
    renames = {
        var: sympy.Symbol(x_feature_labels[int(var.name[1:])])
        for var in expr.free_symbols
    }
    # xreplace applies all renames in one pass, so one replacement can never
    # be picked up by another.
    print(alpha_name, expr.xreplace(renames))


In [51]:
def expr2equation(expr_str, label_names, converter):
    """Turn a gplearn program string into a sympy expression over named features.

    Parameters
    ----------
    expr_str : str
        Program text such as ``'add(X5, sub(X11, X13))'``.
    label_names : sequence of str
        Feature names; the placeholder ``X<n>`` is replaced by ``label_names[n]``.
    converter : dict
        Passed to ``sympy.sympify`` as ``locals``; maps the function names used
        in ``expr_str`` to callables.

    Returns
    -------
    The parsed sympy expression with every ``X<n>`` placeholder replaced by a
    Symbol named after the corresponding feature.

    Raises
    ------
    ValueError
        If a free symbol's name is not a single character followed by an integer.
    IndexError
        If a placeholder index is outside ``label_names``.
    """
    expr = sympy.sympify(expr_str, locals=converter)
    # Use explicit Symbols as replacements: a plain string would be parsed by
    # sympy, so a label such as 'E' or 'I' would become a constant.
    renames = {
        var: sympy.Symbol(label_names[int(var.name[1:])])
        for var in expr.free_symbols
    }
    # Single-pass replacement; no rename can feed into another.
    return expr.xreplace(renames)

In [54]:
# Render alpha_1 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_1', 'expression'], x_feature_labels, converter)

reference_price - wap + (ask_price - 2*wap)/Abs(bid_price)

In [55]:
# Render alpha_2 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_2', 'expression'], x_feature_labels, converter)

-2*wap + (ask_price - reference_price - wap)/Abs(bid_price) + 2*Abs(reference_price)

In [56]:
# Render alpha_3 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_3', 'expression'], x_feature_labels, converter)

(ask_price + reference_price - 3*wap)/Abs(bid_price)

In [57]:
# Render alpha_5 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_5', 'expression'], x_feature_labels, converter)

reference_price - 2*wap + (ask_price - 2*wap)/Abs(bid_price) + Abs(reference_price)

In [58]:
# Render alpha_10 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_10', 'expression'], x_feature_labels, converter)

(ask_price - 2*wap + Abs(reference_price) - Abs(ask_price - 2*wap))/Abs(bid_price)

In [59]:
# Render alpha_9 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_9', 'expression'], x_feature_labels, converter)

reference_price - wap + (2*ask_price + reference_price - 3*wap - Abs(reference_price))/Abs(bid_price)

In [60]:
# Render alpha_7 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_7', 'expression'], x_feature_labels, converter)

reference_price - wap + (ask_price - 2*wap + Abs(reference_price) - Abs(ask_price - 3*wap))/Abs(bid_price)

In [61]:
# Render alpha_8 with its X<n> placeholders replaced by feature names.
expr2equation(df_programs.loc['alpha_8', 'expression'], x_feature_labels, converter)

reference_price - wap + (2*ask_price - 4*wap + Abs(bid_price) + Abs(reference_price))/Abs(bid_price)