In [None]:
# NOTE(review): the star import is load-bearing. It is the only visible source of
# get_exp_set, and the equation strings derived from exp_set are later eval()'d
# in this namespace -- presumably they call operator helpers exported by
# DistilSR (TODO confirm before replacing with explicit imports).
from DistilSR import *
# Expression templates searched by test_SR; the meaning of the argument 3 is
# defined by get_exp_set, which is not visible in this notebook.
exp_set = get_exp_set(3)
print(len(exp_set))
# Bare last expression: the notebook displays the templates themselves.
exp_set

In [None]:
from scipy.optimize import minimize
from tqdm import tqdm
import itertools
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pmlb import fetch_data, regression_dataset_names
import time
import re
import multiprocessing
from collections.abc import Iterable

def cost(x, xdata, ydata, lambda_string):
    """Return the mean squared error of a candidate expression.

    Args:
        x: Values of the free constants, indexed as ``x[i]`` by the expression
            (``None`` is accepted when the expression uses no constants).
        xdata: Input data handed to the expression, which indexes it as
            ``xdata[i]``.
        ydata: Target values the prediction is compared against.
        lambda_string: Source text of a two-argument lambda ``(x, xdata)``.
            It is evaluated with ``eval``, so it must come from a trusted
            source.

    Returns:
        ``np.mean`` of the squared difference between prediction and target.
    """
    model = eval(lambda_string)
    residual = model(x, xdata) - ydata
    return np.mean(residual ** 2)

def cost_special(x, xdata, ydata, lambda_string,T):
    """Compare per-sample squared errors of an expression with a reference loss.

    Args:
        x: Values of the free constants used by the expression (or ``None``).
        xdata: Input data, indexed as ``xdata[i]`` by the expression.
        ydata: Target values.
        lambda_string: Source text of a two-argument lambda ``(x, xdata)``;
            evaluated with ``eval``, so it must come from a trusted source.
        T: Reference loss value each per-sample squared error is compared to.

    Returns:
        A 2-tuple: the mean of ``|T - squared_error|`` over the samples, and
        the fraction of samples whose squared error does not exceed ``T``.
    """
    predict = eval(lambda_string)
    squared_errors = (predict(x, xdata) - ydata) ** 2
    mean_abs_gap = np.mean(np.abs(T - squared_errors))
    # Builtin sum over the boolean mask counts the samples with loss <= T.
    n_within = sum(squared_errors <= T)
    return (mean_abs_gap, n_within / len(ydata))

def cust_pred(x, xdata, ydata, lambda_string):
    """Evaluate a candidate expression and return its raw predictions.

    Args:
        x: Values of the free constants used by the expression (or ``None``).
        xdata: Input data, indexed as ``xdata[i]`` by the expression.
        ydata: Unused; kept so the signature mirrors ``cost``.
        lambda_string: Source text of a two-argument lambda ``(x, xdata)``;
            evaluated with ``eval``, so it must come from a trusted source.

    Returns:
        Whatever the evaluated lambda returns for ``(x, xdata)``.
    """
    return eval(lambda_string)(x, xdata)

def save_strings_to_file(strings, filename):
    """Write each string to ``filename`` on its own line, replacing the file.

    Args:
        strings: Iterable of ``str``; every item is followed by a newline.
        filename: Path of the file to create or truncate.
    """
    with open(filename, 'w') as out:
        out.writelines(entry + '\n' for entry in strings)

def load_strings_from_file(filename):
    """Lazily yield the lines of ``filename`` with surrounding whitespace removed.

    This is a generator: the file is opened when iteration starts and stays
    open until the generator is exhausted or closed.

    Args:
        filename: Path of the text file to read.

    Yields:
        Each line of the file after ``str.strip``.
    """
    with open(filename, 'r') as source:
        yield from map(str.strip, source)
    
# Module-level timing accumulator.
# NOTE(review): no function visible in this notebook reads it -- test_SR assigns
# its own local `total_time` -- confirm it is unused elsewhere before removing.
total_time = 0

def _fit_equation_chunk(chunk_idx, fit_X, fit_y, random_state):
    """Fit every equation stored in one chunk file to ``(fit_X, fit_y)``.

    Shared implementation behind ``process_eq`` and ``process_eq_one_out``;
    the two entry points differ only in which rows they pass in.

    Args:
        chunk_idx: Index of the chunk file ``strings_data_TEMP_chunk_<idx>.txt``
            to read equations from (one equation per line).
        fit_X: 2-D array of samples x features used for fitting.
        fit_y: Targets matching the rows of ``fit_X``.
        random_state: Seed applied before drawing each initial guess, so every
            equation starts from the same reproducible point.

    Returns:
        ``(results, total_time_chunk)``. ``results`` holds one tuple per
        equation: ``(lambda_string, fitted constants, BFGS iteration count,
        training MSE)``. The constants and iteration count are ``None`` for
        equations without free constants; all three trailing fields are
        ``None`` if the optimiser raised ``RuntimeError``.
        ``total_time_chunk`` is the number of seconds spent inside
        ``minimize``.
    """
    results = []
    total_time_chunk = 0
    # Expressions index features as xdata[i], so features go on the first axis.
    features = fit_X.T
    for test_eq in load_strings_from_file(f'strings_data_TEMP_chunk_{chunk_idx}.txt'):
        # Free constants appear as x[0], x[1], ...; "xdata[" never matches "x[".
        ERC_count = test_eq.count("x[")
        lambda_string = "lambda x,xdata:" + test_eq
        if not ERC_count:
            # Nothing to optimise: just score the expression as it stands.
            optimized_cost = cost(None, features, fit_y, lambda_string)
            results.append((lambda_string, None, None, optimized_cost))
            continue
        try:
            np.random.seed(random_state)
            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by system clock adjustments.
            start_time = time.perf_counter()
            res = minimize(cost,
                           x0=np.random.randint(1, 31, size=ERC_count)/10,
                           args=(features, fit_y, lambda_string),
                           method="BFGS", options={"maxiter": 500})
            total_time_chunk += time.perf_counter() - start_time
            optimized_cost = cost(res.x, features, fit_y, lambda_string)  # train loss
            results.append((lambda_string, res.x, res.nit, optimized_cost))
        except RuntimeError:
            results.append((lambda_string, None, None, None))
    return results, total_time_chunk


def process_eq(chunk_idx, dataset_name, train_X, train_y, test_X, test_y,random_state):
    """Fit one chunk of equations on the full training set.

    Args:
        chunk_idx: Index of the chunk file to process.
        dataset_name: Unused; kept for call-signature compatibility.
        train_X: 2-D array of training samples x features.
        train_y: Training targets.
        test_X: Unused; kept for call-signature compatibility.
        test_y: Unused; kept for call-signature compatibility.
        random_state: Seed for the initial guess of the free constants.

    Returns:
        ``(results, total_time_chunk)`` as described in ``_fit_equation_chunk``.
    """
    return _fit_equation_chunk(chunk_idx, train_X, train_y, random_state)


def process_eq_one_out(chunk_idx, dataset_name, train_X, train_y, test_X, test_y,random_state):
    """Fit one chunk of equations on the training set with its last row held out.

    Takes the same arguments as ``process_eq``; the only difference is that the
    final row of ``train_X`` / ``train_y`` is excluded from fitting and scoring.

    Returns:
        ``(results, total_time_chunk)`` as described in ``_fit_equation_chunk``.
    """
    return _fit_equation_chunk(chunk_idx, train_X[:-1], train_y[:-1], random_state)

def _expand_equations(num_of_feature):
    """Instantiate every template in ``exp_set`` with variables and constants.

    Each ``R`` placeholder in a template is replaced by either a feature
    reference ``xdata[i]`` or a free constant. Returns the distinct equation
    strings in sorted order, so the result does not depend on set ordering.
    """
    equations = set()
    for template in tqdm(exp_set):
        R_count = template.count("R")
        # Index num_of_feature stands for "free constant"; lower indices are features.
        for combi_var in itertools.product(range(num_of_feature+1), repeat=R_count):
            test_eq = template
            for i in combi_var:
                if i == num_of_feature:
                    test_eq = test_eq.replace("R", "erc", 1)
                else:
                    test_eq = test_eq.replace("R", f"xdata[{i}]", 1)
            # A three-character call on two constants, e.g. foo(erc,erc), is
            # collapsed to a single constant; repeat until nothing matches.
            match = re.search(r"\w{3}\(erc,erc\)", test_eq)
            while match:
                test_eq = test_eq.replace(match.group(), "erc")
                match = re.search(r"\w{3}\(erc,erc\)", test_eq)
            # Number the remaining constants x[0], x[1], ... left to right.
            for i in range(test_eq.count("erc")):
                test_eq = test_eq.replace("erc", f"x[{i}]", 1)
            equations.add(test_eq)
    return sorted(equations)


def _write_equation_chunks(eq_list, num_processes):
    """Write ``eq_list`` to at most ``num_processes`` chunk files; return how many."""
    if not eq_list:
        return 0
    # Ceiling division: every equation lands in one of the first num_processes
    # chunks, so no remainder chunk is left unprocessed.
    chunk_size = -(-len(eq_list) // num_processes)
    starts = range(0, len(eq_list), chunk_size)
    for idx, start in enumerate(starts):
        save_strings_to_file(eq_list[start:start + chunk_size], f'strings_data_TEMP_chunk_{idx}.txt')
    return len(starts)


def _search_equations(worker, num_processes, num_of_feature, worker_args):
    """Fit all candidate equations in parallel with ``worker``.

    Returns ``(master_list, elapsed)``: the concatenated per-equation result
    tuples and the summed optimisation time reported by the workers.
    """
    num_processes = max(1, num_processes)  # cpu_count()-1 is 0 on a single-core machine
    eq_list = _expand_equations(num_of_feature)
    n_chunks = _write_equation_chunks(eq_list, num_processes)
    del eq_list  # workers re-read the equations from the chunk files

    with multiprocessing.Pool(processes=num_processes) as pool:
        results = pool.starmap(worker, [(chunk_idx,) + worker_args for chunk_idx in range(n_chunks)])

    master_list = []
    elapsed = 0
    for result_chunk, total_time_chunk in results:
        master_list.extend(result_chunk)
        elapsed += total_time_chunk
    return master_list, elapsed


def _best_candidate(candidates):
    """Return the result tuple with the smallest finite training loss.

    Entries whose loss is ``None`` (optimiser failed) or non-finite are
    skipped: ``None`` cannot be ordered and NaN makes ``min`` order-dependent.
    """
    usable = [entry for entry in candidates if entry[3] is not None and np.isfinite(entry[3])]
    if not usable:
        raise ValueError("no candidate equation produced a finite training loss")
    return min(usable, key=lambda entry: entry[3])


def test_SR(dataset_name,random_state=0):
    """Run the exhaustive equation search on one dataset and report gap metrics.

    The dataset is read from ``SRSDdatasets/<dataset_name>.txt`` (space
    separated, no header, target in the last column). Every candidate equation
    is fitted twice: once on the whole training set and once with the last
    training row held out. The best equation of each pass is then scored.

    Side effects: writes ``strings_data_TEMP_chunk_<i>.txt`` work files and the
    ``TEMP_checker1.csv`` / ``TEMP_checker2.csv`` dumps of all fitted
    candidates, and prints timing and the selected candidates.

    Args:
        dataset_name: File stem of the dataset inside ``SRSDdatasets``.
        random_state: Seed for the train/test split, the shuffle and the
            optimiser's initial guesses.

    Returns:
        A 2-tuple ``(details, temp_final)``:

        * ``details`` -- the full-fit winner (4 fields), the held-out-fit
          winner (4 fields), the full-fit winner's test loss and loss on the
          held-out row, then the held-out-fit winner's train loss, test loss,
          loss on the held-out row and the two values from ``cost_special``.
        * ``temp_final`` -- raw generalization gap, raw APG, raw ATG, followed
          by the same three values divided by the variance of the test targets.

    Raises:
        ValueError: If no candidate equation yields a finite training loss.
    """
    df = pd.read_csv("SRSDdatasets/"+dataset_name+".txt", sep=' ', header=None)
    total_time = 0
    X, y = np.array(df.iloc[:,:-1]), np.array(df.iloc[:,-1])
    # Keeps 20 of every 8000 rows for training.
    # NOTE(review): presumably the datasets have 8000 rows -- confirm.
    train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=random_state,test_size=(8000-20)/8000)
    np.random.seed(random_state)
    shuffle_idx = np.random.permutation(len(train_y))
    train_X = train_X[shuffle_idx]
    train_y = train_y[shuffle_idx]
    num_of_feature = train_X.shape[1]
    worker_args = (dataset_name, train_X, train_y, test_X, test_y, random_state)

    # Pass 1: fit on the full training set.
    master_list, elapsed = _search_equations(process_eq, multiprocessing.cpu_count()-1,
                                             num_of_feature, worker_args)
    total_time += elapsed
    print(total_time)

    pd.DataFrame(master_list).to_csv("TEMP_checker1.csv")
    temp_holder = _best_candidate(master_list)
    print(f"{temp_holder=}")

    # Pass 2: fit with the last training row held out.
    master_list, elapsed = _search_equations(process_eq_one_out, multiprocessing.cpu_count()*2//3,
                                             num_of_feature, worker_args)
    total_time += elapsed
    print(total_time)

    pd.DataFrame(master_list).to_csv("TEMP_checker2.csv")

    temp_holder2 = _best_candidate(master_list)
    print(f"{temp_holder2=}")

    temp_holder_main = (cost(temp_holder[1], test_X.T, test_y, temp_holder[0]),
                        cost(temp_holder[1], train_X[[-1]].T, train_y[[-1]], temp_holder[0]), #V(f_{SUz},z)
                       )
    temp_holder_main2 = (temp_holder2[3], #train loss
                          cost(temp_holder2[1], test_X.T, test_y, temp_holder2[0]), #test loss
                          cost(temp_holder2[1], train_X[[-1]].T, train_y[[-1]], temp_holder2[0]), #V(f_S,z)
                         ) + cost_special(temp_holder2[1], train_X[:-1].T, train_y[:-1], temp_holder2[0], temp_holder_main[1]) #|V(f_{SUz},z)-V(f_S,z_i)|

    y_std = np.std(test_y)
    temp_final = (np.abs(temp_holder_main2[1]-temp_holder_main2[0]), #Raw generalization gap
                  np.abs(temp_holder_main2[2]-temp_holder_main[1]), #Raw APG
                  temp_holder_main2[3], #Raw ATG
                  np.abs(temp_holder_main2[1]-temp_holder_main2[0])/(y_std**2), #generalization gap normalized
                  np.abs(temp_holder_main2[2]-temp_holder_main[1])/(y_std**2), #APG normalized
                  temp_holder_main2[3]/(y_std**2), #ATG normalized
                 )

    return temp_holder+temp_holder2+temp_holder_main+temp_holder_main2, temp_final

In [None]:
import os
# Run test_SR num_runs times on every dataset file and save the results as
# two CSVs per dataset (full details and the summary metrics).
num_runs = 10
directory = 'SRSDdatasets'
dataset_suffix = '.txt'  # test_SR opens "SRSDdatasets/<dataset_name>.txt"
# sorted() gives a reproducible processing order; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    # Skip sub-directories and stray files (e.g. .DS_Store): stripping four
    # characters from those would yield a dataset name test_SR cannot open.
    if not os.path.isfile(f) or not filename.endswith(dataset_suffix):
        continue
    dataset_name = filename[:-len(dataset_suffix)]
    print(f"{dataset_name=}")
    master_df1 = []
    master_df2 = []
    for i in range(num_runs):
        # The run index doubles as the random seed, so each run uses a different split.
        DistilSR_solution = test_SR(dataset_name,random_state=i)
        master_df1.append(list(DistilSR_solution[0]))
        master_df2.append(list(DistilSR_solution[1]))
    pd.DataFrame(master_df1).to_csv(f"R_{dataset_name}_full_{num_runs}.csv",index=False)
    pd.DataFrame(master_df2).to_csv(f"R_{dataset_name}_{num_runs}.csv",index=False)