# Final Benchmark Data Generation

### Dataset 1 Formula:

Y = w1*x1 + w2*x2 + ... + wN*xN + iw1*I1 + iw2*I2 + ... + iwM*IM + Noise

Where I1, I2, ... are randomly chosen interaction terms xA*xB

In [188]:
import numpy as np
import pandas as pd

In [189]:
n_rows = 15

# x1, x2, ... , xN
n_features = 10

# x1*x5, x4*x17, x13^2, etc
n_interaction = n_features

## Generate weights for the regular features randomly

In [190]:
# weights for n_features
def make_w(n_features):
    # number of features that will be over the threshold (>0.85 or <-0.85)
    n_sig = round(n_features * 3 / 10)
    # randomly select some (n_sig) weights to make significant (>0.85 or <-0.85)
    big_ws = np.random.choice([ i for i in range(n_features) ], size=n_sig, replace=False)
    ws = np.random.rand(n_features)*.2-.1 # set all weights to between -.1 to .1
    # Update the randomly selected significant weights
    for i in big_ws:
        ws[i] = np.random.rand()*0.15 + (1-0.15) # set weight to be between 0.85 and 1.0
        ws[i] *= 2*int(np.random.rand()<0.5)-1 # randomly set positive or negative
        print(i, ws[i])
    return ws

w_features = make_w(n_features)
w_features

9 -0.9821434357870202
1 0.8626903021715379
4 -0.8598401163741036


array([-0.00284605,  0.8626903 ,  0.02303846, -0.02569975, -0.85984012,
       -0.03635468,  0.02935379, -0.00274412, -0.03282401, -0.98214344])

## Generate grid of interaction term weights randomly

In [191]:
# interaction terms
def generate_pair():
    # Returns a tuple of two feature numbers representing an interaction term
    return tuple((np.random.rand(2)*n_features // 1).astype(int))

def make_interaction_w(n_features):
    # Generates the grid of weights for the interaction terms
    grid = np.zeros((n_features, n_features))
    all_terms = []
    for i in range(n_interaction):
        # select two random features to multiply together, using while to avoid duplicates
        terms = generate_pair()
        while terms in all_terms: # ensure no repeated interaction pairs
            terms = generate_pair()
        weight = np.random.rand()*0.15 + (1-0.15) # set weight to be between 0.85 and 1.0
        weight *= 2*int(np.random.rand()<0.5)-1 # randomly set positive or negative
        grid[terms[0], terms[1]] = weight
        all_terms.append(terms)
    return grid, all_terms

w_interaction, terms = make_interaction_w(n_features)
w_interaction, terms

(array([[ 0.        ,  0.9951148 ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         -0.9963527 ,  0.        ,  0.        ,  0.9107665 ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        , -0.89809864,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.89975285,
          0.        , -0.93425992,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        , -0.91068386,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.93528001,  0.       

In [211]:

def calculate_y(X):
    # Determine y values with linear combination of X*weights, interaction terms, and Noise
    # 5% with weird weights
    for i in range(len(X)):
        weird = np.random.rand()
        if weird > 0.95:
            X[i] = X[i] * np.random.rand(len(X[i]))
    
    # dot product X values with weights
    y1 = np.dot(X, w_features)
    # calculate interaction terms
    X_interaction = pd.DataFrame()
    wi = []
    for t in terms:
        X_interaction[t] = X[:,t[0]]*X[:,t[1]]
        wi.append(w_interaction[t[0], t[1]])
    # dot product interaction terms with interaction weights
    y2 = np.dot(np.array(X_interaction), wi)
    y_reg = y1 + y2 + np.random.normal(size=n_rows)
    return y_reg

X = np.random.rand(n_rows, n_features) * 100 - 50
y = calculate_y(X)
y

array([-1224.41620621,  3043.38865309, -3855.86240327, -2895.06942929,
          54.25609517,   439.38019147,   557.74771139,   694.56657804,
        -238.43523164,  3904.66055009,  -626.69577378,   211.96690065,
        3723.12304261,  2943.22191239,   926.42311036])

In [193]:
def order_interaction_terms(tuple_list):
    # sort tuples
    arr = np.sort(np.array(tuple_list))
    sorted_indices = np.lexsort((arr[:, 1], arr[:, 0]))
    sorted_arr = arr[sorted_indices]
    return sorted_arr

In [208]:
a = [1,2,3,4]
a*np.random.rand(4)

array([0.18198227, 0.16031412, 1.4781069 , 3.50499146])

In [206]:
np.random.rand(5, 2)

array([[0.37851322, 0.52648315],
       [0.47419927, 0.98701902],
       [0.65154376, 0.78700465],
       [0.37036194, 0.76796817],
       [0.81642972, 0.45778306]])

In [194]:
x = np.array([[1,2],
              [3,4],
              [5,6]])
w = np.array([10,100])
np.dot(x, w)
x[:,0]*x[:,1]

array([ 2, 12, 30])

In [195]:
# arr = np.sort(np.array([(0, 6),
#   (8, 5),
#   (8, 2),
#   (7, 1),
#   (5, 2),
#   (1, 4),
#   (6, 5),
#   (4, 7),
#   (8, 9),
#   (2, 1)]))

# # sort the array by the first column and then the second column
# sorted_indices = np.lexsort((arr[:, 1], arr[:, 0]))
# sorted_arr = arr[sorted_indices]

# print(sorted_arr)
print(order_interaction_terms(terms))

[[0 1]
 [1 5]
 [1 8]
 [1 8]
 [2 6]
 [2 7]
 [3 4]
 [3 6]
 [4 8]
 [5 6]]
