In [1]:
import numpy as np
import scipy.stats as stats
import pickle
import os

In [2]:
"""Independent Predictor Simulations"""

'Independent Predictor Simulations'

In [3]:
# Generate a design matrix X with n = 100 iid examples and p = 9 predictors from a MVN_p(0,I) distribution

In [4]:
# Size of the first training design matrix: number of examples, number of predictors
n1, p1 = 100, 9

In [5]:
# Parameters of the p1-dimensional standard normal: zero mean, identity covariance
zeromean = np.zeros(p1)
covI = np.eye(p1)

# Seed 42 fixes the draw, so the n1 rows of X are reproducible
X = stats.multivariate_normal.rvs(
    mean=zeromean,
    cov=covI,
    size=n1,
    random_state=42,
)
X.shape

(100, 9)

In [6]:
# Build the response for the linear model y = X @ beta + eps.
# Only the first two coefficients are non-zero: beta = [0.8, 1.5, 0, ..., 0].
# The error vector eps is one n1-dimensional standard normal draw with seed 100.

true_beta = np.zeros((p1, 1))
true_beta[:2, 0] = [0.8, 1.5]

noise_draw = stats.multivariate_normal.rvs(
    mean=np.zeros(n1), cov=np.eye(n1), random_state=100
)
random_errors = noise_draw.reshape(-1, 1)  # column vector, matches X @ true_beta
y = X @ true_beta + random_errors

In [7]:
# Persist the training pair to disk as a pickled two-element list [X, y]
filename = "Data/training_data100x9.pickle"  # destination file for the training data

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([X, y], file)  # stored in the order [design matrix, response]

In [8]:
# Simulate 50 independent design matrices, each holding 100 rows drawn from a
# 9-dimensional MVN(0, I). Dataset k (k = 0, ..., 49) is seeded with k + 1,
# i.e. random seeds 1, 2, ..., 50.
n2 = 100
p2 = 9
num_datasets = 50

datasets = np.zeros((num_datasets, n2, p2))
for batchidx, seed in enumerate(range(1, num_datasets + 1)):
    datasets[batchidx] = stats.multivariate_normal.rvs(
        mean=np.zeros(p2),
        cov=np.eye(p2),
        size=n2,
        random_state=seed,
    )

In [9]:
# Regression errors for the simulated datasets: one n2-dimensional MVN(0, I)
# draw per dataset, stored as an (n2, 1) column.
# Dataset k uses seed 1001 + k, i.e. random seeds 1001, ..., 1050 for 50 datasets.
random_errors = np.zeros((num_datasets, n2, 1))
for batchidx in range(num_datasets):
    noise_draw = stats.multivariate_normal.rvs(
        mean=np.zeros(n2),
        cov=np.eye(n2),
        random_state=1001 + batchidx,
    )
    random_errors[batchidx] = noise_draw.reshape(n2, 1)

In [10]:
# True coefficient column vector beta = [0.8, 1.5, 0, ..., 0] of length p2
trueBeta = np.zeros((p2, 1))
trueBeta[:2, 0] = [0.8, 1.5]

# Batched matrix product: every design matrix in `datasets` is multiplied by
# beta, then the matching error column is added
response = datasets @ trueBeta + random_errors
print(response.shape, datasets.shape)

(50, 100, 1) (50, 100, 9)


In [11]:
# Save the simulated design matrices and their responses together in one pickle
# file, as the two-element list [datasets, response]
filename = "Data/GeneratedSets50x100x9.pickle"

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([datasets, response], file)

In [12]:
""" Independent predictors with less examples, but n > p"""

' Independent predictors with less examples, but n > p'

In [13]:
# Smaller-sample design: n = 15 iid examples and p = 9 predictors from MVN_p(0, I)
n3, p3 = 15, 9

# Parameters of the p3-dimensional standard normal: zero mean, identity covariance
zeromean = np.zeros(p3)
covI = np.eye(p3)

# Seed 43 fixes the draw of the design matrix
X = stats.multivariate_normal.rvs(
    mean=zeromean,
    cov=covI,
    size=n3,
    random_state=43,
)
X.shape

(15, 9)

In [14]:
# Build the response for the linear model y = X @ beta + eps.
# Only the first two coefficients are non-zero: beta = [0.8, 1.5, 0, ..., 0].
# The error vector eps is one n3-dimensional standard normal draw with seed 101.

true_beta = np.zeros((p3, 1))
true_beta[:2, 0] = [0.8, 1.5]

noise_draw = stats.multivariate_normal.rvs(
    mean=np.zeros(n3), cov=np.eye(n3), random_state=101
)
random_errors = noise_draw.reshape(-1, 1)  # column vector, matches X @ true_beta
y = X @ true_beta + random_errors

In [15]:
# Persist the training pair to disk as a pickled two-element list [X, y]
filename = "Data/training_data15x9.pickle"  # destination file for the training data

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([X, y], file)  # stored in the order [design matrix, response]

In [16]:
# Show the dimensions of the design matrix and of the response column
shapes_to_show = (X.shape, y.shape)
print(*shapes_to_show)

(15, 9) (15, 1)


In [17]:
# Simulate 25 independent design matrices, each holding 15 rows drawn from a
# 9-dimensional MVN(0, I). Dataset k (k = 0, ..., 24) is seeded with 101 + k,
# i.e. random seeds 101, 102, ..., 125.
n4 = 15
p4 = 9
num_datasets = 25

datasets = np.zeros((num_datasets, n4, p4))
for batchidx, seed in enumerate(range(101, 101 + num_datasets)):
    datasets[batchidx] = stats.multivariate_normal.rvs(
        mean=np.zeros(p4),
        cov=np.eye(p4),
        size=n4,
        random_state=seed,
    )

In [18]:
# Regression errors for the simulated datasets: one n4-dimensional MVN(0, I)
# draw per dataset, stored as an (n4, 1) column.
# Dataset k uses seed 2001 + k, i.e. random seeds 2001, ..., 2025 for 25 datasets.
random_errors = np.zeros((num_datasets, n4, 1))
for batchidx in range(num_datasets):
    noise_draw = stats.multivariate_normal.rvs(
        mean=np.zeros(n4),
        cov=np.eye(n4),
        random_state=2001 + batchidx,
    )
    random_errors[batchidx] = noise_draw.reshape(n4, 1)

In [19]:
# True coefficient column vector beta = [0.8, 1.5, 0, ..., 0] of length p4
trueBeta = np.zeros((p4, 1))
trueBeta[:2, 0] = [0.8, 1.5]

# Batched matrix product: every design matrix in `datasets` is multiplied by
# beta, then the matching error column is added
response = datasets @ trueBeta + random_errors
print(response.shape, datasets.shape)

(25, 15, 1) (25, 15, 9)


In [20]:
# Save the simulated design matrices and their responses together in one pickle
# file, as the two-element list [datasets, response]
filename = "Data/GeneratedSets25x15x9.pickle"

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([datasets, response], file)

In [44]:
"""Dependent Case with Multicollinearity"""

'Dependent Case with Multicollinearity'

In [45]:
# Multicollinear design: n = 70 iid examples and p = 6 predictors.
# X1, X2, X4 and X6 are independent; X3 is built from 4*X1 - 2*X2 and X5 from
# 3*X3 - 6*X4 (the generating code adds standard normal noise to both).
n5, p5 = 70, 6

In [46]:
# The four mutually independent predictors X1, X2, X4, X6 are the columns of a
# single draw from a 4-dimensional MVN(0, I), generated with seed 44
zeromean = np.zeros(4)
covI = np.eye(4)
X1246 = stats.multivariate_normal.rvs(mean=zeromean, cov=covI, size=n5, random_state=44)

# Pull each independent predictor out as an (n5, 1) column
col_x1 = X1246[:, [0]]
col_x2 = X1246[:, [1]]
col_x4 = X1246[:, [2]]
col_x6 = X1246[:, [3]]

# X3 = 4*X1 - 2*X2 + noise, with an n5-dimensional standard normal noise draw (seed 1)
noise_x3 = stats.multivariate_normal.rvs(
    mean=np.zeros(n5), cov=np.eye(n5), random_state=1
).reshape(-1, 1)
X3 = (4 * col_x1 - 2 * col_x2) + noise_x3

# X5 = 3*X3 - 6*X4 + noise, with a second standard normal noise draw (seed 2)
noise_x5 = stats.multivariate_normal.rvs(
    mean=np.zeros(n5), cov=np.eye(n5), random_state=2
).reshape(-1, 1)
X5 = 3 * X3 - 6 * col_x4 + noise_x5

# Assemble the columns in predictor order X1, ..., X6
X = np.concatenate((col_x1, col_x2, X3, col_x4, X5, col_x6), axis=1)

In [47]:
# Build the response for the linear model y = X @ beta + eps.
# Only the first three coefficients are non-zero: beta = [0.8, -1.5, 0.1, 0, ..., 0].
# The error vector eps is one n5-dimensional standard normal draw with seed 102.

true_beta = np.zeros((p5, 1))
true_beta[:3, 0] = [0.8, -1.5, 0.1]

noise_draw = stats.multivariate_normal.rvs(
    mean=np.zeros(n5), cov=np.eye(n5), random_state=102
)
random_errors = noise_draw.reshape(-1, 1)  # column vector, matches X @ true_beta
y = X @ true_beta + random_errors

In [48]:
# Persist the training pair to disk as a pickled two-element list [X, y]
filename = "Data/training_data_Correlated70x6.pickle"  # destination file for the training data

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([X, y], file)  # stored in the order [design matrix, response]

In [49]:
# Show the dimensions of the design matrix and of the response column
shapes_to_show = (X.shape, y.shape)
print(*shapes_to_show)

(70, 6) (70, 1)


In [26]:
# Generate 50 different datasets X with 70 examples and p = 6 predictors.
# Structure of each dataset:
#   X1, X2, X4, X6 : independent columns of a 70x4 MVN(0, I) draw
#   X3 = 4*X1 - 2*X2 + eps3
#   X5 = 3*X3 - 6*X4 + eps5
# Seeds for dataset k = 0, ..., 49:
#   201 + k   (201, ..., 250)   -> the 70x4 matrix holding X1, X2, X4, X6
#   5001 + k  (5001, ..., 5050) -> eps3
#   6001 + k  (6001, ..., 6050) -> eps5
# The two noise streams deliberately use disjoint seed ranges. Seeding them with
# 1 + k and 2 + k made eps5 of dataset k the very same draw as eps3 of dataset
# k + 1, so consecutive datasets shared noise and were not independent.
n6 = 70
p6 = 6
num_datasets = 50

datasets = np.zeros((num_datasets, n6, p6))
for batchidx in range(num_datasets):
    # Independent predictors X1, X2, X4, X6 (one column each)
    X1246 = stats.multivariate_normal.rvs(
        mean=np.zeros(4), cov=np.diag(np.ones(4)), random_state=201 + batchidx, size=n6
    )

    # Noise columns for the two derived predictors, each from its own seed range
    eps3 = np.reshape(
        stats.multivariate_normal.rvs(
            mean=np.zeros(n6), cov=np.diag(np.ones(n6)), random_state=5001 + batchidx
        ),
        (-1, 1),
    )
    eps5 = np.reshape(
        stats.multivariate_normal.rvs(
            mean=np.zeros(n6), cov=np.diag(np.ones(n6)), random_state=6001 + batchidx
        ),
        (-1, 1),
    )

    X3 = np.reshape(4 * X1246[:, 0] - 2 * X1246[:, 1], (-1, 1)) + eps3
    X5 = 3 * X3 - np.reshape(6 * X1246[:, 2], (-1, 1)) + eps5

    # Assemble the columns in predictor order X1, ..., X6
    X = np.concatenate(
        (
            np.reshape(X1246[:, :2], (n6, 2)),
            X3,
            np.reshape(X1246[:, 2], (-1, 1)),
            X5,
            np.reshape(X1246[:, 3], (-1, 1)),
        ),
        axis=1,
    )
    datasets[batchidx] = X

In [27]:
# Regression errors for the simulated datasets: one n6-dimensional MVN(0, I)
# draw per dataset, stored as an (n6, 1) column.
# Dataset k uses seed 3001 + k, i.e. random seeds 3001, ..., 3050 for 50 datasets.
random_errors = np.zeros((num_datasets, n6, 1))
for batchidx in range(num_datasets):
    noise_draw = stats.multivariate_normal.rvs(
        mean=np.zeros(n6),
        cov=np.eye(n6),
        random_state=3001 + batchidx,
    )
    random_errors[batchidx] = noise_draw.reshape(n6, 1)


In [28]:
# True coefficient column vector beta = [0.8, -1.5, 0.1, 0, ..., 0] of length p6
trueBeta = np.zeros((p6, 1))
trueBeta[:3, 0] = [0.8, -1.5, 0.1]

# Batched matrix product: every design matrix in `datasets` is multiplied by
# beta, then the matching error column is added
response = datasets @ trueBeta + random_errors
print(response.shape, datasets.shape)

(50, 70, 1) (50, 70, 6)


In [29]:
# Save the simulated design matrices and their responses together in one pickle
# file, as the two-element list [datasets, response]
filename = "Data/CorrelatedSets50x70x6.pickle"

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([datasets, response], file)

In [52]:
# High-dimensional case with fewer examples than predictors (n = 13 < p = 16).
# Try for 14 true non-zero beta components.
n7, p7 = 13, 16

In [53]:
# Parameters of the p7-dimensional standard normal: zero mean, identity covariance
zeromean = np.zeros(p7)
covI = np.eye(p7)

# Seed 45 fixes the draw, so the n7 rows of X are reproducible
X = stats.multivariate_normal.rvs(
    mean=zeromean,
    cov=covI,
    size=n7,
    random_state=45,
)
X.shape

(13, 16)

In [55]:
# Now generate the response y = X @ beta + eps for the n < p design.
# The non-zero coefficients are one draw from MVN(10 * 1, 9 * I), i.e.
# independent N(10, 3^2) entries, generated with seed 100000; the remaining
# p7 - num_nonzero coefficients are exactly zero.
# Seed 100 generates the regression errors.

num_nonzero = 14  # number of truly non-zero coefficients
nonzerobeta = stats.multivariate_normal.rvs(
    10 * np.ones(num_nonzero),
    9 * np.diag(np.ones(num_nonzero)),
    random_state=100000,
)
# Pad with zeros up to p7 entries so beta always matches the number of columns of X
true_beta = np.reshape(np.concatenate((nonzerobeta, np.zeros(p7 - num_nonzero))), (-1, 1))
random_errors = np.reshape(
    stats.multivariate_normal.rvs(mean=np.zeros(n7), cov=np.diag(np.ones(n7)), random_state=100),
    (-1, 1),
)
y = np.matmul(X, true_beta) + random_errors

In [56]:
# Persist the training pair to disk as a pickled two-element list [X, y]
filename = "Data/training_data13x16.pickle"  # destination file for the training data

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([X, y], file)  # stored in the order [design matrix, response]

In [57]:
# Display the dimensions of the design matrix and of the response column
tuple(arr.shape for arr in (X, y))

((13, 16), (13, 1))

In [58]:
# Simulate 50 independent design matrices, each holding 13 rows drawn from a
# 16-dimensional MVN(0, I). Dataset k (k = 0, ..., 49) is seeded with 301 + k,
# i.e. random seeds 301, 302, ..., 350.
n8 = 13
p8 = 16
num_datasets = 50

datasets = np.zeros((num_datasets, n8, p8))
for batchidx, seed in enumerate(range(301, 301 + num_datasets)):
    datasets[batchidx] = stats.multivariate_normal.rvs(
        mean=np.zeros(p8),
        cov=np.eye(p8),
        size=n8,
        random_state=seed,
    )

In [59]:
# Regression errors for the simulated datasets: one n8-dimensional MVN(0, I)
# draw per dataset, stored as an (n8, 1) column.
# Dataset k uses seed 4001 + k, i.e. random seeds 4001, ..., 4050 for 50 datasets.
random_errors = np.zeros((num_datasets, n8, 1))
for batchidx in range(num_datasets):
    noise_draw = stats.multivariate_normal.rvs(
        mean=np.zeros(n8),
        cov=np.eye(n8),
        random_state=4001 + batchidx,
    )
    random_errors[batchidx] = noise_draw.reshape(n8, 1)

In [61]:
# Coefficient vector with 14 non-zero entries, drawn once from MVN(10 * 1, 9 * I)
# with seed 100001, followed by two exact zeros; stored as a (16, 1) column
beta_mean = 10 * np.ones(14)
beta_cov = 9 * np.eye(14)
nonzerobeta = stats.multivariate_normal.rvs(beta_mean, beta_cov, random_state=100001)

padded_beta = np.concatenate((nonzerobeta, np.zeros(2)))
true_beta = padded_beta.reshape(-1, 1)
true_beta

array([[ 8.88528887],
       [ 8.97889158],
       [10.63234983],
       [10.14569967],
       [12.87414851],
       [ 8.0020032 ],
       [ 9.90012985],
       [13.13669927],
       [12.28933273],
       [11.5908425 ],
       [11.2033932 ],
       [ 7.08207155],
       [ 5.86457569],
       [14.50343311],
       [ 0.        ],
       [ 0.        ]])

In [63]:
# Batched matrix product: every design matrix in `datasets` is multiplied by
# the same true_beta column, then the matching error column is added
signal = datasets @ true_beta
response = signal + random_errors
print(response.shape, datasets.shape)

(50, 13, 1) (50, 13, 16)


In [64]:
# Save the simulated design matrices and their responses together in one pickle
# file, as the two-element list [datasets, response]
filename = "Data/GeneratedSets50x13x16.pickle"

# exist_ok=True makes directory creation idempotent: re-running the cell when
# the Data directory already exists is a no-op, with no try/except needed
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, "wb") as file:
    pickle.dump([datasets, response], file)