In [1]:
import numpy as np
import os

This notebook processes the BBQ data and performs the following tasks:
1. Reads the separate phenotype measurement files and combines them into one array
2. Splits the genotype and phenotype arrays into train/validation/test sets and saves them
3. Computes the correlation coefficient between each locus and its neighbors (based only on the genotype training set)

In [2]:
# Environments (phenotypes). Sorted so that the ordering is deterministic.
envs = sorted(["ynb","suloc","raff","mol","27C","eth","30C","25C","sds","cu","33C","li","gu","23C","35C","mann","37C", "4NQO"])

# find phenotype input files; should be in form <path_to_files>_<env>_<suffix>
pheno_path = "./BBQ_data/pheno_data_"
file_suffix = ".txt"
input_geno_file = "./BBQ_data/geno_data_bool_99950.npy"

# specify output directory. create directory if it does not exist.
# Only the "directory is already there" case is treated as benign; any other
# failure (permissions, missing parent, a regular file with that name) is
# allowed to raise instead of being reported as an existing directory.
output_dir = "./BBQ_data_processed"
if os.path.isdir(output_dir):
    print(f"already have directory {output_dir}")
else:
    os.mkdir(output_dir)

#truncate data to first N strains
truncate = False
N = 99950

#train/validation/test fractions
train_frac = 0.80
validation_frac = 0.10
test_frac = 0.10
seed = 0
# Compare with a tolerance: exact float equality rejects valid splits such as
# 0.7/0.2/0.1, whose floating-point sum is 0.9999999999999999.
if not np.isclose(train_frac+validation_frac+test_frac, 1.0):
    raise ValueError("fractions for train/validation/test must sum to one")

# compute correlation coeff for each position and max_loci_cc positions in front
max_loci_cc = 200

already have directory ./BBQ_data_processed


## Load genotype and phenotype files, make one array from phenotypes

In [3]:
def get_pheno_from_file(filename,N):
    """Read one phenotype column from a text file.

    The first and the last line of the file are skipped, and only the second
    column (index 1) of the remaining lines is parsed.

    Parameters
    ----------
    filename : str
        Path of the phenotype text file.
    N : int
        Maximum number of leading measurements to return.

    Returns
    -------
    numpy.ndarray
        1-D array holding at most ``N`` values.
    """
    Y = np.genfromtxt(filename, skip_header=1, skip_footer=1, usecols=1)
    # genfromtxt squeezes its result, so a file with a single data row comes
    # back as a 0-d array that cannot be sliced; promote it to 1-D first.
    return np.atleast_1d(Y)[:N]

In [4]:
# Phenotype matrix: one row per strain, one column per environment, with the
# columns in the same order as `envs`.
P = np.zeros((N,len(envs)))
for i, phen in enumerate(envs):
    # Use the configured suffix instead of a hard-coded ".txt" so that the
    # `file_suffix` setting is actually honoured.
    filename = f"{pheno_path}{phen}{file_suffix}"
    column = get_pheno_from_file(filename,N)
    # Fail with a readable message rather than a broadcasting error when a
    # file holds fewer than N measurements.
    if column.shape[0] != N:
        raise ValueError(f"{filename} holds {column.shape[0]} measurements, expected N={N}")
    P[:,i] = column
print("phenotype shape")
print(P.shape)

phenotype shape
(99950, 18)


In [5]:
G = np.load(input_geno_file)
# The phenotype array is cut to N rows, so apply the same cut to the genotypes
# when truncation is requested; otherwise the two arrays go out of step.
if truncate:
    G = G[:N]
# Rows of G and P must describe the same strains; the boolean NaN filter that
# follows indexes both arrays with one mask.
if G.shape[0] != P.shape[0]:
    raise ValueError(
        f"genotype rows ({G.shape[0]}) do not match phenotype rows ({P.shape[0]}); "
        "set truncate = True or adjust N"
    )
print("genotype shape")
print(G.shape)

genotype shape
(99950, 41594)


## Filter out segregants with missing phenotypes, divide geno + pheno into train/val/test

In [6]:
# filter out segregants with NaN fitness
filter_out_nan = P==P
filter_out_nan = np.sum(filter_out_nan, axis = -1 ) == len(envs)
print(f"filtering out {P.shape[0]-sum(filter_out_nan)} segregants due to a nan value for some phenotype") 
G = G[filter_out_nan,:]
P = P[filter_out_nan,:]
print("phenotype shape")
print(P.shape)
print("genotype shape")
print(G.shape)

filtering out 8206 segregants due to a nan value for some phenotype
phenotype shape
(91744, 18)
genotype shape
(91744, 41594)


In [7]:
# split data into train/validation/test
np.random.seed(seed)
X = G
Y = P

filt = np.random.choice(len(X), size = int(train_frac*len(X)), replace = False)
split = np.zeros(len(X),dtype=bool)
split[filt] = 1
Xtrain = X[split]
Ytrain = Y[split]
Xval = X[~split]
Yval = Y[~split]

vfilt = np.random.choice(len(Xval), size = int(validation_frac/(validation_frac + test_frac) *len(Xval)), replace = False)
vsplit = np.zeros(len(Xval),dtype=bool)
vsplit[vfilt] = 1
Xtest = Xval[vsplit]
Ytest = Yval[vsplit]
Xval = Xval[~vsplit]
Yval = Yval[~vsplit]

print(f"training sizes: {Xtrain.shape}, {Ytrain.shape}")
print(f"validation sizes: {Xval.shape}, {Yval.shape}")
print(f"test sizes: {Xtest.shape}, {Ytest.shape}")

training sizes: (73395, 41594), (73395, 18)
validation sizes: (9175, 41594), (9175, 18)
test sizes: (9174, 41594), (9174, 18)


In [8]:
# save results
np.save(open(f"{output_dir}/geno_train.npy", "wb"),Xtrain)
np.save(open(f"{output_dir}/pheno_train.npy", "wb"),Ytrain)
np.save(open(f"{output_dir}/geno_val.npy", "wb"),Xval)
np.save(open(f"{output_dir}/pheno_val.npy", "wb"),Yval)
np.save(open(f"{output_dir}/geno_test.npy", "wb"),Xtest)
np.save(open(f"{output_dir}/pheno_test.npy", "wb"),Ytest)


## Compute pairwise correlations between each locus and its max_loci_cc neighbors (slow)

In [9]:
# Genotype matrix used for the correlation computation: the training split only.
g = Xtrain

In [10]:
# Recode the genotypes as 2*x-1 (presumably {0,1} -> {-1,+1}; confirm against
# the input file). Built from Xtrain rather than from g itself, so re-running
# this cell does not apply the recoding a second time.
g = 2*Xtrain-1

# Per-locus standard deviation, computed in blocks of `l` columns.
# NOTE(review): the blocking is presumably there to limit peak memory.
l=5000
stdg = []
# Stepping by block start avoids a trailing empty block when the number of
# loci is an exact multiple of l, and keeps IPython's `_` free.
for start in range(0, g.shape[1], l):
    stop = start + l
    print(start, stop)
    stdg += list(np.std(g[:,start:stop], axis = 0))
stdg= np.array(stdg)

0 5000
5000 10000
10000 15000
15000 20000
20000 25000
25000 30000
30000 35000
35000 40000
40000 45000


In [11]:
# Standardise every locus: subtract the column mean, then multiply by the
# reciprocal of the column standard deviation, one block of `l` columns at a time.
mg = g.mean(axis=0)
ng = g - mg
ostdg = 1/stdg
n_blocks = int(g.shape[1]/l)+1
for block in range(n_blocks):
    cols = slice(block*l, (block+1)*l)
    ng[:, cols] *= ostdg[cols]

In [12]:
# cc[i, k] is the mean over strains of ng[:, i] * ng[:, i+1+k], i.e. the
# correlation between locus i and the (k+1)-th locus after it. Entries that
# would run past the last locus stay at zero.
ml = max_loci_cc
n_strains = ng.shape[0]
n_loci = g.shape[1]
cc = np.zeros((n_loci, ml))
for locus in range(n_loci):
    if locus % 1000 == 0:
        print(locus)
    following = ng[:, locus+1:locus+ml+1]
    ccs = np.einsum('j,jk->k', ng[:, locus], following)/n_strains
    cc[locus, :int(ccs.shape[0])] = ccs
print("correlation shape")
print(cc.shape)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
correlation shape
(41594, 200)


In [13]:
# Pass the path to np.save so NumPy opens and closes the file itself; an
# open() handle that is never closed leaks the descriptor.
cc_path = f"{output_dir}/cc_data_all.npy"
np.save(cc_path, cc)
print(f"saved array at {cc_path}")

saved array at ./BBQ_data_processed/cc_data_all.npy
