In [1]:
import numpy as np
import pandas as pd

In [2]:
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

-------

#

In [3]:
outdir = './data/chen-2023-preprocessed-data/'

-------

# Process Haploid Bulk Fitness Assay (hBFA) data

In [4]:
rawDataDir = './data/chen-2023-elife-data/'

#### Load & organize raw data

##### Load the raw Chen et al. (2023) barcode counts data table

(rows: variants; cols: barcode counts for each experimental treatment/rep/timept and variant metadata)

In [5]:
rawCountsTable = pd.read_csv(rawDataDir+'hBFA1_counts_with_env_info.csv')

#### Extract the counts data (excluding metadata) from the variant and ancestor datasets

In [6]:
metadata_columns = ['Full.BC', 'Diverse.BC', 'Environment.BC', 'Total.Counts', 'Subpool.Environment', 'Which.Subpools', 'Putative.Environment']

In [7]:
rawCounts = rawCountsTable.drop(columns=metadata_columns)
rawCounts

Unnamed: 0,hBFA1-21C-R1-Time8,hBFA1-21C-R1-Time16,hBFA1-21C-R1-Time24,hBFA1-21C-R1-Time40,hBFA1-21C-R2-Time8,hBFA1-21C-R2-Time16,hBFA1-21C-R2-Time24,hBFA1-21C-R2-Time40,hBFA1-37C-R1-Time8,hBFA1-37C-R1-Time16,hBFA1-37C-R1-Time24,hBFA1-37C-R1-Time40,hBFA1-37C-R2-Time8,hBFA1-37C-R2-Time16,hBFA1-37C-R2-Time24,hBFA1-37C-R2-Time40,hBFA1-48Hr-R1-Time8,hBFA1-48Hr-R1-Time16,hBFA1-48Hr-R1-Time24,hBFA1-48Hr-R1-Time40,hBFA1-48Hr-R2-Time8,hBFA1-48Hr-R2-Time16,hBFA1-48Hr-R2-Time24,hBFA1-48Hr-R2-Time40,hBFA1-FLC4-R2-Time8,hBFA1-FLC4-R2-Time16,hBFA1-FLC4-R2-Time24,hBFA1-FLC4-R2-Time40,hBFA1-GlyEtOH-R1-Time8,hBFA1-GlyEtOH-R1-Time16,hBFA1-GlyEtOH-R1-Time24,hBFA1-GlyEtOH-R1-Time40,hBFA1-GlyEtOH-R2-Time8,hBFA1-GlyEtOH-R2-Time16,hBFA1-GlyEtOH-R2-Time24,hBFA1-GlyEtOH-R2-Time40,hBFA1-SC-R1-Time8,hBFA1-SC-R1-Time16,hBFA1-SC-R1-Time24,hBFA1-SC-R1-Time40,hBFA1-SC-R2-Time8,hBFA1-SC-R2-Time16,hBFA1-SC-R2-Time24,hBFA1-SC-R2-Time40,hBFA1-YPD-R1-Time8,hBFA1-YPD-R1-Time16,hBFA1-YPD-R1-Time24,hBFA1-YPD-R1-Time40,hBFA1-YPD-R2-Time8,hBFA1-YPD-R2-Time16,hBFA1-YPD-R2-Time24,hBFA1-YPD-R2-Time40,hBFA1-pH3_8-R1-Time8,hBFA1-pH3_8-R1-Time16,hBFA1-pH3_8-R1-Time24,hBFA1-pH3_8-R1-Time40,hBFA1-pH3_8-R2-Time8,hBFA1-pH3_8-R2-Time16,hBFA1-pH3_8-R2-Time24,hBFA1-pH3_8-R2-Time40,hBFA1-pH7_3-R1-Time8,hBFA1-pH7_3-R1-Time16,hBFA1-pH7_3-R1-Time24,hBFA1-pH7_3-R1-Time40,hBFA1-pH7_3-R2-Time8,hBFA1-pH7_3-R2-Time16,hBFA1-pH7_3-R2-Time24,hBFA1-pH7_3-R2-Time40
0,18075,34976,73925,379540,19395,33224,67414,473720,36413,72215,120048,323260,35264,76376,128764,395642,58079,106586,153886,503009,90272,138272,185857,392937,8826,690,2310,1302,64526,101861,89724,214115,78087,225592,286933,220946,17610,35231,67572,200094,23871,33046,57404,199577,27936,81462,285233,414060,35589,93249,152324,443061,39397,85853,153919,323697,23365,93624,172839,440061,41107,58049,118324,236402,42332,71938,96247,239779
1,109316,117037,102934,81035,122720,113384,95658,83279,140343,131053,103226,65894,161542,152263,127606,76375,81672,92562,84147,92839,136639,119851,95640,78356,29924,5794,8136,2655,56206,30550,14705,4073,61333,67183,32110,4239,106152,94024,87216,58286,140094,108014,93791,66511,79590,92123,131488,41442,104783,94953,74508,47270,146785,130842,119500,67517,95990,145530,134678,89493,204961,166806,184853,154540,198665,187896,164838,152973
2,6539,4508,4751,3545,7334,4549,4273,3511,7771,6269,9488,11685,12360,7577,5588,13405,5796,4743,18323,22243,12088,1461,18113,13966,3328,204,1385,1180,45206,135292,147221,699111,45299,251626,357161,639549,8667,11719,17075,37643,5744,10672,18433,39563,9170,20739,61359,63627,10914,23059,31036,64681,11494,15972,30293,58740,14528,17241,33489,79132,13665,13724,15892,19474,13758,17580,19743,23223
3,31099,42058,42356,38478,39228,41195,42935,51296,32634,23369,18619,7488,40855,28633,17674,7408,34917,56809,34105,67315,50951,64354,41741,33191,40379,15074,39713,72953,19617,9119,979,1747,20106,25193,11424,1820,23195,22156,21883,18536,28218,26482,27479,23968,18192,26502,43783,15863,23146,28682,23235,21635,24144,31572,35862,35065,28131,34743,44411,42641,35504,28900,33550,29448,39531,37391,32982,28658
4,4250,7508,13340,61474,4878,7708,13778,94633,6576,14705,29220,76955,7438,15378,20278,80289,10741,25236,30171,100566,18520,25521,36808,66914,1721,74,431,258,17158,25083,11029,58151,18737,58559,68481,59882,3482,6305,11151,32390,4746,6657,11499,37696,6188,17762,58020,85510,7344,19560,30739,94122,6019,15079,29731,78756,5652,16425,36098,105253,8471,11161,22970,52473,8942,15606,21789,53481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2581,1,2,4,2,5,6,7,9,7,2,3,2,4,7,3,6,5,7,3,7,4,5,3,5,0,0,0,0,1,1,2,1,3,2,0,0,3,7,6,6,5,4,5,7,1,4,8,7,6,4,2,4,4,5,5,2,2,6,3,3,6,3,3,3,3,7,8,6
2582,3,3,5,1,7,5,5,5,4,0,4,3,2,3,5,4,3,1,5,3,8,11,4,4,0,1,0,0,3,3,0,1,4,1,2,0,6,1,4,1,2,8,6,3,4,5,6,3,5,3,3,4,5,2,3,0,0,5,4,2,4,7,10,3,7,4,6,4
2583,3,5,4,2,3,7,6,3,1,0,3,3,6,4,3,1,2,7,3,6,7,4,5,4,0,0,1,1,5,2,0,0,2,1,0,0,2,3,6,2,1,3,7,3,3,5,6,4,4,5,3,2,0,1,6,6,1,3,6,3,3,4,4,1,3,4,5,1
2584,5,4,4,2,8,6,2,13,0,0,4,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,10,7,0,0,0,2,0,0,0,0,0,0,1,0,0,0,12,7,6,2,10,7,5,1,0,0,0,0,18,0,0,0,0,0,0,0,1,0,0,0


#### Extract the metadata for variants from the variants dataset

In [8]:
variantsInfo = rawCountsTable[metadata_columns].copy()  # this .copy() prevents variantsInfo from being seen as a slice internally
variantsInfo = variantsInfo.rename(columns={'Full.BC':'barcode_full', 'Diverse.BC':'barcode_diverse', 'Environment.BC':'barcode_environment', 'Total.Counts':'total_counts',
                                            'Subpool.Environment':'subpool_environment', 'Which.Subpools':'which_subpools', 'Putative.Environment':'putative_environment'})
variantsInfo

Unnamed: 0,barcode_full,barcode_diverse,barcode_environment,total_counts,subpool_environment,which_subpools,putative_environment
0,ATAAAAAAGCACAAGCCTTTTGACGACACCTAAATTAGTTATCCAT...,ATAAAAAAGCACAAGCCTTTTGACGA,CACCTAAATTAGTTATCCATTCGGCT,9616313,not_read,not_read,SC_2N
1,CGATAAACCCACAATATGATTATCGGTACACAAGAGGGTTTCTGTT...,CGATAAACCCACAATATGATTATCGG,TACACAAGAGGGTTTCTGTTTTATGG,6648391,not_read,not_read,SC_alpha
2,CGCACAAGAAAGAATAATCTTGAATTGGTAAAACCAGATTTGGCAT...,CGCACAAGAAAGAATAATCTTGAATT,GGTAAAACCAGATTTGGCATTCACTA,3368497,GlyEtOH_alpha,-R1-Subpool.-R1-autodiploids,GlyEtOH_alpha
3,ACAAAAAGATATAACAAGCTTGAAGACCCGAAAAAGTTTTTTATCT...,ACAAAAAGATATAACAAGCTTGAAGA,CCCGAAAAAGTTTTTTATCTTCAATG,2082320,FLC4_alpha,-R1-Subpool,FLC4_alpha
4,AATAAAAGAAGGAAAAGCATTTAAACAAACAAACTTTCTTTTTTCT...,AATAAAAGAAGGAAAAGCATTTAAAC,AAACAAACTTTCTTTTTTCTTTATGC,2013526,YPD_alpha,-R1-Subpool,contam-pH3_8_2N
...,...,...,...,...,...,...,...
2581,CACGCAAAACTTTATTCAATAACTTCGTCCGCCAATCATATTGTAC...,CACGCAAAACTTTATTCAATAACTTCGT,CCGCCAATCATATTGTACTTTCAAGG,267,not_read,not_read,SC_alpha
2582,CTATGAAAAGACAACAAGATTGCCTACTATTAACTGATTTTTCTCT...,CTATGAAAAGACAACAAGATTGCCTA,CTATTAACTGATTTTTCTCTTTCGAA,243,pH7_3_alpha,-R2-Subpool,sanger-pH7_3_alpha
2583,CGTTAGAATTCTAAACCTGGTTCAACTATTGGAACATCTTTAGTAT...,CGTTAGAATTCTAAACCTGGTTCAACT,ATTGGAACATCTTTAGTATTTAATAC,214,37C_alpha,-R1-Subpool,unknown
2584,CCGTCAATCATTAACTTAGTTTCTTACCAGAAATCATATTTTTATT...,CCGTCAATCATTAACTTAGTTTCTTA,CCAGAAATCATATTTTTATTTATTAA,139,21C_alpha,-R1-Subpool,21C_alpha


#### Parse treatment, rep, and timepoint information from sample labels (column names)

In [9]:
sample_columns_raw = rawCounts.columns
sample_columns_raw.values

array(['hBFA1-21C-R1-Time8', 'hBFA1-21C-R1-Time16', 'hBFA1-21C-R1-Time24',
       'hBFA1-21C-R1-Time40', 'hBFA1-21C-R2-Time8', 'hBFA1-21C-R2-Time16',
       'hBFA1-21C-R2-Time24', 'hBFA1-21C-R2-Time40', 'hBFA1-37C-R1-Time8',
       'hBFA1-37C-R1-Time16', 'hBFA1-37C-R1-Time24',
       'hBFA1-37C-R1-Time40', 'hBFA1-37C-R2-Time8', 'hBFA1-37C-R2-Time16',
       'hBFA1-37C-R2-Time24', 'hBFA1-37C-R2-Time40',
       'hBFA1-48Hr-R1-Time8', 'hBFA1-48Hr-R1-Time16',
       'hBFA1-48Hr-R1-Time24', 'hBFA1-48Hr-R1-Time40',
       'hBFA1-48Hr-R2-Time8', 'hBFA1-48Hr-R2-Time16',
       'hBFA1-48Hr-R2-Time24', 'hBFA1-48Hr-R2-Time40',
       'hBFA1-FLC4-R2-Time8', 'hBFA1-FLC4-R2-Time16',
       'hBFA1-FLC4-R2-Time24', 'hBFA1-FLC4-R2-Time40',
       'hBFA1-GlyEtOH-R1-Time8', 'hBFA1-GlyEtOH-R1-Time16',
       'hBFA1-GlyEtOH-R1-Time24', 'hBFA1-GlyEtOH-R1-Time40',
       'hBFA1-GlyEtOH-R2-Time8', 'hBFA1-GlyEtOH-R2-Time16',
       'hBFA1-GlyEtOH-R2-Time24', 'hBFA1-GlyEtOH-R2-Time40',
       'hBFA1-SC-R1-Time8

In [10]:
samplesInfo = []

sample_renames = {}

for sample in rawCounts.columns:
    sample_info = {}
    sample_info['chen_sample_label'] = sample
    sample_label_parts = sample.split('-')
    sample_info['assay_set'] = sample_label_parts[0]
    sample_info['treatment'] = '.'.join(sample_label_parts[1].split('_'))
    sample_info['rep']       = sample_label_parts[2]
    sample_info['time']    = sample_label_parts[3][4:]
    sample_info['assay']     = sample_info['assay_set'] + ':' + sample_info['treatment'] + '-' + sample_info['rep']
    sample_info['sample']    = sample_info['assay'] + '-T' + sample_info['time']
    sample_info['total_raw_count'] = rawCounts[sample].sum()
    sample_renames[sample] = sample_info['sample']
    samplesInfo.append(sample_info)
    
samplesInfo = pd.DataFrame(samplesInfo)

rawCounts = rawCounts.rename(columns=sample_renames)

In [11]:
samplesInfo['time'] = pd.to_numeric(samplesInfo['time'])

In [12]:
samplesInfo['time'] = samplesInfo['time'] / 8

In [13]:
# Sort dataframe for convenience:
samplesInfo.sort_values(by=['assay_set', 'treatment', 'rep', 'time'], ascending=[True, True, True, True], ignore_index=True, inplace=True)

In [14]:
# Reorder columns for cosmetic reasons
samplesInfo = samplesInfo[['chen_sample_label', 'sample', 'assay', 'assay_set', 'treatment', 'rep', 'time', 'total_raw_count']]

In [15]:
samplesInfo

Unnamed: 0,chen_sample_label,sample,assay,assay_set,treatment,rep,time,total_raw_count
0,hBFA1-21C-R1-Time8,hBFA1:21C-R1-T8,hBFA1:21C-R1,hBFA1,21C,R1,1.0,1917150
1,hBFA1-21C-R1-Time16,hBFA1:21C-R1-T16,hBFA1:21C-R1,hBFA1,21C,R1,2.0,2168035
2,hBFA1-21C-R1-Time24,hBFA1:21C-R1-T24,hBFA1:21C-R1,hBFA1,21C,R1,3.0,2068099
3,hBFA1-21C-R1-Time40,hBFA1:21C-R1-T40,hBFA1:21C-R1,hBFA1,21C,R1,5.0,2118452
4,hBFA1-21C-R2-Time8,hBFA1:21C-R2-T8,hBFA1:21C-R2,hBFA1,21C,R2,1.0,2111700
...,...,...,...,...,...,...,...,...
63,hBFA1-pH7_3-R1-Time40,hBFA1:pH7.3-R1-T40,hBFA1:pH7.3-R1,hBFA1,pH7.3,R1,5.0,2719186
64,hBFA1-pH7_3-R2-Time8,hBFA1:pH7.3-R2-T8,hBFA1:pH7.3-R2,hBFA1,pH7.3,R2,1.0,2757245
65,hBFA1-pH7_3-R2-Time16,hBFA1:pH7.3-R2-T16,hBFA1:pH7.3-R2,hBFA1,pH7.3,R2,2.0,2680456
66,hBFA1-pH7_3-R2-Time24,hBFA1:pH7.3-R2-T24,hBFA1:pH7.3-R2,hBFA1,pH7.3,R2,3.0,2585981


----------
#### Filter samples by total counts

In [16]:
# Create a column in samplesInfo to indicate which samples are excluded from analysis.
samplesInfo['exclude_from_analysis'] = False

In [17]:
MIN_SAMPLE_COUNT_TOTAL = 1e5

In [18]:
# Exclude treatments that have one or more samples with total count below threshold
good_coverage_samples = []
for treatment in samplesInfo['treatment'].unique():
    treatmentInfo = samplesInfo[samplesInfo['treatment'] == treatment]
    if(np.all((treatmentInfo['time'] == 0) | (treatmentInfo['total_raw_count'] >= MIN_SAMPLE_COUNT_TOTAL))):
        good_coverage_samples.extend( treatmentInfo['sample'].values )
    else:
        samplesInfo.loc[samplesInfo['treatment'] == treatment, 'exclude_from_analysis'] = True

In [19]:
rawCounts   = rawCounts[good_coverage_samples]

----------
#### Manually exclude assays/samples

In [20]:
samplesInfo.loc[samplesInfo['sample'] == 'hBFA1:FLC4-R2-T16', 'exclude_from_analysis'] = True     # excluded from Chen et al paper due to presumed strong GC bias effects
# samplesInfo.loc[samplesInfo['sample'] == 'hBFA1:37C-R2-T24', 'exclude_from_analysis'] = False      # excluded from Chen et al paper due to presumed strong GC bias effects
# samplesInfo.loc[samplesInfo['sample'] == 'hBFA1:pH3.8-R2-T16', 'exclude_from_analysis'] = False    # excluded from Chen et al paper due to presumed strong GC bias effects
samplesInfo.loc[samplesInfo['sample'] == 'hBFA1:GlyEtOH-R1-T24', 'exclude_from_analysis'] = True  # excluded from Chen et al paper due to presumed strong GC bias effects

In [21]:
# Drop excluded samples from the data set
samplesInfo = samplesInfo[~samplesInfo['exclude_from_analysis']]
samplesInfo = samplesInfo.drop(columns=['exclude_from_analysis'])

------

#### Get barcode sequence along with GC-content for each variant

In [22]:
variants_GCratios = []
for idx, variant_info in variantsInfo.iterrows():
    variant_barcodeSeq = variant_info['barcode_full']
    variant_barcodeGCratio = (variant_barcodeSeq.count('G') + variant_barcodeSeq.count('C'))/len(variant_barcodeSeq)
    variants_GCratios.append(variant_barcodeGCratio)
#----------------
variantsInfo['barcode_GCratio']  = variants_GCratios
variantsInfo

Unnamed: 0,barcode_full,barcode_diverse,barcode_environment,total_counts,subpool_environment,which_subpools,putative_environment,barcode_GCratio
0,ATAAAAAAGCACAAGCCTTTTGACGACACCTAAATTAGTTATCCAT...,ATAAAAAAGCACAAGCCTTTTGACGA,CACCTAAATTAGTTATCCATTCGGCT,9616313,not_read,not_read,SC_2N,0.365385
1,CGATAAACCCACAATATGATTATCGGTACACAAGAGGGTTTCTGTT...,CGATAAACCCACAATATGATTATCGG,TACACAAGAGGGTTTCTGTTTTATGG,6648391,not_read,not_read,SC_alpha,0.384615
2,CGCACAAGAAAGAATAATCTTGAATTGGTAAAACCAGATTTGGCAT...,CGCACAAGAAAGAATAATCTTGAATT,GGTAAAACCAGATTTGGCATTCACTA,3368497,GlyEtOH_alpha,-R1-Subpool.-R1-autodiploids,GlyEtOH_alpha,0.346154
3,ACAAAAAGATATAACAAGCTTGAAGACCCGAAAAAGTTTTTTATCT...,ACAAAAAGATATAACAAGCTTGAAGA,CCCGAAAAAGTTTTTTATCTTCAATG,2082320,FLC4_alpha,-R1-Subpool,FLC4_alpha,0.288462
4,AATAAAAGAAGGAAAAGCATTTAAACAAACAAACTTTCTTTTTTCT...,AATAAAAGAAGGAAAAGCATTTAAAC,AAACAAACTTTCTTTTTTCTTTATGC,2013526,YPD_alpha,-R1-Subpool,contam-pH3_8_2N,0.230769
...,...,...,...,...,...,...,...,...
2581,CACGCAAAACTTTATTCAATAACTTCGTCCGCCAATCATATTGTAC...,CACGCAAAACTTTATTCAATAACTTCGT,CCGCCAATCATATTGTACTTTCAAGG,267,not_read,not_read,SC_alpha,0.370370
2582,CTATGAAAAGACAACAAGATTGCCTACTATTAACTGATTTTTCTCT...,CTATGAAAAGACAACAAGATTGCCTA,CTATTAACTGATTTTTCTCTTTCGAA,243,pH7_3_alpha,-R2-Subpool,sanger-pH7_3_alpha,0.307692
2583,CGTTAGAATTCTAAACCTGGTTCAACTATTGGAACATCTTTAGTAT...,CGTTAGAATTCTAAACCTGGTTCAACT,ATTGGAACATCTTTAGTATTTAATAC,214,37C_alpha,-R1-Subpool,unknown,0.301887
2584,CCGTCAATCATTAACTTAGTTTCTTACCAGAAATCATATTTTTATT...,CCGTCAATCATTAACTTAGTTTCTTA,CCAGAAATCATATTTTTATTTATTAA,139,21C_alpha,-R1-Subpool,21C_alpha,0.230769


------

#### Label control set variants ("putatively neutral barcodes")

In [23]:
tableWithNeutInfo = pd.read_csv(rawDataDir+'hBFA1_all_freqs_tidy.csv')
# tableWithNeutInfo

In [24]:
putativeNeutralBarcodes = tableWithNeutInfo[tableWithNeutInfo['Putative_Neutral'] == True]['Barcode'].values
putativeNeutralBarcodeInfo = pd.DataFrame(index=range(len(putativeNeutralBarcodes)))
putativeNeutralBarcodeInfo['barcode_full']        = [bc.split('_')[0] + bc.split('_')[-1] for bc in putativeNeutralBarcodes]
putativeNeutralBarcodeInfo['barcode_diverse']     = [bc.split('_')[0] for bc in putativeNeutralBarcodes]
putativeNeutralBarcodeInfo['barcode_environment'] = [bc.split('_')[-1] for bc in putativeNeutralBarcodes]
# putativeNeutralBarcodeInfo

In [25]:
variantsInfo['neutral_group'] = ''
variantsInfo.loc[variantsInfo['barcode_full'].isin(putativeNeutralBarcodeInfo['barcode_full'].values), 'neutral_group'] = 'putative_neutral'
variantsInfo['control_set'] = False
variantsInfo.loc[variantsInfo['barcode_full'].isin(putativeNeutralBarcodeInfo['barcode_full'].values), 'control_set'] = True

# This was my way of approximating their putative neutral group before they sent me the datafile with Putative_Neutral column (mostly agreement):
# variantsInfo['neutral_group'] = ''
# variantsInfo.loc[variantsInfo['putative_environment'] == 'YPD_alpha', 'neutral_group'] = 'YPD_alpha'
# variantsInfo['control_set'] = False
# variantsInfo.loc[variantsInfo['putative_environment'] == 'YPD_alpha', 'control_set'] = True

In [26]:
print(f"{np.sum(variantsInfo['control_set'].values)}/{len(variantsInfo)}")

288/2586


#### Save processed data to file

In [27]:
samplesInfo.to_csv(outdir+'samples.csv', index=False)
samplesInfo

Unnamed: 0,chen_sample_label,sample,assay,assay_set,treatment,rep,time,total_raw_count
0,hBFA1-21C-R1-Time8,hBFA1:21C-R1-T8,hBFA1:21C-R1,hBFA1,21C,R1,1.0,1917150
1,hBFA1-21C-R1-Time16,hBFA1:21C-R1-T16,hBFA1:21C-R1,hBFA1,21C,R1,2.0,2168035
2,hBFA1-21C-R1-Time24,hBFA1:21C-R1-T24,hBFA1:21C-R1,hBFA1,21C,R1,3.0,2068099
3,hBFA1-21C-R1-Time40,hBFA1:21C-R1-T40,hBFA1:21C-R1,hBFA1,21C,R1,5.0,2118452
4,hBFA1-21C-R2-Time8,hBFA1:21C-R2-T8,hBFA1:21C-R2,hBFA1,21C,R2,1.0,2111700
...,...,...,...,...,...,...,...,...
63,hBFA1-pH7_3-R1-Time40,hBFA1:pH7.3-R1-T40,hBFA1:pH7.3-R1,hBFA1,pH7.3,R1,5.0,2719186
64,hBFA1-pH7_3-R2-Time8,hBFA1:pH7.3-R2-T8,hBFA1:pH7.3-R2,hBFA1,pH7.3,R2,1.0,2757245
65,hBFA1-pH7_3-R2-Time16,hBFA1:pH7.3-R2-T16,hBFA1:pH7.3-R2,hBFA1,pH7.3,R2,2.0,2680456
66,hBFA1-pH7_3-R2-Time24,hBFA1:pH7.3-R2-T24,hBFA1:pH7.3-R2,hBFA1,pH7.3,R2,3.0,2585981


In [28]:
variantsInfo.to_csv(outdir+'variants.csv', index=False)
variantsInfo

Unnamed: 0,barcode_full,barcode_diverse,barcode_environment,total_counts,subpool_environment,which_subpools,putative_environment,barcode_GCratio,neutral_group,control_set
0,ATAAAAAAGCACAAGCCTTTTGACGACACCTAAATTAGTTATCCAT...,ATAAAAAAGCACAAGCCTTTTGACGA,CACCTAAATTAGTTATCCATTCGGCT,9616313,not_read,not_read,SC_2N,0.365385,,False
1,CGATAAACCCACAATATGATTATCGGTACACAAGAGGGTTTCTGTT...,CGATAAACCCACAATATGATTATCGG,TACACAAGAGGGTTTCTGTTTTATGG,6648391,not_read,not_read,SC_alpha,0.384615,,False
2,CGCACAAGAAAGAATAATCTTGAATTGGTAAAACCAGATTTGGCAT...,CGCACAAGAAAGAATAATCTTGAATT,GGTAAAACCAGATTTGGCATTCACTA,3368497,GlyEtOH_alpha,-R1-Subpool.-R1-autodiploids,GlyEtOH_alpha,0.346154,,False
3,ACAAAAAGATATAACAAGCTTGAAGACCCGAAAAAGTTTTTTATCT...,ACAAAAAGATATAACAAGCTTGAAGA,CCCGAAAAAGTTTTTTATCTTCAATG,2082320,FLC4_alpha,-R1-Subpool,FLC4_alpha,0.288462,,False
4,AATAAAAGAAGGAAAAGCATTTAAACAAACAAACTTTCTTTTTTCT...,AATAAAAGAAGGAAAAGCATTTAAAC,AAACAAACTTTCTTTTTTCTTTATGC,2013526,YPD_alpha,-R1-Subpool,contam-pH3_8_2N,0.230769,putative_neutral,True
...,...,...,...,...,...,...,...,...,...,...
2581,CACGCAAAACTTTATTCAATAACTTCGTCCGCCAATCATATTGTAC...,CACGCAAAACTTTATTCAATAACTTCGT,CCGCCAATCATATTGTACTTTCAAGG,267,not_read,not_read,SC_alpha,0.370370,,False
2582,CTATGAAAAGACAACAAGATTGCCTACTATTAACTGATTTTTCTCT...,CTATGAAAAGACAACAAGATTGCCTA,CTATTAACTGATTTTTCTCTTTCGAA,243,pH7_3_alpha,-R2-Subpool,sanger-pH7_3_alpha,0.307692,,False
2583,CGTTAGAATTCTAAACCTGGTTCAACTATTGGAACATCTTTAGTAT...,CGTTAGAATTCTAAACCTGGTTCAACT,ATTGGAACATCTTTAGTATTTAATAC,214,37C_alpha,-R1-Subpool,unknown,0.301887,,False
2584,CCGTCAATCATTAACTTAGTTTCTTACCAGAAATCATATTTTTATT...,CCGTCAATCATTAACTTAGTTTCTTA,CCAGAAATCATATTTTTATTTATTAA,139,21C_alpha,-R1-Subpool,21C_alpha,0.230769,,False


In [29]:
rawCounts.to_csv(outdir+'counts.csv', index=False)
rawCounts

Unnamed: 0,hBFA1:21C-R1-T8,hBFA1:21C-R1-T16,hBFA1:21C-R1-T24,hBFA1:21C-R1-T40,hBFA1:21C-R2-T8,hBFA1:21C-R2-T16,hBFA1:21C-R2-T24,hBFA1:21C-R2-T40,hBFA1:37C-R1-T8,hBFA1:37C-R1-T16,hBFA1:37C-R1-T24,hBFA1:37C-R1-T40,hBFA1:37C-R2-T8,hBFA1:37C-R2-T16,hBFA1:37C-R2-T24,hBFA1:37C-R2-T40,hBFA1:48Hr-R1-T8,hBFA1:48Hr-R1-T16,hBFA1:48Hr-R1-T24,hBFA1:48Hr-R1-T40,hBFA1:48Hr-R2-T8,hBFA1:48Hr-R2-T16,hBFA1:48Hr-R2-T24,hBFA1:48Hr-R2-T40,hBFA1:FLC4-R2-T8,hBFA1:FLC4-R2-T16,hBFA1:FLC4-R2-T24,hBFA1:FLC4-R2-T40,hBFA1:GlyEtOH-R1-T8,hBFA1:GlyEtOH-R1-T16,hBFA1:GlyEtOH-R1-T24,hBFA1:GlyEtOH-R1-T40,hBFA1:GlyEtOH-R2-T8,hBFA1:GlyEtOH-R2-T16,hBFA1:GlyEtOH-R2-T24,hBFA1:GlyEtOH-R2-T40,hBFA1:SC-R1-T8,hBFA1:SC-R1-T16,hBFA1:SC-R1-T24,hBFA1:SC-R1-T40,hBFA1:SC-R2-T8,hBFA1:SC-R2-T16,hBFA1:SC-R2-T24,hBFA1:SC-R2-T40,hBFA1:YPD-R1-T8,hBFA1:YPD-R1-T16,hBFA1:YPD-R1-T24,hBFA1:YPD-R1-T40,hBFA1:YPD-R2-T8,hBFA1:YPD-R2-T16,hBFA1:YPD-R2-T24,hBFA1:YPD-R2-T40,hBFA1:pH3.8-R1-T8,hBFA1:pH3.8-R1-T16,hBFA1:pH3.8-R1-T24,hBFA1:pH3.8-R1-T40,hBFA1:pH3.8-R2-T8,hBFA1:pH3.8-R2-T16,hBFA1:pH3.8-R2-T24,hBFA1:pH3.8-R2-T40,hBFA1:pH7.3-R1-T8,hBFA1:pH7.3-R1-T16,hBFA1:pH7.3-R1-T24,hBFA1:pH7.3-R1-T40,hBFA1:pH7.3-R2-T8,hBFA1:pH7.3-R2-T16,hBFA1:pH7.3-R2-T24,hBFA1:pH7.3-R2-T40
0,18075,34976,73925,379540,19395,33224,67414,473720,36413,72215,120048,323260,35264,76376,128764,395642,58079,106586,153886,503009,90272,138272,185857,392937,8826,690,2310,1302,64526,101861,89724,214115,78087,225592,286933,220946,17610,35231,67572,200094,23871,33046,57404,199577,27936,81462,285233,414060,35589,93249,152324,443061,39397,85853,153919,323697,23365,93624,172839,440061,41107,58049,118324,236402,42332,71938,96247,239779
1,109316,117037,102934,81035,122720,113384,95658,83279,140343,131053,103226,65894,161542,152263,127606,76375,81672,92562,84147,92839,136639,119851,95640,78356,29924,5794,8136,2655,56206,30550,14705,4073,61333,67183,32110,4239,106152,94024,87216,58286,140094,108014,93791,66511,79590,92123,131488,41442,104783,94953,74508,47270,146785,130842,119500,67517,95990,145530,134678,89493,204961,166806,184853,154540,198665,187896,164838,152973
2,6539,4508,4751,3545,7334,4549,4273,3511,7771,6269,9488,11685,12360,7577,5588,13405,5796,4743,18323,22243,12088,1461,18113,13966,3328,204,1385,1180,45206,135292,147221,699111,45299,251626,357161,639549,8667,11719,17075,37643,5744,10672,18433,39563,9170,20739,61359,63627,10914,23059,31036,64681,11494,15972,30293,58740,14528,17241,33489,79132,13665,13724,15892,19474,13758,17580,19743,23223
3,31099,42058,42356,38478,39228,41195,42935,51296,32634,23369,18619,7488,40855,28633,17674,7408,34917,56809,34105,67315,50951,64354,41741,33191,40379,15074,39713,72953,19617,9119,979,1747,20106,25193,11424,1820,23195,22156,21883,18536,28218,26482,27479,23968,18192,26502,43783,15863,23146,28682,23235,21635,24144,31572,35862,35065,28131,34743,44411,42641,35504,28900,33550,29448,39531,37391,32982,28658
4,4250,7508,13340,61474,4878,7708,13778,94633,6576,14705,29220,76955,7438,15378,20278,80289,10741,25236,30171,100566,18520,25521,36808,66914,1721,74,431,258,17158,25083,11029,58151,18737,58559,68481,59882,3482,6305,11151,32390,4746,6657,11499,37696,6188,17762,58020,85510,7344,19560,30739,94122,6019,15079,29731,78756,5652,16425,36098,105253,8471,11161,22970,52473,8942,15606,21789,53481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2581,1,2,4,2,5,6,7,9,7,2,3,2,4,7,3,6,5,7,3,7,4,5,3,5,0,0,0,0,1,1,2,1,3,2,0,0,3,7,6,6,5,4,5,7,1,4,8,7,6,4,2,4,4,5,5,2,2,6,3,3,6,3,3,3,3,7,8,6
2582,3,3,5,1,7,5,5,5,4,0,4,3,2,3,5,4,3,1,5,3,8,11,4,4,0,1,0,0,3,3,0,1,4,1,2,0,6,1,4,1,2,8,6,3,4,5,6,3,5,3,3,4,5,2,3,0,0,5,4,2,4,7,10,3,7,4,6,4
2583,3,5,4,2,3,7,6,3,1,0,3,3,6,4,3,1,2,7,3,6,7,4,5,4,0,0,1,1,5,2,0,0,2,1,0,0,2,3,6,2,1,3,7,3,3,5,6,4,4,5,3,2,0,1,6,6,1,3,6,3,3,4,4,1,3,4,5,1
2584,5,4,4,2,8,6,2,13,0,0,4,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,10,7,0,0,0,2,0,0,0,0,0,0,1,0,0,0,12,7,6,2,10,7,5,1,0,0,0,0,18,0,0,0,0,0,0,0,1,0,0,0
