# Growth Rate Calculations for the Pairwise Library

The following code converts raw sgRNA counts to relative growth rates for the entire pairwise sgRNA library. This is performed by normalizing raw counts to relative counts and fitting a line to the log$_2$(Relative Counts) vs. Generations relationship. The code also removes escapers, averages replicates, and rescales growth rates to an intuitive, non-negative scale.

6/28/22 - Ryan Otto

### Import packages and libraries

In [None]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import r2_score
import plot_defaults
plot_defaults.change_defaults()

### Define relevant variables

In [None]:
# Output settings
date = '220815'  # Date stamp used in output file names
figure_path = 'Figures'  # Path for figure output

# Data locations
file_path = 'intermediate_files'  # Path for data files
input_path = 'input_files'  # Path for input files

# Experimental design
TPs = list(range(0, 16, 2))  # Timepoints (hours): 0, 2, ..., 14
gene_names = [
    'dapA', 'dapB', 'gdhA', 'gltB', 'folA', 'thyA', 'glyA', 'purN', 'purL',
]  # Genes whose sgRNAs are loaded from the qPCR data
BC_list = ['TGAAAG', 'CCATGC', 'CATGAT']  # Barcodes identifying internal replicates

### Import data 

In [None]:
# NOTE: pickle files are only safe to load when they come from a trusted source
# Turbidostat growth rates; the 'pairwise' entry rescales timepoints to generations
with open(f'{input_path}/turbidostat_GR.pickle', 'rb') as handle:
    turb_gr = pickle.load(handle)
gr_TPs = np.array([turb_gr['pairwise'] * hours for hours in TPs])

# sgRNA counts
with open(f'{file_path}/220815_qscore30_sgRNA_counts.pickle', 'rb') as handle:
    pairwise_dict = pickle.load(handle)

# qPCR data: mean values and their SEM
with open(f'{file_path}/220815_repression_mean.pickle', 'rb') as handle:
    qPCR_vals = pickle.load(handle)
with open(f'{file_path}/220815_repression_sem.pickle', 'rb') as handle:
    qPCR_sem = pickle.load(handle)

# Master sgRNA ordering: the nontargeting control first, then each gene's sgRNAs
# sorted by ascending qPCR value against their target
sgRNA_list = ['negC_rand_42']
for gene in gene_names:
    ranked = sorted(qPCR_vals[gene], key=qPCR_vals[gene].get)
    match_sgRNAs = [sg for sg in ranked if sg != 'negC_rand_42']
    sgRNA_list.extend(match_sgRNAs)

# Prior data
with open(f'{input_path}/20200923_glu_growth_rates_TableS5.csv', 'rb') as handle:
    prior_gr_data = pd.read_csv(handle, index_col=[0])

### Define analysis functions

In [None]:
def growth_rate(sgRNA1, sgRNA2, gr_dict, count_dict, TPs_hours, TPs_gens, BC_ID):
    """Fits a line to log2(relative frequency) vs. generations for one barcoded sgRNA construct
    Arguments:
    sgRNA1: Name of position 1 sgRNA
    sgRNA2: Name of position 2 sgRNA
    gr_dict: Dictionary containing log2-transformed relative frequencies
    count_dict: Dictionary containing CRISPRi construct counts at each timepoint
    TPs_hours: Timepoints (hours)
    TPs_gens: Timepoints (generations)
    BC_ID: Barcode identity
    Returns:
    m: Growth rate (slope) of CRISPRi construct, or nan if fewer than three points remain
    b: Intercept of log2(relative frequency) to generations best fit line, or nan
    """
    log_freqs = np.array([gr_dict[TP][BC_ID].loc[sgRNA1, sgRNA2] for TP in TPs_hours])
    raw_counts = np.array([count_dict[TP][BC_ID].loc[sgRNA1, sgRNA2] for TP in TPs_hours])
    # Locate the first timepoint whose counts fall below the minimum threshold of 10
    first_low = next((idx for idx, n_reads in enumerate(raw_counts) if n_reads < 10), None)
    if first_low is not None:
        # Keep the low-count point only when its log2 frequency is defined; drop everything after it
        keep = first_low if np.isnan(log_freqs[first_low]) else first_low + 1
        log_freqs = log_freqs[:keep]
        TPs_gens = TPs_gens[:keep]
    if len(TPs_gens) < 3:  # Don't fit fewer than three points
        return np.nan, np.nan
    fit = stats.linregress(TPs_gens, log_freqs)
    return fit[0], fit[1]


def plot_correlation(x_list, y_list, x_label, y_label, min_value, max_value, file_name=None, path=None):
    """Function for making standard correlation plots
    Draws a square scatter plot with a dashed identity line and annotates it with R^2,
    computed on the points where both coordinates are present (non-nan).
    Arguments:
    x_list: x-axis data coordinates
    y_list: y-axis data coordinates
    x_label: x-axis label
    y_label: y-axis label
    min_value: Minimum value on graph (both axes)
    max_value: Maximum value on graph (both axes)
    file_name: Name of saved file (without extension); if not given, the figure is only shown
    path: Path for figure output; if None, the figure is saved in the working directory
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(x_list, y_list, color='xkcd:dark gray', alpha=0.1)
    ax.plot([min_value, max_value], [min_value, max_value], ls='--', color='xkcd:gray', lw=1, zorder=0)
    ax.set_xlim(min_value, max_value)
    ax.set_ylim(min_value, max_value)
    ax.set_xlabel(x_label, fontsize=16)
    ax.set_ylabel(y_label, fontsize=16)
    x_list, y_list = np.array(x_list), np.array(y_list)
    mask1 = ~np.isnan(x_list) & ~np.isnan(y_list)  # Mask for shared values
    x_masked, y_masked = x_list[mask1], y_list[mask1]
    # r2_score is not symmetric: x values are passed as its first argument, y values as its second
    ax.text(max_value-0.5, min_value+0.1, f'R$^2$ = {np.round(r2_score(x_masked, y_masked), 2)}', fontsize=16)
    if file_name:
        # Without an output folder, save next to the notebook rather than into a folder named "None"
        out_file = f'{file_name}.pdf' if path is None else f'{path}/{file_name}.pdf'
        plt.savefig(out_file)
    plt.show()
    return


def qtest_dixon(replicates):
    """One-sided Dixon Q test at 95% confidence: Statistical test used for identifying outliers in a data set
    https://www.philadelphia.edu.jo/academics/ajaber/uploads/0501522-Chapter%203-Statiscal%20tests.pdf
    Used here to identify CRISPRi escapers: replicates with abnormally fast growth rates
    Only the largest value is tested, and only when at least four replicates are supplied.
    Argument:
    replicates: List of growth rate replicates to test for an outlier
    Returns:
    Sorted replicates without the largest value if it was flagged as an escaper; otherwise the input unchanged
    The escaper value if one was found; otherwise nan
    """
    # Critical Q values indexed by (number of replicates - 1); one-sided, 95% confidence
    critical_q = [0, 0, 0.941, 0.765, 0.642, 0.560]
    n_reps = len(replicates)
    if n_reps < 4:
        return replicates, np.nan
    ordered = sorted(replicates)
    smallest, runner_up, largest = ordered[0], ordered[-2], ordered[-1]
    spread = largest - smallest
    if spread == 0:  # If all values are equal, can't run this test
        return replicates, np.nan
    q_stat = abs(runner_up - largest) / spread
    if q_stat >= critical_q[n_reps - 1]:
        return ordered[:-1], largest
    return replicates, np.nan

### Calculate relative sgRNA frequency and growth rates

We first normalize raw counts by the nontargeting construct at each timepoint. Then we normalize relative frequencies at every timepoint by the relative frequency at $T_0$. For all constructs with counts present through the first three timepoints, we fit a line to $log_2$(relative frequency) vs. time data. The slope of this line is the construct's relative growth rate effect.

$\text{relative frequency}_{a,T} = \frac{\text{counts}_{a,T} / \text{counts}_{Nont,T}}{\text{counts}_{a,T_0} / \text{counts}_{Nont,T_0}}$

In [None]:
# Each dictionary is keyed as dict[timepoint][barcode] and holds a DataFrame
# with position 1 sgRNAs as rows and position 2 sgRNAs as columns.
# Whole tables are processed at once instead of one .loc lookup per construct.
nont_norm_dict, T0_norm_dict, df_normed = {}, {}, {}
for TP in TPs:
    nont_norm_dict[TP], T0_norm_dict[TP], df_normed[TP] = {}, {}, {}
    for BC in BC_list:
        counts = pairwise_dict[TP][BC]
        raw = counts.loc[sgRNA_list, sgRNA_list].to_numpy(dtype=float)
        # Normalize to Nontargeting control
        nont = raw / counts.loc['negC_rand_42', 'negC_rand_42']
        nont_norm_dict[TP][BC] = pd.DataFrame(nont, sgRNA_list, sgRNA_list)
        # Normalize to T0; constructs whose T0 value is zero stay nan
        nont_T0 = nont_norm_dict[0][BC].to_numpy()
        T0_norm = np.full(nont.shape, np.nan)
        np.divide(nont, nont_T0, out=T0_norm, where=nont_T0 != 0)
        T0_norm_dict[TP][BC] = pd.DataFrame(T0_norm, sgRNA_list, sgRNA_list)
        # log2 transformation; constructs with zero relative frequency stay nan
        log_norm = np.full(nont.shape, np.nan)
        np.log2(T0_norm, out=log_norm, where=T0_norm != 0)
        df_normed[TP][BC] = pd.DataFrame(log_norm, sgRNA_list, sgRNA_list)

In [None]:
# Fit every sgRNA1-sgRNA2 construct separately for each barcode.
# growth_dict_BC holds the fitted slopes, b_dict_BC the fitted intercepts.
growth_dict_BC, b_dict_BC = {}, {}
n_sg = len(sgRNA_list)
for BC in BC_list:
    slopes = pd.DataFrame(np.full([n_sg, n_sg], np.nan), sgRNA_list, sgRNA_list)
    intercepts = pd.DataFrame(np.full([n_sg, n_sg], np.nan), sgRNA_list, sgRNA_list)
    growth_dict_BC[BC], b_dict_BC[BC] = slopes, intercepts
    for sgRNA1 in sgRNA_list:
        for sgRNA2 in sgRNA_list:
            fit_m, fit_b = growth_rate(sgRNA1, sgRNA2, df_normed, pairwise_dict, TPs, gr_TPs, BC)
            slopes.loc[sgRNA1, sgRNA2] = fit_m
            intercepts.loc[sgRNA1, sgRNA2] = fit_b

In [None]:
# Fig 1B: log2 relative frequency over generations for the nontargeting control and each
# thyA sgRNA (paired with the nontargeting sgRNA in position 1), first barcode only
sorted_colors = ['xkcd:dark gray', 'xkcd:cherry red', 'xkcd:sky blue', 'xkcd:forest green', 'xkcd:turquoise',
                 'xkcd:purple', 'xkcd:grass green', 'xkcd:sea blue', 'xkcd:dark yellow', 'xkcd:lilac']
fig, ax = plt.subplots(figsize=(5, 5))
thyA_sgRNA = [x for x in sgRNA_list if x == 'negC_rand_42' or 'thyA' in x]  # Extract thyA-targeting sgRNAs
first_BC = BC_list[0]
for i, sgRNA in enumerate(thyA_sgRNA):
    line_color = sorted_colors[i]
    rel_freq = [df_normed[TP][first_BC].loc['negC_rand_42', sgRNA] for TP in TPs]  # Extract frequencies over time
    ax.errorbar(gr_TPs, rel_freq, color=line_color, fmt='o')
    slope = growth_dict_BC[first_BC].loc['negC_rand_42', sgRNA]
    intercept = b_dict_BC[first_BC].loc['negC_rand_42', sgRNA]
    ax.plot(gr_TPs, [x*slope + intercept for x in gr_TPs], color=line_color)  # Best fit line
ax.text(0, -11.5, '$thyA$', fontsize=16)
ax.set_xlabel('Generations', fontsize=16)
ax.set_ylabel('log$_{2}$(Relative Frequency)', fontsize=16)
ax.set_xticks([0, 5, 10])
ax.set_yticks([-10, -5, 0])
plt.tight_layout()
plt.savefig(f'{figure_path}/Fig1B.pdf')
plt.show()

## Remove hand-annotated off-target sgRNAs

Two sgRNAs targeting gdhA, a nonessential gene in M9, showed significant growth defects across most backgrounds. We remove these sgRNAs prior to further analysis. The first sgRNA shows extremely inconsistent growth rate effects, due to a combination of off-target effects and low sgRNA numbers during sequencing. The second sgRNA shows a consistent growth defect; comparison with "nearby" sgRNAs (those with nearly matching sequences) shows that this sgRNA is a clear outlier attributable to off-target effects.

In [None]:
# Plot each hand-annotated off-target sgRNA paired with the nontargeting sgRNA (both orders,
# all barcodes), then remove it from growth_dict_BC and sgRNA_list.
# Re-running this cell is safe: sgRNAs that were already removed are skipped.
for off_target in ['gdhA_1_42_B_MM14', 'gdhA_3_216_B_MM8']:
    if off_target not in sgRNA_list:  # Already removed by a previous run
        continue
    fig, ax = plt.subplots(figsize=(5, 5))
    for BC in BC_list:
        ax.scatter(gr_TPs, [df_normed[TP][BC].loc[off_target, 'negC_rand_42'] for TP in TPs])
        ax.plot(gr_TPs, [x*growth_dict_BC[BC].loc[off_target, 'negC_rand_42']
                       + b_dict_BC[BC].loc[off_target, 'negC_rand_42'] for x in gr_TPs])  # Plot best fit line
        ax.scatter(gr_TPs, [df_normed[TP][BC].loc['negC_rand_42', off_target] for TP in TPs])
        ax.plot(gr_TPs, [x*growth_dict_BC[BC].loc['negC_rand_42', off_target]
                       + b_dict_BC[BC].loc['negC_rand_42', off_target] for x in gr_TPs])  # Plot best fit line
    ax.set_xticks([0, 5, 10])
    ax.set_xlabel('Generations', fontsize=14)
    ax.set_ylabel('log$_{2}$(Relative Frequency)', fontsize=14)
    ax.set_title(f'{off_target}-negC_rand_42', fontsize=14)
    plt.show()
    for BC in BC_list:
        # Remove the sgRNA's row and column from the growth rate table
        growth_dict_BC[BC].drop(index=off_target, columns=off_target, inplace=True)
    sgRNA_list.remove(off_target)  # Remove sgRNA from the master list

### Visualize example escaper

We chose one sgRNA pair that was identified during escaper correction to visualize. Each replicate is plotted individually. The replicate flagged as an escaper is plotted in red, and the replicates that passed escaper filtering are plotted in gray.

In [None]:
# Fig S3A: every replicate (barcode x sgRNA order) of one sgRNA pair.
# The last color is red, so the sgRNA2-sgRNA1 order of the third barcode is highlighted.
fig, ax = plt.subplots(figsize=(4, 4))
colors = ['xkcd:gray', 'xkcd:gray', 'xkcd:gray', 'xkcd:gray', 'xkcd:gray', 'r']
sgRNA1 = 'purL_1_25_B_MM14'
sgRNA2 = 'thyA_3_233_B_MM4'
for i, BC in enumerate(BC_list):
    orders = [(sgRNA1, sgRNA2, colors[2*i]), (sgRNA2, sgRNA1, colors[2*i+1])]
    for pos1, pos2, rep_color in orders:
        ax.scatter(gr_TPs, [df_normed[TP][BC].loc[pos1, pos2] for TP in TPs], s=80, color=rep_color)
        slope = growth_dict_BC[BC].loc[pos1, pos2]
        intercept = b_dict_BC[BC].loc[pos1, pos2]
        ax.plot(gr_TPs, [x*slope + intercept for x in gr_TPs], color=rep_color)  # Plot best fit line
ax.set_title(f'{sgRNA1} + {sgRNA2}', fontsize=18)
ax.set_xlabel('Generations', fontsize=18)
ax.set_ylabel('log$_{2}$(Relative Frequency)', fontsize=18)
plt.tight_layout()
plt.savefig(f'{figure_path}/FigS3A.pdf')
plt.show()

### Compare growth rates calculated from 14 hours of data vs. 10 hours

To optimize future experiments, we checked how much growth rate information was contained in the last two timepoints of the 14-hour experiment. To do this, we calculated growth rates using only the first 10 hours of data, using the same approach described above. Broadly speaking, there is not a significant change when downsampling the data in this way. Growth rates under ~0.5 are unchanged, as these sgRNA constructs were depleted from the library at or before the 10-hour timepoint, so their fits never included the later timepoints.

In [None]:
# Refit every construct with the last two timepoints left out; only the slopes are kept
growth_dict_BC_10, b_dict_BC_10 = {}, {}
short_TPs, short_gr_TPs = TPs[:-2], gr_TPs[:-2]
for BC in BC_list:
    slopes_10 = pd.DataFrame(np.full([len(sgRNA_list), len(sgRNA_list)], np.nan), sgRNA_list, sgRNA_list)
    growth_dict_BC_10[BC] = slopes_10
    for sgRNA1 in sgRNA_list:
        for sgRNA2 in sgRNA_list:
            fit_m, _ = growth_rate(sgRNA1, sgRNA2, df_normed, pairwise_dict, short_TPs, short_gr_TPs, BC)
            slopes_10.loc[sgRNA1, sgRNA2] = fit_m

In [None]:
# Collect growth rates that were fit successfully with both the full (14 hour)
# and the shortened (10 hour) time course
full_gr, short_gr = [], []
for BC in BC_list:
    full_vals = growth_dict_BC[BC].loc[sgRNA_list, sgRNA_list].to_numpy(dtype=float).ravel()
    short_vals = growth_dict_BC_10[BC].loc[sgRNA_list, sgRNA_list].to_numpy(dtype=float).ravel()
    both_present = ~np.isnan(full_vals) & ~np.isnan(short_vals)  # Ensure both values are present
    full_gr.extend(full_vals[both_present])
    short_gr.extend(short_vals[both_present])
full_gr, short_gr = np.array(full_gr), np.array(short_gr)
full_gr = (full_gr - full_gr.min()) / abs(full_gr.min())  # Rescale by each set's own minimum
short_gr = (short_gr - short_gr.min()) / abs(short_gr.min())  # Rescale by each set's own minimum
# x-axis carries the 10 hour fits and y-axis the 14 hour fits, so the labels follow that order
plot_correlation(short_gr, full_gr, 'GR - 10 Hour Fit', 'GR - 14 Hour Fit', 0, 1.33, 'FigS13', figure_path)

### Correlate growth rate data between barcodes and sgRNA orders

Internal replicates are barcoded, and sgRNA constructs should have comparable effects regardless of the barcode used. We see that this holds; the increase in dispersion at lower growth rates is expected as these data are fit from fewer sgRNA counts, and thus more susceptible to noise.

We then ensured that changing sgRNA order (whether a given sgRNA is in position 1 or position 2 in the construct) did not have a meaningful effect on growth rate. We see this holds as well. Note that the decrease in variation between sgRNA orders (when compared to deviation between barcodes) is expected, as the sgRNA order measurements are averaged across three barcoded replicates.

In [None]:
# Growth rates of every construct, listed in the same order for each of the three barcodes
construct_pairs = [(sgRNA1, sgRNA2) for sgRNA1 in sgRNA_list for sgRNA2 in sgRNA_list]
BC1 = [growth_dict_BC[BC_list[0]].loc[pos1, pos2] for pos1, pos2 in construct_pairs]
BC2 = [growth_dict_BC[BC_list[1]].loc[pos1, pos2] for pos1, pos2 in construct_pairs]
BC3 = [growth_dict_BC[BC_list[2]].loc[pos1, pos2] for pos1, pos2 in construct_pairs]
# Shared axis limits across the three panels, padded by 5%
all_barcodes = BC1 + BC2 + BC3
low_bound = np.nanmin(all_barcodes) * 1.05
high_bound = np.nanmax(all_barcodes) * 1.05
panels = [(BC1, BC2, 'BC1', 'BC2', 'FigS2A'),
          (BC1, BC3, 'BC1', 'BC3', 'FigS2B'),
          (BC2, BC3, 'BC2', 'BC3', 'FigS2C')]
for x_vals, y_vals, x_name, y_name, fig_name in panels:
    plot_correlation(x_vals, y_vals, x_name, y_name, low_bound, high_bound, fig_name, figure_path)

In [None]:
# Compare barcode-averaged growth rates between the two sgRNA orders of each construct
sg1_sg2, sg2_sg1 = [], []
for i, sgRNA1 in enumerate(sgRNA_list):
    for sgRNA2 in sgRNA_list[i+1:]:  # Only take non-redundant sgRNAs
        # All barcodes, order 1
        temp_order1 = np.array([growth_dict_BC[BC].loc[sgRNA1, sgRNA2] for BC in BC_list])
        # All barcodes, order 2
        temp_order2 = np.array([growth_dict_BC[BC].loc[sgRNA2, sgRNA1] for BC in BC_list])
        n_order1 = (~np.isnan(temp_order1)).sum()
        n_order2 = (~np.isnan(temp_order2)).sum()
        if n_order1 < 2 or n_order2 < 2:  # Skip constructs without sufficient data in both orders
            continue
        sg1_sg2.append(np.nanmean(temp_order1))
        sg2_sg1.append(np.nanmean(temp_order2))
both_orders = sg1_sg2 + sg2_sg1
low_bound = np.nanmin(both_orders) * 1.02
high_bound = np.nanmax(both_orders) * 1.2
plot_correlation(sg1_sg2, sg2_sg1, 'sgRNA1-sgRNA2', 'sgRNA2-sgRNA1', low_bound, high_bound, 'FigS2D', figure_path)

### Pool data by barcode and sgRNA order, remove escapers, and rescale growth rate data

We first pool growth rates by barcode and sgRNA order, then remove escapers using a one-sided Dixon Q-test at 95% confidence. After removing escapers, we average all constructs with at least four barcoded measurements remaining. Finally, we normalize each construct's mean growth rate by the minimum growth rate observed in the library, which rescales the nontargeting construct's growth rate to 1 and makes all growth rates non-negative.

In [None]:
# dict_growth_pool[sgRNA1][sgRNA2] collects every available measurement of a construct:
# one per barcode, plus the reversed sgRNA order when the two sgRNAs differ
dict_growth_pool = {}
for i, sgRNA1 in enumerate(sgRNA_list):
    dict_growth_pool[sgRNA1] = {}
    for sgRNA2 in sgRNA_list[i:]:
        measurements = []
        for BC in BC_list:
            measurements.append(growth_dict_BC[BC].loc[sgRNA1, sgRNA2])  # Pool by barcode
            if sgRNA1 != sgRNA2:  # If the construct has two distinct sgRNAs, pool by order
                measurements.append(growth_dict_BC[BC].loc[sgRNA2, sgRNA1])
        measurements = np.array(measurements, dtype=float)
        dict_growth_pool[sgRNA1][sgRNA2] = measurements[~np.isnan(measurements)]  # Remove nans

In [None]:
# Six empty (all-nan) tables: mean / SEM / std of the pooled growth rates,
# before ("pool") and after ("pool_filt") escaper removal
n_sg = len(sgRNA_list)
(df_growth_pool, df_growth_pool_sem, df_growth_pool_std,
 df_growth_pool_filt, df_growth_pool_filt_sem, df_growth_pool_filt_std) = (
    pd.DataFrame(np.full([n_sg, n_sg], np.nan), sgRNA_list, sgRNA_list) for _ in range(6))
escaper_dict = {}
for i, sgRNA1 in enumerate(sgRNA_list):
    for sgRNA2 in sgRNA_list[i:]:
        gr_values = dict_growth_pool[sgRNA1][sgRNA2]
        # Two distinct sgRNAs give up to 6 replicates (require 4); a repeated sgRNA gives up to 3 (require 2)
        reps_required = 2 if sgRNA1 == sgRNA2 else 4
        if len(gr_values) < reps_required:
            continue
        filt_gr_values, escaper_val = qtest_dixon(gr_values)  # Remove escapers
        if not np.isnan(escaper_val):
            escaper_dict[f'{sgRNA1}-{sgRNA2}'] = escaper_val  # Record escapers
        # Results are stored at [sgRNA2, sgRNA1], i.e. on and below the diagonal
        cell = (sgRNA2, sgRNA1)
        df_growth_pool.loc[cell] = np.mean(gr_values)
        df_growth_pool_sem.loc[cell] = stats.sem(gr_values)
        df_growth_pool_std.loc[cell] = np.std(gr_values)
        df_growth_pool_filt.loc[cell] = np.mean(filt_gr_values)
        df_growth_pool_filt_sem.loc[cell] = stats.sem(filt_gr_values)
        df_growth_pool_filt_std.loc[cell] = np.std(filt_gr_values)
# Number of non-nan per-barcode growth rates across all barcodes
total_measurements = sum([sum(~np.isnan(growth_dict_BC[BC].values.flatten())) for BC in BC_list])
n_escapers = len(escaper_dict)
print(f'{n_escapers} escapers')
print(f'{total_measurements} total measurements')
print(f'{np.round(n_escapers/total_measurements*100, 3)}% dropped')

In [None]:
# Rescale so the minimum observed growth rate maps to 0; spreads are only divided by |gr_min|
gr_min = np.nanmin(df_growth_pool_filt.values)  # Minimum observed growth rate
gr_scale = abs(gr_min)
df_growth_pool_filt_rescale = pd.DataFrame((df_growth_pool_filt.values - gr_min)/gr_scale, sgRNA_list, sgRNA_list)
df_growth_pool_filt_sem_rescale = pd.DataFrame(df_growth_pool_filt_sem.values/gr_scale, sgRNA_list, sgRNA_list)
df_growth_pool_filt_std_rescale = pd.DataFrame(df_growth_pool_filt_std.values/gr_scale, sgRNA_list, sgRNA_list)
# Epistasis = pairwise growth rate minus the product of the two single-sgRNA growth rates
# (each sgRNA paired with the nontargeting control); stored above the diagonal
simple_epistasis = pd.DataFrame(np.full([len(sgRNA_list), len(sgRNA_list)], np.nan), sgRNA_list, sgRNA_list)
for i, sgRNA1 in enumerate(sgRNA_list):
    single1 = df_growth_pool_filt_rescale.loc[sgRNA1, 'negC_rand_42']
    for sgRNA2 in sgRNA_list[:i]:
        single2 = df_growth_pool_filt_rescale.loc[sgRNA2, 'negC_rand_42']
        pair_gr = df_growth_pool_filt_rescale.loc[sgRNA1, sgRNA2]
        simple_epistasis.loc[sgRNA2, sgRNA1] = pair_gr - single1 * single2

### Plot escaper effects on growth rate mean and growth rate sem

In [None]:
# Fig S3B: pooled mean growth rate without vs. with escaper correction
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(df_growth_pool, df_growth_pool_filt, marker='o', color='xkcd:dark gray')
mean_diag = [-1.5, 0.4]
ax.plot(mean_diag, mean_diag, ls='--', color='xkcd:dark gray', zorder=0)  # Identity line
ax.text(-0.5, -1.4, 'GR Mean', fontsize=16)
mean_ticks = [-1, -0.5, 0]
ax.set_xlim(-1.45, 0.3)
ax.set_ylim(-1.45, 0.3)
ax.set_xticks(mean_ticks)
ax.set_yticks(mean_ticks)
ax.set_xlabel('No Correction', fontsize=16)
ax.set_ylabel('Escaper Corrected', fontsize=16)
ax.spines[['top', 'right']].set_visible(False)
plt.tight_layout()
plt.savefig(f'{figure_path}/FigS3B.pdf')
plt.show()

# Fig S3C: SEM of the pooled growth rate without vs. with escaper correction
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(df_growth_pool_sem, df_growth_pool_filt_sem, marker='o', color='xkcd:dark gray')
sem_diag = [-0.05, 0.4]
ax.plot(sem_diag, sem_diag, ls='--', color='xkcd:dark gray', zorder=0)  # Identity line
sem_ticks = [0, 0.2, 0.4]
ax.set_xlim(-0.02, 0.4)
ax.set_ylim(-0.02, 0.4)
ax.set_xticks(sem_ticks)
ax.set_yticks(sem_ticks)
ax.set_xlabel('No Correction', fontsize=16)
ax.set_ylabel('Escaper Corrected', fontsize=16)
ax.text(0.23, 0, 'GR SEM', fontsize=16)
ax.spines[['top', 'right']].set_visible(False)
plt.tight_layout()
plt.savefig(f'{figure_path}/FigS3C.pdf')
plt.show()

### Plot growth rates following pairwise CRISPRi treatment

In [None]:
# Heatmap of rescaled pairwise growth rates, with lines separating sgRNAs by target gene
fig, ax = plt.subplots(figsize=(30, 30))
im = ax.imshow(df_growth_pool_filt_rescale, vmin=0, vmax=2, cmap='RdBu_r')
fig.colorbar(im, shrink=0.3)
ax.tick_params(axis='both', labelsize=20)
ax.set_facecolor('xkcd:light gray')
ax.grid(which='minor', color='xkcd:dark gray', linestyle='-', linewidth=0.25)
n_sg = len(sgRNA_list)
grid_lines = [-0.5, 0.5]  # Set up grids to divide sgRNA groups by the gene they target
labels = ['Nont', sgRNA_list[1].split('_')[0]]  # Target gene names
ax.hlines(0.5, -0.5, n_sg-0.5, color='xkcd:dark gray')  # Nont sgRNA
ax.vlines(0.5, -0.5, n_sg-0.5, color='xkcd:dark gray')  # Nont sgRNA
for i, sgRNA in enumerate(sgRNA_list[1:]):
    # i indexes sgRNA_list[1:], so this sgRNA sits at heatmap position i+1
    gene = sgRNA.split('_')[0]
    if gene != labels[-1]:  # Switching to a new sgRNA group (compare full gene names)
        grid_lines.append(i+0.5)
        labels.append(gene)
        ax.hlines(i+0.5, -0.5, n_sg-0.5, color='xkcd:dark gray')
        ax.vlines(i+0.5, -0.5, n_sg-0.5, color='xkcd:dark gray')
    ax.hlines(i+0.5, i+0.5, i+1.5, color='xkcd:dark gray')  # Lines just across diagonal
    ax.vlines(i+1.5, i+0.5, i+1.5, color='xkcd:dark gray')  # Lines just across diagonal
grid_lines.append(n_sg-0.5)  # Outer edge of the last group
# Center of each sgRNA group: midpoint between consecutive grid lines
centers = [(start + end) / 2 for start, end in zip(grid_lines[:-1], grid_lines[1:])]
ax.set_xticks(centers)
ax.set_yticks(centers)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.spines[['top', 'right']].set_visible(True)
plt.tight_layout()
plt.savefig(f'{figure_path}/Fig3GR.pdf')
plt.show()

### Plot growth rate epistasis for the pairwise CRISPRi library

We calculated growth rate epistasis with a multiplicative model, using the following formula:

$epi_{a,b} = gr_{a,b} - gr_a \cdot gr_b$

In [None]:
# Heatmap of multiplicative epistasis
fig, ax = plt.subplots(figsize=(30, 30))
im = ax.imshow(simple_epistasis, vmin=-1, vmax=1, cmap='PuOr_r')
fig.colorbar(im, shrink=0.3)
ax.set_facecolor('xkcd:light gray')
ax.spines[['bottom', 'left']].set_visible(False)
# Ticks, grid, and labels are brought in with lower left portion of figure
for clear_ticks in (ax.set_yticks, ax.set_xticks):
    clear_ticks([])
plt.tight_layout()
plt.savefig(f'{figure_path}/Fig3Epi.pdf')
plt.show()

### Interreplicate RMSD

We calculate RMSD between experimental replicates by taking the error between each individual measurement and the mean growth rate across all replicates of that construct. This RMSD is assumed to be the theoretical minimum error for predicted growth rates.

In [None]:
# Squared error between each rescaled per-barcode measurement (both sgRNA orders)
# and the pooled, rescaled mean of the same construct
sq_errs = []
for BC in BC_list:
    for i, sgRNA1 in enumerate(sgRNA_list):
        gene1 = sgRNA1.split('_')[0]
        for sgRNA2 in sgRNA_list[i+1:]:  # Non-redundant constructs
            if gene1 == sgRNA2.split('_')[0]:  # We don't investigate same gene sgRNAs
                continue
            pooled_mean = df_growth_pool_filt_rescale.loc[sgRNA2, sgRNA1]
            for pos1, pos2 in ((sgRNA1, sgRNA2), (sgRNA2, sgRNA1)):
                rescaled = (growth_dict_BC[BC].loc[pos1, pos2]-gr_min)/abs(gr_min)
                sq_errs.append((rescaled - pooled_mean)**2)
data_RMSD = np.sqrt(np.nanmean(sq_errs))  # Missing measurements (nan) are ignored
print(f'Data RMSD: {data_RMSD}')

### Comparison to previous CRISPRi growth rate data

We compared growth rates for single CRISPRi perturbations from this library (targeting sgRNA + nontargeting sgRNA) and data published in Mathis et al. 2021.

In [None]:
# Single-sgRNA growth rates (sgRNA + nontargeting) from this library vs. the prior data set
shared_sgRNAs = set(sgRNA_list).intersection(prior_gr_data.index)  # Use only shared sgRNAs
prior_data_dict = {shared_sgRNA: [prior_gr_data['mean'].loc[shared_sgRNA], prior_gr_data['sem'].loc[shared_sgRNA]]
                   for shared_sgRNA in shared_sgRNAs}
prior_gr_min = np.nanmin(prior_gr_data['mean'].values)  # Minimum from the prior experiment
prior_scale = abs(prior_gr_min)
pairwise_mean, pairwise_sem, prior_mean, prior_sem = [], [], [], []
for shared_sgRNA in shared_sgRNAs:
    current_gr = df_growth_pool_filt_rescale.loc[shared_sgRNA, 'negC_rand_42']
    prior_gr, prior_err = prior_data_dict[shared_sgRNA]
    if np.isnan(current_gr) or np.isnan(prior_gr):  # Keep only sgRNAs with both values present
        continue
    pairwise_mean.append(current_gr)
    pairwise_sem.append(df_growth_pool_filt_sem_rescale.loc[shared_sgRNA, 'negC_rand_42'])
    # Rescale the prior data by its own minimum
    prior_mean.append((prior_gr-prior_gr_min)/prior_scale)
    prior_sem.append(prior_err/prior_scale)
pairwise_mean, prior_mean = np.array(pairwise_mean, dtype=float), np.array(prior_mean, dtype=float)

In [None]:
# Fig S4: rescaled single-sgRNA growth rates, this work (x) vs. prior data (y), with SEM error bars
fig, ax = plt.subplots(figsize=(6, 6))
ax.errorbar(pairwise_mean, prior_mean, xerr=pairwise_sem, yerr=prior_sem, ls='None', ms=10, marker='o',
            color='xkcd:dark gray', elinewidth=2)
axis_span = [0, 1.15]
ax.plot(axis_span, axis_span, '--', color='xkcd:gray', zorder=0)  # Identity line
r_squared = np.round(r2_score(pairwise_mean, prior_mean), 2)
ax.text(0.7, 0.1, f'R$^2$ = {r_squared}', fontsize=18)
ax.set_xlabel('Relative GR Current Work', fontsize=20)
ax.set_ylabel('Relative GR Mathis et al., 2021', fontsize=20)
ax.set_title('Growth Rate Replicates', fontsize=24)
ax.set_ylim(0, 1.15)
ax.set_xlim(0, 1.15)
major_ticks = [0, 0.5, 1]
ax.set_xticks(major_ticks)
ax.set_yticks(major_ticks)
plt.tight_layout()
plt.savefig(f'{figure_path}/FigS4.pdf')
plt.show()

### Generate tables and export data

In [None]:
#  Table S2. Single sgRNA repression intensities and growth rates.
#  One row per targeting sgRNA; growth rates are those of the sgRNA paired with the nontargeting control.
table_s2 = pd.DataFrame(np.full((len(sgRNA_list)-1, 4), np.nan), index=sgRNA_list[1:],
                        columns=['Repression', 'Repression SEM', 'Growth Rate', 'Growth Rate SEM'])
for sgRNA in table_s2.index:
    gene = sgRNA.split('_')[0]  # qPCR dictionaries are keyed by target gene, then sgRNA name
    table_s2.loc[sgRNA] = [qPCR_vals[gene][sgRNA], qPCR_sem[gene][sgRNA],
                           df_growth_pool_filt_rescale.loc[sgRNA, 'negC_rand_42'],
                           df_growth_pool_filt_sem_rescale.loc[sgRNA, 'negC_rand_42']]
#  Table S3. Pairwise sgRNA growth rates.
#  One row per unordered pair of targeting sgRNAs, including each sgRNA paired with itself.
#  Rows are collected first and the table is built once, so it always has exactly one row per pair.
s3_rows = []
for i, sgRNA1 in enumerate(sgRNA_list[1:], start=1):
    for sgRNA2 in sgRNA_list[i:]:
        # Epistasis is not defined for an sgRNA paired with itself
        epistasis = np.nan if sgRNA1 == sgRNA2 else simple_epistasis.loc[sgRNA1, sgRNA2]
        s3_rows.append([sgRNA1, sgRNA2, df_growth_pool_filt_rescale.loc[sgRNA2, sgRNA1],
                        df_growth_pool_filt_sem_rescale.loc[sgRNA2, sgRNA1], epistasis])
table_s3 = pd.DataFrame(s3_rows, columns=['sgRNA1', 'sgRNA2', 'Growth Rate', 'Growth Rate SEM', 'Epistasis'])

In [None]:
# Pickle every result under '<file_path>/<date>_<name>.pickle', in the order listed
pickle_outputs = {
    'df_growth_pool_filt': df_growth_pool_filt,
    'df_growth_pool_filt_sem': df_growth_pool_filt_sem,
    'df_growth_pool_filt_std': df_growth_pool_filt_std,
    'df_growth_pool_filt_rescale': df_growth_pool_filt_rescale,
    'df_growth_pool_filt_sem_rescale': df_growth_pool_filt_sem_rescale,
    'df_growth_pool_filt_std_rescale': df_growth_pool_filt_std_rescale,
    'raw_data_real_rmsd': data_RMSD,
    'pairwise_gr_min': gr_min,
}
for out_name, payload in pickle_outputs.items():
    with open(f'{file_path}/{date}_{out_name}.pickle', 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Write both tables into the supplementary workbook, replacing sheets of the same name
# NOTE(review): the workbook is opened in append mode, which presumably needs the file to exist already
with pd.ExcelWriter('Supplementary_Tables.xlsx', mode='a', if_sheet_exists='replace') as writer:
    for sheet, table in (('Table S2', table_s2), ('Table S3', table_s3)):
        table.to_excel(writer, sheet_name=sheet)