In [62]:
import pandas as pd
import numpy as np

# Empty frame whose columns fix the layout of the instruction sheet that is
# filled in and written out later in the notebook.
target = pd.DataFrame(
    columns=[
        'Sample ID',
        'Plate Name',
        'Location (Row-Col)',
        'Row',
        'Col',
        'Gene Symbol',
        'Gene ID',
        'siRNA ID',
        'Destination Well',
        'Destination Plate Barcode',
        'Destination Volume',
    ]
)

# siRNA library sheet that all later selections are drawn from.
source = pd.read_excel(
    r"C:\Users\lawsonsa\Downloads\08-03-17 Combined siRNA library.xlsx",
    sheet_name='0.1 nmol RNAi',
)

### Provide gene list

In [48]:
# The gene-list workbook must contain a column headed 'Gene/product' holding the
# gene names (expected to be its first column); that column is read as the index.
gene_list = pd.read_excel(
    r"C:\Users\lawsonsa\Downloads\RNA splicing gene list.xlsx",
    index_col='Gene/product',
)

# Row count before de-duplication, used only for the report printed below.
before = gene_list.shape[0]

# Keep each gene name once; from here on gene_list is a pandas Index of names.
gene_list = gene_list.index.drop_duplicates()
print(f'{before - len(gene_list)} duplicates removed from the provided gene list')

36 duplicates removed from the provided gene list


### Create a new source plate of only siRNAs targeting our gene list

In [49]:

def make_bool_array(source, gene_list):
    """Flag the rows of `source` whose gene symbol appears in `gene_list`.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame with a 'Gene Symbol' column.
    gene_list : list-like
        Gene symbols to look for (e.g. a list or a pandas Index).

    Returns
    -------
    list of bool
        One entry per row of `source`, in row order; True where the row's
        'Gene Symbol' is in `gene_list`.
    """
    # isin() works on the column's values in row order, so the result does not
    # depend on the frame having a default 0..n-1 index (looking rows up with
    # source['Gene Symbol'][i] does).
    return source['Gene Symbol'].isin(gene_list).tolist()
# One True/False flag per library row: True when the row's gene symbol is in gene_list.
bool_array = make_bool_array(source, gene_list)

# Library restricted to the flagged rows.
clean_source = source.loc[bool_array]

print(sum(bool_array), 'siRNAs targeting provided gene list prior to mapping unmatched genes')

1569 siRNAs targeting provided gene list prior to mapping unmatched genes


### Find the genes whose siRNAs we were unable to locate in the source library

In [50]:
import mygene

mg = mygene.MyGeneInfo()

# Gene symbols already present in the filtered library. Built once, so the
# membership test in the loop does not rebuild a list for every gene.
symbols_in_library = set(clean_source['Gene Symbol'])

mapped_genes = []    # Entrez gene IDs found for names missing from the library
unmapped_genes = []  # names for which no Entrez gene ID could be obtained
for gene in gene_list:
    if gene in symbols_in_library:
        continue
    try:
        # Keep the Entrez gene ID of the first hit returned for this name.
        # NOTE(review): the query is not restricted to a species or field -
        # confirm that the first hit is always the intended gene.
        mapped_genes.append(mg.query(gene, size=1)['hits'][0]['entrezgene'])
    except Exception:
        # No hit, a hit without an 'entrezgene' entry, or a failed lookup:
        # record the name and carry on. Catching Exception rather than using a
        # bare except lets Ctrl-C / kernel interrupts still stop the loop.
        unmapped_genes.append(gene)
mapped_genes = [int(number) for number in mapped_genes]

### Append those to our clean_source df

In [63]:
def sort_by_frequency(df, column):
    '''
    Return the rows of a pandas dataframe ordered by how common their label in `column` is.

    Rows carrying the least common label come first and rows carrying the most
    common label come last; labels with the same frequency are ordered by the
    label itself. E.g. for a column of plates, the rarest plate ends up at the
    top and the most used plate at the bottom.

    The input frame is left untouched (no helper column is added to it and it is
    not re-ordered); a new frame with the same columns is returned. Rows whose
    label is missing have no frequency and are placed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sort.
    column : label
        Name of the column whose label frequencies drive the ordering.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` in frequency order.
    '''
    labels = df[column]

    # Per-row frequency of the row's label. value_counts() skips missing labels,
    # so map() leaves NaN for them instead of raising.
    frequencies = labels.map(labels.value_counts())

    # Sort a small helper frame instead of `df` itself: nothing is written into
    # the caller's frame and an existing 'frequency' column cannot be clobbered.
    # The helper has a fresh 0..n-1 index, so its sorted index is the row order
    # to take from `df`, which also works when `df` has repeated index labels.
    sort_keys = pd.DataFrame({'frequency': frequencies.to_numpy(), 'label': labels.to_numpy()})
    positions = sort_keys.sort_values(by=['frequency', 'label']).index.to_numpy()
    return df.iloc[positions]

In [66]:
# Library rows whose Gene ID is one of the Entrez IDs recovered by the lookup.
add_to_clean_source = source.loc[source['Gene ID'].isin(mapped_genes)]
source = pd.concat([clean_source, add_to_clean_source])

# Both pieces are slices of the same library frame, so a repeated index label
# means the same library row was picked by symbol and again by Entrez ID.
# Keep each library row once so the same source well is not plated twice.
# (Assumes the library frame has a unique index, which is the read_excel default.)
source = source.loc[~source.index.duplicated(keep='first')].copy()

source = sort_by_frequency(source, 'Plate ID')
print(f'\nGene names in list that were unable to be mapped to the library\n{unmapped_genes}')

The abundance of Plate ID is:
CPE200CD    41
CPE200D2    41
CPE200DR    41
CPE200CE    34
CPE200D3    34
            ..
CPE200BM     1
CPE200BO     1
CPE200BT     1
CPE200BV     1
CPE200AT     1
Name: Plate ID, Length: 159, dtype: int64

Gene names in list that were unable to be mapped to the library
['CICK1', 'I3L521', 'M0R2N4', 'M0R3G1', 'SNRPGP15', 'U2AF1L5']


### Define plate parameters

In [74]:
# Destination wells in use: rows B-O and columns 2-23 (a 14 x 22 grid).
# NOTE(review): presumably the interior of a 384-well plate with the outer
# rows/columns left empty - confirm.
rows = list('BCDEFGHIJKLMNO')
cols = list(range(2, 24))

# Well labels in row-major order: B2, B3, ..., B23, C2, ..., O23.
wells = [f'{row}{col}' for row in rows for col in cols]


In [75]:
import math
# variable1: number of usable wells on one destination plate.
variable1 = len(wells)
# variable2: number of siRNA rows that need a well.
variable2 = len(source)
# variable3: plates required, i.e. wells needed / wells per plate, rounded up.
variable3 = math.ceil(variable2 / variable1)
# variable4: wells left unused once every siRNA has been placed.
variable4 = variable1 * variable3 - variable2
print(f'Num wells per plate: {variable1}\nNum wells needed for screen: {variable2}\nNum plates needed for screen: {variable3}\nSpare wells: {variable4}')

Num wells per plate: 308
Num wells needed for screen: 1716
Num plates needed for screen: 6
Spare wells: 132


In [45]:
# Lay the selected siRNAs out over the destination plates: rows of `source` are
# taken in their current order and fill plate 1 well by well (in `wells` order),
# then plate 2, and so on.
#
# The sheet is built in one pass rather than with DataFrame.append inside a loop
# (append was deprecated in pandas 1.4 and removed in 2.0, and growing a frame
# row by row is quadratic). `source` is no longer consumed and `target` is
# rebuilt from its column template, so re-running this cell gives the same sheet.
wells_per_plate = len(wells)
capacity = variable3 * wells_per_plate  # never place more rows than the planned plates hold

# Library columns are still picked by position: 1-6, then 8 (written out under
# the header 'Gene Symbol'), then 10-11.
# NOTE(review): these positions depend on the column order of the library sheet - confirm.
all_positions = list(range(source.shape[1]))
leading = all_positions[1:7]
trailing = all_positions[10:12]
assignments = source.iloc[:capacity, leading + [8] + trailing].reset_index(drop=True)
headers = list(assignments.columns)
headers[len(leading)] = 'Gene Symbol'
assignments.columns = headers

# Slot k goes to plate k // wells_per_plate + 1, well number k % wells_per_plate.
slots = range(len(assignments))
assignments['Destination Well'] = [wells[slot % wells_per_plate] for slot in slots]
assignments['Destination Plate Barcode'] = [slot // wells_per_plate + 1 for slot in slots]
assignments['Destination Volume'] = ''

# Template columns first, followed by any extra library columns in order of appearance.
column_order = list(target.columns) + [name for name in assignments.columns if name not in target.columns]
target = assignments.reindex(columns=column_order)

target.to_csv(r'Echo_instructions_from_gene_list.csv')
target

Unnamed: 0,Sample ID,Plate Name,Location (Row-Col),Row,Col,Gene Symbol,Gene ID,siRNA ID,Destination Well,Destination Plate Barcode,Destination Volume,Plate ID
0,ASO0FPUI,Hm Gen Ext siRNA Lib-A1,G10,G,10,FAU,2197.0,s5039,B3,1,,CPE2009C
1,ASO0FPV0,Hm Gen Ext siRNA Lib-A1,K2,K,2,AFF2,2334.0,s5318,B4,1,,CPE2009C
2,ASO0FPV2,Hm Gen Ext siRNA Lib-A1,K6,K,6,FRG1,2483.0,s5366,B5,1,,CPE2009C
3,ASO0FQ24,Hm Gen Ext siRNA Lib-A1,L4,L,4,HNRNPF,3185.0,s6725,B6,1,,CPE2009C
4,ASO0FQ25,Hm Gen Ext siRNA Lib-A1,L6,L,6,HNRNPH1,3187.0,s6728,B7,1,,CPE2009C
...,...,...,...,...,...,...,...,...,...,...,...,...
3280,ASO0GFU0,Hm Gen Ext siRNA Lib-C9,M1,M,1,PRPF38B,55119.0,s30217,J7,6,,CPE200BA
3281,ASO0GFYQ,Hm Gen Ext siRNA Lib-C9,M14,M,14,INTS8,55656.0,s31181,J8,6,,CPE200BA
3282,ASO0GFYR,Hm Gen Ext siRNA Lib-C9,M18,M,18,PRPF40A,55660.0,s31193,J9,6,,CPE200BA
3283,ASO0GFZ0,Hm Gen Ext siRNA Lib-C9,O16,O,16,IWS1,55677.0,s31232,J10,6,,CPE200BA
