In [1]:
## Notebook env: seq_trblsht (seq_trblsht kernel)
## This notebook is used to analyze the fraction of reads that map to the guide region in GEM1 of the CRISPRa library

In [2]:
%%bash
cd /home/ssobti/projects/heterogeneity_brian/output_data/CRISPRi_CRISPRa_cell_culture_screen/CRISPRi_STAR_guide_reads_alignment
nohup STAR --outSAMtype BAM SortedByCoordinate --readFilesCommand zcat --runThreadN 16 --genomeDir /home/genomes/hg38/star_hg38 --readFilesIn /home/ssobti/projects/heterogeneity_brian/data/032923_MD231_CRISPRi_CRISPRa_screen/230329_A01102_0587_AH3CY5DSX7_S4_200/BWHG34/398_S5_L001_R2_001.fastq.gz --outFileNamePrefix SL398_R2_all_reads_no_trim &

Aug 29 09:34:11 ..... started STAR run
Aug 29 09:34:11 ..... loading genome
Aug 29 09:34:32 ..... started mapping
Aug 29 09:47:48 ..... finished mapping
Aug 29 09:47:51 ..... started sorting BAM
Aug 29 09:48:40 ..... finished successfully


In [3]:
%%bash
cd /home/ssobti/projects/heterogeneity_brian/output_data/CRISPRi_CRISPRa_cell_culture_screen/CRISPRi_STAR_guide_reads_alignment
nohup htseq-count -n 16 -f bam -s yes -i gene_id SL398_R2_all_reads_no_trimAligned.sortedByCoord.out.bam /home/genomes/hg38/star_hg38/Homo_sapiens.GRCh38.99.gtf > SL398_R2_all_reads_no_trim_HTSeq_count.txt &

[E::idx_find_and_load] Could not retrieve index file for 'SL398_R2_all_reads_no_trimAligned.sortedByCoord.out.bam'
100000 GFF lines processed.
200000 GFF lines processed.
300000 GFF lines processed.
400000 GFF lines processed.
500000 GFF lines processed.
600000 GFF lines processed.
700000 GFF lines processed.
800000 GFF lines processed.
900000 GFF lines processed.
1000000 GFF lines processed.
1100000 GFF lines processed.
1200000 GFF lines processed.
1300000 GFF lines processed.
1400000 GFF lines processed.
1500000 GFF lines processed.
1600000 GFF lines processed.
1700000 GFF lines processed.
1800000 GFF lines processed.
1900000 GFF lines processed.
2000000 GFF lines processed.
2100000 GFF lines processed.
2200000 GFF lines processed.
2300000 GFF lines processed.
2400000 GFF lines processed.
2500000 GFF lines processed.
2600000 GFF lines processed.
2700000 GFF lines processed.
2800000 GFF lines processed.
2900000 GFF lines processed.
2905054 GFF lines processed.
[E::idx_find_and_load] C

In [1]:
## load in bed file gene counts from reads
import pandas as pd
import mygene as mg

path = '/home/ssobti/projects/heterogeneity_brian/output_data/CRISPRi_CRISPRa_cell_culture_screen/CRISPRi_STAR_guide_reads_alignment/'
read_counts = pd.read_csv(path + 'SL398_R2_all_reads_no_trim_HTSeq_count.txt', sep='\t', names = ['gene', 'count'])
read_counts.sort_values(by='count', ascending=False, inplace=True)
read_counts

Unnamed: 0,gene,count
60680,__alignment_not_unique,12301702
60676,__no_feature,6729884
60677,__ambiguous,1042670
21453,ENSG00000211459,165453
1315,ENSG00000075624,109746
...,...,...
29009,ENSG00000229857,0
29010,ENSG00000229858,0
29011,ENSG00000229859,0
29013,ENSG00000229863,0


In [2]:
remove_rows = ['__alignment_not_unique', '__no_feature', '__ambiguous']
read_counts = read_counts[~read_counts['gene'].isin(remove_rows)].reset_index()
read_counts

Unnamed: 0,index,gene,count
0,21453,ENSG00000211459,165453
1,1315,ENSG00000075624,109746
2,21417,ENSG00000210082,88919
3,6598,ENSG00000132507,61790
4,7539,ENSG00000137818,38688
...,...,...,...
60673,29009,ENSG00000229857,0
60674,29010,ENSG00000229858,0
60675,29011,ENSG00000229859,0
60676,29013,ENSG00000229863,0


In [4]:
## convert ENSEMBL IDs to gene symbols
import mygene as mg
mg = mg.MyGeneInfo()
gene_symbols = mg.querymany(read_counts['gene'].tolist(), scopes='ensembl.gene', fields = ['symbol'], species = 'human')
gene_symbols = [gene['symbol'] if 'symbol' in gene.keys() else 'NA' for gene in gene_symbols]

querying 1-1000...

done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
querying 35001-36000...do

In [5]:
## adding gene symbols
read_counts['symbol'] = gene_symbols
read_counts[0:30]

Unnamed: 0,index,gene,count,symbol
0,21453,ENSG00000211459,165453,RNR1
1,1315,ENSG00000075624,109746,ACTB
2,21417,ENSG00000210082,88919,RNR2
3,6598,ENSG00000132507,61790,EIF5A
4,7539,ENSG00000137818,38688,RPLP1
5,14990,ENSG00000181163,34484,NPM1
6,17572,ENSG00000197956,33989,S100A6
7,452,ENSG00000026025,32591,VIM
8,17819,ENSG00000198727,29818,CYTB
9,16805,ENSG00000188846,27429,RPL14
