In [1]:
## Notebook env: seq_trblsht (seq_trblsht kernel)
## This notebook is used to analyze the fraction of reads that map to the guide region in GEM2 of the CRISPRa library


In [2]:
%%bash
# Align the SL425 R2 reads (fed straight from the gzipped FASTQ) to the hg38 STAR
# index and write a coordinate-sorted BAM with the prefix SL425_R2_all_reads_no_trim.
# Stop here if the working directory is missing, so STAR never writes its outputs
# into whatever directory the cell happened to start in.
cd /home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/STAR_guide_reads_alignment || exit 1
# nohup + trailing & launch the alignment as a background job.
nohup STAR \
    --outSAMtype BAM SortedByCoordinate \
    --readFilesCommand zcat \
    --runThreadN 16 \
    --genomeDir /home/genomes/hg38/star_hg38 \
    --readFilesIn /home/ssobti/projects/heterogeneity_brian/data/072623_MD231_CRISPRa_repeat_screen/BWHG37/SL425_S4_L004_R2_001.fastq.gz \
    --outFileNamePrefix SL425_R2_all_reads_no_trim &

Aug 26 13:43:06 ..... started STAR run
Aug 26 13:43:06 ..... loading genome
Aug 26 13:43:18 ..... started mapping
Aug 26 14:20:36 ..... finished mapping
Aug 26 14:20:38 ..... started sorting BAM
Aug 26 14:29:16 ..... finished successfully


In [3]:
%%bash
# Count reads per Ensembl gene_id from the STAR BAM and write the table to
# SL425_R2_all_reads_no_trim_HTSeq_count.txt.
# Stop here if the working directory is missing; otherwise the BAM would not be
# found and the redirect below would create the count file in the wrong place.
cd /home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/STAR_guide_reads_alignment || exit 1
# NOTE(review): -n sets the number of parallel processes; with a single BAM it
# presumably gives no speed-up -- confirm against the htseq-count docs.
# NOTE(review): -s yes counts reads as stranded; confirm this matches the
# orientation of the R2 reads for this library.
# nohup + trailing & launch the counting as a background job.
nohup htseq-count \
    -n 16 \
    -f bam \
    -s yes \
    -i gene_id \
    SL425_R2_all_reads_no_trimAligned.sortedByCoord.out.bam \
    /home/genomes/hg38/star_hg38/Homo_sapiens.GRCh38.99.gtf \
    > SL425_R2_all_reads_no_trim_HTSeq_count.txt &

[E::idx_find_and_load] Could not retrieve index file for 'SL425_R2_all_reads_no_trimAligned.sortedByCoord.out.bam'
100000 GFF lines processed.
200000 GFF lines processed.
300000 GFF lines processed.
400000 GFF lines processed.
500000 GFF lines processed.
600000 GFF lines processed.
700000 GFF lines processed.
800000 GFF lines processed.
900000 GFF lines processed.
1000000 GFF lines processed.
1100000 GFF lines processed.
1200000 GFF lines processed.
1300000 GFF lines processed.
1400000 GFF lines processed.
1500000 GFF lines processed.
1600000 GFF lines processed.
1700000 GFF lines processed.
1800000 GFF lines processed.
1900000 GFF lines processed.
2000000 GFF lines processed.
2100000 GFF lines processed.
2200000 GFF lines processed.
2300000 GFF lines processed.
2400000 GFF lines processed.
2500000 GFF lines processed.
2600000 GFF lines processed.
2700000 GFF lines processed.
2800000 GFF lines processed.
2900000 GFF lines processed.
2905054 GFF lines processed.
[E::idx_find_and_load] C

In [1]:
## load the per-gene read counts written by htseq-count (read as two tab-separated columns)
import pandas as pd
import mygene as mg

path = '/home/ssobti/projects/heterogeneity_brian/output_data/CRISPRa_repeat_screen/low_guides_trblsht/STAR_guide_reads_alignment/'
counts_file = path + 'SL425_R2_all_reads_no_trim_HTSeq_count.txt'

# Highest counts first; the row labels from the file order are kept as the index.
read_counts = (
    pd.read_csv(counts_file, sep='\t', names=['gene', 'count'])
    .sort_values(by='count', ascending=False)
)
read_counts

Unnamed: 0,gene,count
60680,__alignment_not_unique,79208863
60676,__no_feature,67523523
60677,__ambiguous,5524157
21453,ENSG00000211459,1817861
21417,ENSG00000210082,1514655
...,...,...
28740,ENSG00000229453,0
28741,ENSG00000229454,0
28742,ENSG00000229455,0
28744,ENSG00000229457,0


In [2]:
# htseq-count appends special counter rows (all prefixed with "__") after the gene
# rows. None of them is a gene, so every one has to be dropped before the counts
# are summed or sent for symbol lookup. The list covers all five counters that
# htseq-count emits, not only the three with the largest values.
remove_rows = [
    '__no_feature',
    '__ambiguous',
    '__too_low_aQual',
    '__not_aligned',
    '__alignment_not_unique',
]
is_special_counter = read_counts['gene'].isin(remove_rows)
# reset_index() without drop=True keeps the pre-filter row label in an 'index' column.
read_counts = read_counts[~is_special_counter].reset_index()
read_counts

Unnamed: 0,index,gene,count
0,21453,ENSG00000211459,1817861
1,21417,ENSG00000210082,1514655
2,1315,ENSG00000075624,560091
3,21433,ENSG00000210184,344591
4,6598,ENSG00000132507,333662
...,...,...,...
60673,28740,ENSG00000229453,0
60674,28741,ENSG00000229454,0
60675,28742,ENSG00000229455,0
60676,28744,ENSG00000229457,0


In [8]:
## total number of reads assigned to a gene
# Sum the 'count' column itself: DataFrame.sum('count') would interpret 'count'
# as an axis name and raise ValueError instead of returning the total.
read_counts['count'].sum()
## 36M

36460879

In [3]:
## convert ENSEMBL IDs to gene symbols
import mygene

# `mg` ends up bound to the MyGeneInfo client, without first aliasing the module
# itself to the same name.
mg = mygene.MyGeneInfo()

ensembl_ids = read_counts['gene'].tolist()
hits = mg.querymany(ensembl_ids, scopes='ensembl.gene', fields = ['symbol'], species = 'human')

# querymany returns one record per hit, each carrying the 'query' it answers.
# An ID with several hits therefore yields several records, so pairing records
# with rows by position can shift or break the alignment. Key by query instead,
# keeping the first hit that has a symbol.
symbol_by_id = {}
for hit in hits:
    if 'symbol' in hit:
        symbol_by_id.setdefault(hit.get('query'), hit['symbol'])

# One symbol per row of read_counts, in row order; 'NA' where no symbol was returned.
gene_symbols = [symbol_by_id.get(ensembl_id, 'NA') for ensembl_id in ensembl_ids]

querying 1-1000...

done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
querying 35001-36000...do

In [4]:
## attach the looked-up gene symbols as a new column and preview the top 30 rows
read_counts['symbol'] = gene_symbols
read_counts.head(30)

Unnamed: 0,index,gene,count,symbol
0,21453,ENSG00000211459,1817861,RNR1
1,21417,ENSG00000210082,1514655,RNR2
2,1315,ENSG00000075624,560091,ACTB
3,21433,ENSG00000210184,344591,TRNS2
4,6598,ENSG00000132507,333662,EIF5A
5,17941,ENSG00000198938,299276,COX3
6,17572,ENSG00000197956,292381,S100A6
7,7539,ENSG00000137818,288697,RPLP1
8,39796,ENSG00000251562,244494,MALAT1
9,9263,ENSG00000149925,239095,ALDOA
