# 1. load required packages

In [5]:
%run "C:\Users\shiwei\Documents\ImageAnalysis3\required_files\Startup_py3.py"
sys.path.append(r"C:\Users\shiwei\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

import h5py
from ImageAnalysis3.classes import _allowed_kwds
import ast

import pandas as pd

# library design specific tools
from ImageAnalysis3.library_tools import LibraryDesigner as ld
from ImageAnalysis3.library_tools import LibraryTools as lt
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

103308


# 2. Load transcriptome

In [6]:
## Define the genome reference path
# mouse GRCm38 genome
reference_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Genomes\mouse\GRCm38_ensembl'
transcriptome_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Genomes\mouse\GRCm38_ensembl\Transcriptome'
# Library directories
pool_folder = r'L:\Shiwei\DNA_MERFISH_analysis\SMARTer_nuclei_MOp'

cdna_fasta_file = os.path.join(transcriptome_folder,'Mus_musculus.GRCm38.cdna.all.fa')
ncrna_fasta_file = os.path.join(transcriptome_folder,'Mus_musculus.GRCm38.ncrna.fa')

In [7]:
def load_transcriptome_fasta_into_dataframe(fasta_file):
    d = {'transcript_id':[], 'seq_type':[], 'location':[], 'gene_id':[],
        'gene_biotype':[], 'transcript_biotype':[], 'gene_symbol':[],
        'description':[], 'seq_length':[], 'sequence':[]}
    
    try:
        for record in SeqIO.parse(fasta_file, 'fasta'):
            split_rd = record.description.split()            
            d['transcript_id'].append(split_rd[0].split('.')[0])
            d['seq_type'].append(split_rd[1])
            d['location'].append(split_rd[2])
            d['gene_id'].append(split_rd[3][5:].split('.')[0])
            d['gene_biotype'].append(split_rd[4][13:])
            d['transcript_biotype'].append(split_rd[5][19:])
            if len(split_rd) > 6:
                d['gene_symbol'].append(split_rd[6][12:])
            else:
                d['gene_symbol'].append(pd.NA)
            if len(split_rd) > 7:
                d['description'].append(' '.join(split_rd[7:])[12:])
            else:
                d['description'].append(pd.NA)
            
            d['seq_length'].append(len(str(record.seq)))
            d['sequence'].append(str(record.seq))
    
    except:
        print(split_rd)
        raise
    
    df = pd.DataFrame.from_dict(d)
    df = df.set_index('transcript_id')
    return df

In [8]:
# Load the transcriptome fasta
transcriptome_cdna = load_transcriptome_fasta_into_dataframe(cdna_fasta_file)
transcriptome_ncrna = load_transcriptome_fasta_into_dataframe(ncrna_fasta_file)
transcriptome = pd.concat((transcriptome_cdna, transcriptome_ncrna), axis=0) 

In [9]:
print(len(transcriptome))
transcriptome.head()

141450


Unnamed: 0_level_0,seq_type,location,gene_id,gene_biotype,transcript_biotype,gene_symbol,description,seq_length,sequence
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSMUST00000177564,cdna,chromosome:GRCm38:14:54122226:54122241:1,ENSMUSG00000096176,TR_D_gene,TR_D_gene,Trdd2,T cell receptor delta diversity 2 [Source:MGI ...,16,ATCGGAGGGATACGAG
ENSMUST00000196221,cdna,chromosome:GRCm38:14:54113468:54113476:1,ENSMUSG00000096749,TR_D_gene,TR_D_gene,Trdd1,T cell receptor delta diversity 1 [Source:MGI ...,9,ATGGCATAT
ENSMUST00000179664,cdna,chromosome:GRCm38:14:54113468:54113478:1,ENSMUSG00000096749,TR_D_gene,processed_transcript,Trdd1,T cell receptor delta diversity 1 [Source:MGI ...,11,ATGGCATATCA
ENSMUST00000178537,cdna,chromosome:GRCm38:6:41533201:41533212:1,ENSMUSG00000095668,TR_D_gene,TR_D_gene,Trbd1,"T cell receptor beta, D region 1 [Source:MGI S...",12,GGGACAGGGGGC
ENSMUST00000178862,cdna,chromosome:GRCm38:6:41542163:41542176:1,ENSMUSG00000094569,TR_D_gene,TR_D_gene,Trbd2,"T cell receptor beta, D region 2 [Source:MGI S...",14,GGGACTGGGGGGGC


# 3. Process MOp genes (genes obtained from smart_seq)

In [10]:
import scanpy as sc
scRNA_folder =r'L:\Shiwei\DNA_MERFISH_analysis\SMARTer_nuclei_MOp'
# load from here for saved h5ad
adata = sc.read(os.path.join(scRNA_folder,r'MOp_smart_sn_raw.h5ad'))

Mop_genes = adata.var.index.tolist()
len(Mop_genes)

33262

In [29]:
# load gene from other notebook
# retrieve genes from the MOp_smart_sn_labeled.h5ad 

#%store -r Mop_genes

In [11]:
len(Mop_genes)

33262

In [12]:
_strands = [1,1,1,1,-1]
import statistics
from statistics import mode
mode(_strands)

1

In [13]:
from tqdm.notebook import tqdm

In [15]:
import statistics
from statistics import mode


# find corresponding gene info from the reference dict
Mop_gene_dict ={}
Mop_gene_dict['gene']=[]
Mop_gene_dict['chr']=[]
Mop_gene_dict['start']=[]
Mop_gene_dict['end']=[]
Mop_gene_dict['gene_biotype']=[]
Mop_gene_dict['coding_strand']=[]

ref_genes = np.unique(transcriptome['gene_symbol'].tolist())


not_found=0
for _gene in tqdm(Mop_genes[:]):
    Mop_gene_dict['gene'].append(_gene)
    
    if _gene in ref_genes:
        _gene_ref_df = transcriptome[transcriptome['gene_symbol']==_gene]
        _location_list = _gene_ref_df['location'].tolist()
        # check format
        if 'chromosome:GRCm38:' == _location_list[0][:18]:
            _start_list = [int(_loc.split('chromosome:GRCm38:')[-1].split(':')[1]) for _loc in _location_list]
            _end_list = [int(_loc.split('chromosome:GRCm38:')[-1].split(':')[2]) for _loc in _location_list]
            Mop_gene_dict['start'].append(np.min(_start_list))
            Mop_gene_dict['end'].append(np.max(_end_list))
            Mop_gene_dict['chr'].append(_location_list[0].split('chromosome:GRCm38:')[-1].split(':')[0])
            # add transcript strand info; use most commen strand
            _strands = [int(_loc.split(':')[-1]) for _loc in _location_list]
            Mop_gene_dict['coding_strand'].append(mode(_strands))
            
        else:
            print (f'Wrong format for {_gene}. Check the result.')
            Mop_gene_dict['start'].append(pd.NA)
            Mop_gene_dict['end'].append(pd.NA)
            Mop_gene_dict['chr'].append(pd.NA)
            Mop_gene_dict['coding_strand'].append(pd.NA)
            
            
        _biotype_list = np.unique(_gene_ref_df['gene_biotype'].tolist())
        if len(_biotype_list)==1:
            Mop_gene_dict['gene_biotype'].append(_biotype_list[0])
        else:
            print(f'Multiple gene biotype exists for {_gene}. Merge the result.')
            merged_biotype = ';'.join(_biotype_list)
            Mop_gene_dict['gene_biotype'].append(merged_biotype)
            
    else:
        print(f'The gene {_gene} not found in the database. Append empty values.')
        Mop_gene_dict['start'].append(pd.NA)
        Mop_gene_dict['end'].append(pd.NA)
        Mop_gene_dict['chr'].append(pd.NA)
        Mop_gene_dict['gene_biotype'].append(pd.NA)
        Mop_gene_dict['coding_strand'].append(pd.NA)
        
        not_found+=1

print (f'There are {not_found} genes not found and were skipped from this query.')            

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=33262.0), HTML(value='')))

The gene 0610007P14Rik not found in the database. Append empty values.
The gene 0610009O20Rik not found in the database. Append empty values.
The gene 0610010B08Rik not found in the database. Append empty values.
The gene 0610011F06Rik not found in the database. Append empty values.
The gene 0610037L13Rik not found in the database. Append empty values.
The gene 0610039H22Rik not found in the database. Append empty values.
The gene 1010001N08Rik not found in the database. Append empty values.
The gene 1110001J03Rik not found in the database. Append empty values.
The gene 1110004E09Rik not found in the database. Append empty values.
The gene 1110007C09Rik not found in the database. Append empty values.
The gene 1110008F13Rik not found in the database. Append empty values.
The gene 1110008L16Rik not found in the database. Append empty values.
The gene 1110034G24Rik not found in the database. Append empty values.
The gene 1110037F02Rik not found in the database. Append empty values.
The ge

The gene 2310003H01Rik not found in the database. Append empty values.
The gene 2310009A05Rik not found in the database. Append empty values.
The gene 2310014L17Rik not found in the database. Append empty values.
The gene 2310034P14Rik not found in the database. Append empty values.
The gene 2310035C23Rik not found in the database. Append empty values.
The gene 2310036O22Rik not found in the database. Append empty values.
The gene 2310045N01Rik not found in the database. Append empty values.
The gene 2310047M10Rik not found in the database. Append empty values.
The gene 2310061J03Rik not found in the database. Append empty values.
The gene 2310067B10Rik not found in the database. Append empty values.
The gene 2310067E19Rik not found in the database. Append empty values.
The gene 2310069G16Rik not found in the database. Append empty values.
The gene 2310075K07Rik not found in the database. Append empty values.
The gene 2410004N09Rik not found in the database. Append empty values.
The ge

The gene 4930543D07Rik not found in the database. Append empty values.
The gene 4930552P12Rik not found in the database. Append empty values.
The gene 4930553C11Rik not found in the database. Append empty values.
The gene 4930557B15Rik not found in the database. Append empty values.
The gene 4930563D23Rik not found in the database. Append empty values.
Multiple gene biotype exists for 4930563F08Rik. Merge the result.
The gene 4930570G05Rik not found in the database. Append empty values.
The gene 4930572J10Rik not found in the database. Append empty values.
The gene 4930573O16Rik not found in the database. Append empty values.
The gene 4930577N17Rik not found in the database. Append empty values.
The gene 4930578C19Rik not found in the database. Append empty values.
The gene 4930594C11Rik not found in the database. Append empty values.
Multiple gene biotype exists for 4930594M22Rik. Merge the result.
The gene 4931402G19Rik not found in the database. Append empty values.
The gene 4931403

The gene A630075F10Rik not found in the database. Append empty values.
The gene A630077J23Rik not found in the database. Append empty values.
The gene A630081J09Rik not found in the database. Append empty values.
Multiple gene biotype exists for A730006G06Rik. Merge the result.
The gene A730013G03Rik not found in the database. Append empty values.
The gene A730017C20Rik not found in the database. Append empty values.
The gene A730067D02Rik not found in the database. Append empty values.
The gene A730089K16Rik not found in the database. Append empty values.
The gene A730090H04Rik not found in the database. Append empty values.
The gene A730098P11Rik not found in the database. Append empty values.
The gene A830010M20Rik not found in the database. Append empty values.
The gene A830021M18Rik not found in the database. Append empty values.
The gene A830080D01Rik not found in the database. Append empty values.
The gene A930011G23Rik not found in the database. Append empty values.
The gene A9

The gene Bmp4-ps not found in the database. Append empty values.
The gene Bre not found in the database. Append empty values.
The gene Btg1-ps2 not found in the database. Append empty values.
The gene Bzrap1 not found in the database. Append empty values.
The gene C030016D13Rik not found in the database. Append empty values.
The gene C030023E24Rik not found in the database. Append empty values.
The gene C030039L03Rik not found in the database. Append empty values.
The gene C130023O10Rik not found in the database. Append empty values.
The gene C130030K03Rik not found in the database. Append empty values.
The gene C130060K24Rik not found in the database. Append empty values.
The gene C230004F18Rik not found in the database. Append empty values.
The gene C230029M16 not found in the database. Append empty values.
The gene C230052I12Rik not found in the database. Append empty values.
The gene C230091D08Rik not found in the database. Append empty values.
The gene C330006A16Rik not found in t

The gene Diap1 not found in the database. Append empty values.
The gene Diap2 not found in the database. Append empty values.
The gene Diap3 not found in the database. Append empty values.
The gene Diexf not found in the database. Append empty values.
The gene Dirc2 not found in the database. Append empty values.
Multiple gene biotype exists for Dlx6os1. Merge the result.
The gene Dlx6os2 not found in the database. Append empty values.
The gene Dnmt3b-ps1 not found in the database. Append empty values.
The gene Dopey1 not found in the database. Append empty values.
The gene Dopey2 not found in the database. Append empty values.
The gene Dos not found in the database. Append empty values.
The gene Dpcd not found in the database. Append empty values.
The gene Dpcr1 not found in the database. Append empty values.
The gene Dscr3 not found in the database. Append empty values.
The gene Dyx1c1 not found in the database. Append empty values.
The gene E030003E18Rik not found in the database. A

The gene Figf not found in the database. Append empty values.
The gene Fkbp1a-ps4 not found in the database. Append empty values.
The gene Ftl2 not found in the database. Append empty values.
The gene Ftsj2 not found in the database. Append empty values.
The gene Fuk not found in the database. Append empty values.
The gene G370120E05Rik not found in the database. Append empty values.
The gene G430049J08Rik not found in the database. Append empty values.
The gene G630025P09Rik not found in the database. Append empty values.
The gene G630071F17Rik not found in the database. Append empty values.
The gene G630090E17Rik not found in the database. Append empty values.
The gene G6b not found in the database. Append empty values.
The gene Garem not found in the database. Append empty values.
The gene Gareml not found in the database. Append empty values.
The gene Gatsl2 not found in the database. Append empty values.
The gene Gatsl3 not found in the database. Append empty values.
The gene Gbas

The gene Gm166 not found in the database. Append empty values.
The gene Gm1661 not found in the database. Append empty values.
The gene Gm16619 not found in the database. Append empty values.
The gene Gm16686 not found in the database. Append empty values.
The gene Gm16702 not found in the database. Append empty values.
The gene Gm16880 not found in the database. Append empty values.
The gene Gm16907 not found in the database. Append empty values.
The gene Gm17209 not found in the database. Append empty values.
The gene Gm17250 not found in the database. Append empty values.
The gene Gm17252 not found in the database. Append empty values.
The gene Gm17257 not found in the database. Append empty values.
The gene Gm17296 not found in the database. Append empty values.
The gene Gm17352 not found in the database. Append empty values.
The gene Gm17353 not found in the database. Append empty values.
The gene Gm17365 not found in the database. Append empty values.
The gene Gm17388 not found i

The gene Gm18418 not found in the database. Append empty values.
The gene Gm18434 not found in the database. Append empty values.
The gene Gm18436 not found in the database. Append empty values.
The gene Gm18446 not found in the database. Append empty values.
The gene Gm18454 not found in the database. Append empty values.
The gene Gm18472 not found in the database. Append empty values.
The gene Gm18474 not found in the database. Append empty values.
The gene Gm18479 not found in the database. Append empty values.
The gene Gm18483 not found in the database. Append empty values.
The gene Gm18490 not found in the database. Append empty values.
The gene Gm18511 not found in the database. Append empty values.
The gene Gm18512 not found in the database. Append empty values.
The gene Gm18516 not found in the database. Append empty values.
The gene Gm18535 not found in the database. Append empty values.
The gene Gm18542 not found in the database. Append empty values.
The gene Gm18545 not foun

The gene Gm19167 not found in the database. Append empty values.
The gene Gm19191 not found in the database. Append empty values.
The gene Gm19192 not found in the database. Append empty values.
The gene Gm19204 not found in the database. Append empty values.
The gene Gm19205 not found in the database. Append empty values.
The gene Gm19229 not found in the database. Append empty values.
The gene Gm19230 not found in the database. Append empty values.
The gene Gm19234 not found in the database. Append empty values.
The gene Gm19240 not found in the database. Append empty values.
The gene Gm19245 not found in the database. Append empty values.
The gene Gm19247 not found in the database. Append empty values.
The gene Gm19249 not found in the database. Append empty values.
The gene Gm19250 not found in the database. Append empty values.
The gene Gm19251 not found in the database. Append empty values.
The gene Gm19255 not found in the database. Append empty values.
The gene Gm19258 not foun

The gene Gm2773 not found in the database. Append empty values.
The gene Gm2815 not found in the database. Append empty values.
The gene Gm2836 not found in the database. Append empty values.
The gene Gm2845 not found in the database. Append empty values.
The gene Gm2848 not found in the database. Append empty values.
Multiple gene biotype exists for Gm28710. Merge the result.
The gene Gm2904 not found in the database. Append empty values.
The gene Gm29595 not found in the database. Append empty values.
The gene Gm29673 not found in the database. Append empty values.
The gene Gm29679 not found in the database. Append empty values.
Multiple gene biotype exists for Gm29681. Merge the result.
The gene Gm29690 not found in the database. Append empty values.
The gene Gm29692 not found in the database. Append empty values.
The gene Gm29693 not found in the database. Append empty values.
The gene Gm29694 not found in the database. Append empty values.
The gene Gm2970 not found in the database

The gene Gm30086 not found in the database. Append empty values.
The gene Gm30089 not found in the database. Append empty values.
The gene Gm30090 not found in the database. Append empty values.
The gene Gm30092 not found in the database. Append empty values.
The gene Gm30101 not found in the database. Append empty values.
The gene Gm30102 not found in the database. Append empty values.
The gene Gm30103 not found in the database. Append empty values.
The gene Gm30106 not found in the database. Append empty values.
The gene Gm30107 not found in the database. Append empty values.
The gene Gm30110 not found in the database. Append empty values.
The gene Gm30112 not found in the database. Append empty values.
The gene Gm30113 not found in the database. Append empty values.
The gene Gm30116 not found in the database. Append empty values.
The gene Gm30118 not found in the database. Append empty values.
The gene Gm30119 not found in the database. Append empty values.
The gene Gm30120 not foun

The gene Gm30529 not found in the database. Append empty values.
The gene Gm30532 not found in the database. Append empty values.
The gene Gm30536 not found in the database. Append empty values.
The gene Gm30539 not found in the database. Append empty values.
The gene Gm30543 not found in the database. Append empty values.
The gene Gm30544 not found in the database. Append empty values.
The gene Gm30548 not found in the database. Append empty values.
The gene Gm30549 not found in the database. Append empty values.
The gene Gm30550 not found in the database. Append empty values.
The gene Gm30552 not found in the database. Append empty values.
The gene Gm30553 not found in the database. Append empty values.
The gene Gm30554 not found in the database. Append empty values.
The gene Gm30561 not found in the database. Append empty values.
The gene Gm30565 not found in the database. Append empty values.
The gene Gm30569 not found in the database. Append empty values.
The gene Gm30572 not foun

The gene Gm30935 not found in the database. Append empty values.
The gene Gm30936 not found in the database. Append empty values.
The gene Gm30939 not found in the database. Append empty values.
The gene Gm30940 not found in the database. Append empty values.
The gene Gm30942 not found in the database. Append empty values.
The gene Gm30943 not found in the database. Append empty values.
The gene Gm30944 not found in the database. Append empty values.
The gene Gm30949 not found in the database. Append empty values.
The gene Gm30953 not found in the database. Append empty values.
The gene Gm30956 not found in the database. Append empty values.
The gene Gm30958 not found in the database. Append empty values.
The gene Gm30959 not found in the database. Append empty values.
The gene Gm30963 not found in the database. Append empty values.
The gene Gm30967 not found in the database. Append empty values.
The gene Gm30972 not found in the database. Append empty values.
The gene Gm30973 not foun

The gene Gm31288 not found in the database. Append empty values.
The gene Gm31289 not found in the database. Append empty values.
The gene Gm31290 not found in the database. Append empty values.
The gene Gm31291 not found in the database. Append empty values.
The gene Gm31292 not found in the database. Append empty values.
The gene Gm31295 not found in the database. Append empty values.
The gene Gm31298 not found in the database. Append empty values.
The gene Gm31301 not found in the database. Append empty values.
The gene Gm31302 not found in the database. Append empty values.
The gene Gm31308 not found in the database. Append empty values.
The gene Gm31309 not found in the database. Append empty values.
The gene Gm31313 not found in the database. Append empty values.
The gene Gm31317 not found in the database. Append empty values.
The gene Gm31318 not found in the database. Append empty values.
The gene Gm31319 not found in the database. Append empty values.
The gene Gm31320 not foun

The gene Gm31646 not found in the database. Append empty values.
The gene Gm31647 not found in the database. Append empty values.
The gene Gm31648 not found in the database. Append empty values.
The gene Gm31649 not found in the database. Append empty values.
The gene Gm31650 not found in the database. Append empty values.
The gene Gm31652 not found in the database. Append empty values.
The gene Gm31655 not found in the database. Append empty values.
The gene Gm31656 not found in the database. Append empty values.
The gene Gm31657 not found in the database. Append empty values.
The gene Gm31660 not found in the database. Append empty values.
The gene Gm31661 not found in the database. Append empty values.
The gene Gm31665 not found in the database. Append empty values.
The gene Gm31669 not found in the database. Append empty values.
The gene Gm31672 not found in the database. Append empty values.
The gene Gm31674 not found in the database. Append empty values.
The gene Gm31676 not foun

The gene Gm32006 not found in the database. Append empty values.
The gene Gm32007 not found in the database. Append empty values.
The gene Gm32008 not found in the database. Append empty values.
The gene Gm32009 not found in the database. Append empty values.
The gene Gm32010 not found in the database. Append empty values.
The gene Gm32011 not found in the database. Append empty values.
The gene Gm32013 not found in the database. Append empty values.
The gene Gm32015 not found in the database. Append empty values.
The gene Gm32016 not found in the database. Append empty values.
The gene Gm32018 not found in the database. Append empty values.
The gene Gm32024 not found in the database. Append empty values.
The gene Gm32026 not found in the database. Append empty values.
The gene Gm32028 not found in the database. Append empty values.
The gene Gm32035 not found in the database. Append empty values.
The gene Gm32039 not found in the database. Append empty values.
The gene Gm32040 not foun

The gene Gm32413 not found in the database. Append empty values.
The gene Gm32420 not found in the database. Append empty values.
The gene Gm32428 not found in the database. Append empty values.
The gene Gm32429 not found in the database. Append empty values.
The gene Gm32430 not found in the database. Append empty values.
The gene Gm32434 not found in the database. Append empty values.
The gene Gm32435 not found in the database. Append empty values.
The gene Gm32436 not found in the database. Append empty values.
The gene Gm32437 not found in the database. Append empty values.
The gene Gm3244 not found in the database. Append empty values.
The gene Gm32440 not found in the database. Append empty values.
The gene Gm32444 not found in the database. Append empty values.
The gene Gm32448 not found in the database. Append empty values.
The gene Gm32449 not found in the database. Append empty values.
The gene Gm32450 not found in the database. Append empty values.
The gene Gm32457 not found

The gene Gm32760 not found in the database. Append empty values.
The gene Gm32762 not found in the database. Append empty values.
The gene Gm32768 not found in the database. Append empty values.
The gene Gm32770 not found in the database. Append empty values.
The gene Gm32771 not found in the database. Append empty values.
The gene Gm32773 not found in the database. Append empty values.
The gene Gm32776 not found in the database. Append empty values.
The gene Gm32777 not found in the database. Append empty values.
The gene Gm32778 not found in the database. Append empty values.
The gene Gm32788 not found in the database. Append empty values.
The gene Gm32790 not found in the database. Append empty values.
The gene Gm32791 not found in the database. Append empty values.
The gene Gm32792 not found in the database. Append empty values.
The gene Gm32793 not found in the database. Append empty values.
The gene Gm32794 not found in the database. Append empty values.
The gene Gm32796 not foun

The gene Gm33169 not found in the database. Append empty values.
The gene Gm33173 not found in the database. Append empty values.
The gene Gm33176 not found in the database. Append empty values.
The gene Gm33182 not found in the database. Append empty values.
The gene Gm33185 not found in the database. Append empty values.
The gene Gm33187 not found in the database. Append empty values.
The gene Gm33188 not found in the database. Append empty values.
The gene Gm33189 not found in the database. Append empty values.
The gene Gm33190 not found in the database. Append empty values.
The gene Gm33197 not found in the database. Append empty values.
The gene Gm33205 not found in the database. Append empty values.
The gene Gm33207 not found in the database. Append empty values.
The gene Gm33210 not found in the database. Append empty values.
The gene Gm33211 not found in the database. Append empty values.
The gene Gm33214 not found in the database. Append empty values.
The gene Gm33217 not foun

The gene Gm33534 not found in the database. Append empty values.
The gene Gm33535 not found in the database. Append empty values.
The gene Gm33536 not found in the database. Append empty values.
The gene Gm33539 not found in the database. Append empty values.
The gene Gm33541 not found in the database. Append empty values.
The gene Gm33542 not found in the database. Append empty values.
The gene Gm33548 not found in the database. Append empty values.
The gene Gm33551 not found in the database. Append empty values.
The gene Gm33553 not found in the database. Append empty values.
The gene Gm33554 not found in the database. Append empty values.
The gene Gm33555 not found in the database. Append empty values.
The gene Gm33557 not found in the database. Append empty values.
The gene Gm33560 not found in the database. Append empty values.
The gene Gm33562 not found in the database. Append empty values.
The gene Gm33563 not found in the database. Append empty values.
The gene Gm33564 not foun

The gene Gm33941 not found in the database. Append empty values.
The gene Gm33944 not found in the database. Append empty values.
The gene Gm33945 not found in the database. Append empty values.
The gene Gm33946 not found in the database. Append empty values.
The gene Gm33947 not found in the database. Append empty values.
The gene Gm33950 not found in the database. Append empty values.
The gene Gm33951 not found in the database. Append empty values.
The gene Gm33953 not found in the database. Append empty values.
The gene Gm33955 not found in the database. Append empty values.
The gene Gm33959 not found in the database. Append empty values.
The gene Gm33963 not found in the database. Append empty values.
The gene Gm33964 not found in the database. Append empty values.
The gene Gm33971 not found in the database. Append empty values.
The gene Gm33977 not found in the database. Append empty values.
The gene Gm33983 not found in the database. Append empty values.
The gene Gm33985 not foun

The gene Gm34285 not found in the database. Append empty values.
The gene Gm34286 not found in the database. Append empty values.
The gene Gm34287 not found in the database. Append empty values.
The gene Gm34289 not found in the database. Append empty values.
The gene Gm34290 not found in the database. Append empty values.
The gene Gm34291 not found in the database. Append empty values.
The gene Gm34292 not found in the database. Append empty values.
The gene Gm34296 not found in the database. Append empty values.
The gene Gm34303 not found in the database. Append empty values.
The gene Gm34305 not found in the database. Append empty values.
The gene Gm34306 not found in the database. Append empty values.
The gene Gm34311 not found in the database. Append empty values.
The gene Gm34313 not found in the database. Append empty values.
The gene Gm34315 not found in the database. Append empty values.
The gene Gm34316 not found in the database. Append empty values.
The gene Gm34317 not foun

The gene Gm34661 not found in the database. Append empty values.
The gene Gm34666 not found in the database. Append empty values.
The gene Gm34669 not found in the database. Append empty values.
The gene Gm34676 not found in the database. Append empty values.
The gene Gm34679 not found in the database. Append empty values.
The gene Gm34682 not found in the database. Append empty values.
The gene Gm34686 not found in the database. Append empty values.
The gene Gm34687 not found in the database. Append empty values.
The gene Gm34690 not found in the database. Append empty values.
The gene Gm34695 not found in the database. Append empty values.
The gene Gm34696 not found in the database. Append empty values.
The gene Gm34699 not found in the database. Append empty values.
The gene Gm34700 not found in the database. Append empty values.
The gene Gm34702 not found in the database. Append empty values.
The gene Gm34703 not found in the database. Append empty values.
The gene Gm34704 not foun

The gene Gm35001 not found in the database. Append empty values.
The gene Gm35002 not found in the database. Append empty values.
The gene Gm35015 not found in the database. Append empty values.
The gene Gm35017 not found in the database. Append empty values.
The gene Gm35018 not found in the database. Append empty values.
Multiple gene biotype exists for Gm35025. Merge the result.
The gene Gm35029 not found in the database. Append empty values.
The gene Gm35032 not found in the database. Append empty values.
The gene Gm35033 not found in the database. Append empty values.
The gene Gm35034 not found in the database. Append empty values.
Multiple gene biotype exists for Gm35037. Merge the result.
The gene Gm35039 not found in the database. Append empty values.
The gene Gm35043 not found in the database. Append empty values.
The gene Gm35048 not found in the database. Append empty values.
The gene Gm35049 not found in the database. Append empty values.
The gene Gm35051 not found in the d

The gene Gm35444 not found in the database. Append empty values.
The gene Gm35445 not found in the database. Append empty values.
The gene Gm35446 not found in the database. Append empty values.
The gene Gm35449 not found in the database. Append empty values.
The gene Gm35456 not found in the database. Append empty values.
The gene Gm35458 not found in the database. Append empty values.
The gene Gm35462 not found in the database. Append empty values.
The gene Gm35463 not found in the database. Append empty values.
The gene Gm35464 not found in the database. Append empty values.
The gene Gm35465 not found in the database. Append empty values.
The gene Gm35466 not found in the database. Append empty values.
The gene Gm35468 not found in the database. Append empty values.
The gene Gm35469 not found in the database. Append empty values.
The gene Gm35470 not found in the database. Append empty values.
The gene Gm35471 not found in the database. Append empty values.
The gene Gm35473 not foun

The gene Gm35894 not found in the database. Append empty values.
The gene Gm35895 not found in the database. Append empty values.
The gene Gm35899 not found in the database. Append empty values.
The gene Gm35902 not found in the database. Append empty values.
The gene Gm35906 not found in the database. Append empty values.
The gene Gm35907 not found in the database. Append empty values.
The gene Gm35908 not found in the database. Append empty values.
The gene Gm3591 not found in the database. Append empty values.
The gene Gm35911 not found in the database. Append empty values.
The gene Gm35913 not found in the database. Append empty values.
The gene Gm35915 not found in the database. Append empty values.
The gene Gm35916 not found in the database. Append empty values.
The gene Gm35919 not found in the database. Append empty values.
The gene Gm35925 not found in the database. Append empty values.
The gene Gm35930 not found in the database. Append empty values.
The gene Gm35932 not found

The gene Gm36281 not found in the database. Append empty values.
The gene Gm36284 not found in the database. Append empty values.
The gene Gm36286 not found in the database. Append empty values.
The gene Gm36289 not found in the database. Append empty values.
The gene Gm36292 not found in the database. Append empty values.
The gene Gm36297 not found in the database. Append empty values.
The gene Gm36299 not found in the database. Append empty values.
The gene Gm36300 not found in the database. Append empty values.
The gene Gm36303 not found in the database. Append empty values.
The gene Gm36304 not found in the database. Append empty values.
The gene Gm36305 not found in the database. Append empty values.
The gene Gm36311 not found in the database. Append empty values.
The gene Gm36313 not found in the database. Append empty values.
The gene Gm36316 not found in the database. Append empty values.
The gene Gm36317 not found in the database. Append empty values.
The gene Gm36320 not foun

The gene Gm36610 not found in the database. Append empty values.
The gene Gm36612 not found in the database. Append empty values.
The gene Gm36614 not found in the database. Append empty values.
The gene Gm36616 not found in the database. Append empty values.
The gene Gm36619 not found in the database. Append empty values.
The gene Gm36620 not found in the database. Append empty values.
The gene Gm36623 not found in the database. Append empty values.
The gene Gm36629 not found in the database. Append empty values.
The gene Gm36631 not found in the database. Append empty values.
The gene Gm36648 not found in the database. Append empty values.
The gene Gm36649 not found in the database. Append empty values.
The gene Gm36651 not found in the database. Append empty values.
The gene Gm36652 not found in the database. Append empty values.
The gene Gm36655 not found in the database. Append empty values.
The gene Gm36656 not found in the database. Append empty values.
The gene Gm36658 not foun

The gene Gm4238 not found in the database. Append empty values.
The gene Gm4262 not found in the database. Append empty values.
The gene Gm4274 not found in the database. Append empty values.
The gene Gm4311 not found in the database. Append empty values.
The gene Gm4325 not found in the database. Append empty values.
The gene Gm4371 not found in the database. Append empty values.
The gene Gm4458 not found in the database. Append empty values.
The gene Gm4461 not found in the database. Append empty values.
The gene Gm4466 not found in the database. Append empty values.
The gene Gm4477 not found in the database. Append empty values.
The gene Gm4521 not found in the database. Append empty values.
The gene Gm4532 not found in the database. Append empty values.
The gene Gm4549 not found in the database. Append empty values.
The gene Gm4581 not found in the database. Append empty values.
The gene Gm4604 not found in the database. Append empty values.
The gene Gm4633 not found in the databas

Multiple gene biotype exists for Gm6729. Merge the result.
The gene Gm6731 not found in the database. Append empty values.
The gene Gm6747 not found in the database. Append empty values.
The gene Gm6765 not found in the database. Append empty values.
The gene Gm6768 not found in the database. Append empty values.
The gene Gm6792 not found in the database. Append empty values.
The gene Gm6809 not found in the database. Append empty values.
The gene Gm684 not found in the database. Append empty values.
The gene Gm6846 not found in the database. Append empty values.
The gene Gm6860 not found in the database. Append empty values.
The gene Gm6904 not found in the database. Append empty values.
The gene Gm6910 not found in the database. Append empty values.
The gene Gm6917 not found in the database. Append empty values.
The gene Gm6936 not found in the database. Append empty values.
The gene Gm694 not found in the database. Append empty values.
The gene Gm6958 not found in the database. Appe

The gene Gm9534 not found in the database. Append empty values.
The gene Gm9538 not found in the database. Append empty values.
The gene Gm9546 not found in the database. Append empty values.
The gene Gm9548 not found in the database. Append empty values.
The gene Gm9551 not found in the database. Append empty values.
The gene Gm9557 not found in the database. Append empty values.
The gene Gm9564 not found in the database. Append empty values.
The gene Gm9567 not found in the database. Append empty values.
The gene Gm9612 not found in the database. Append empty values.
The gene Gm9620 not found in the database. Append empty values.
The gene Gm9631 not found in the database. Append empty values.
The gene Gm9683 not found in the database. Append empty values.
The gene Gm9734 not found in the database. Append empty values.
The gene Gm9763 not found in the database. Append empty values.
The gene Gm9769 not found in the database. Append empty values.
The gene Gm9770 not found in the databas

The gene Hmha1 not found in the database. Append empty values.
The gene Hn1 not found in the database. Append empty values.
The gene Hn1l not found in the database. Append empty values.
The gene Hpvc-ps not found in the database. Append empty values.
The gene Hrasls not found in the database. Append empty values.
The gene Hrsp12 not found in the database. Append empty values.
The gene I830012O16Rik not found in the database. Append empty values.
The gene I830127L07Rik not found in the database. Append empty values.
The gene Ick not found in the database. Append empty values.
The gene Ict1 not found in the database. Append empty values.
Multiple gene biotype exists for Ifi30. Merge the result.
The gene Ifltd1 not found in the database. Append empty values.
The gene Igh not found in the database. Append empty values.
The gene Igj not found in the database. Append empty values.
The gene Igk not found in the database. Append empty values.
The gene Igl not found in the database. Append empt

The gene LOC102634018 not found in the database. Append empty values.
The gene LOC102634026 not found in the database. Append empty values.
The gene LOC102634042 not found in the database. Append empty values.
The gene LOC102634048 not found in the database. Append empty values.
The gene LOC102634058 not found in the database. Append empty values.
The gene LOC102634065 not found in the database. Append empty values.
The gene LOC102634072 not found in the database. Append empty values.
The gene LOC102634078 not found in the database. Append empty values.
The gene LOC102634085 not found in the database. Append empty values.
The gene LOC102634090 not found in the database. Append empty values.
The gene LOC102634097 not found in the database. Append empty values.
The gene LOC102634100 not found in the database. Append empty values.
The gene LOC102634128 not found in the database. Append empty values.
The gene LOC102634132 not found in the database. Append empty values.
The gene LOC10263413

The gene LOC102638700 not found in the database. Append empty values.
The gene LOC102638745 not found in the database. Append empty values.
The gene LOC102638754 not found in the database. Append empty values.
The gene LOC102638773 not found in the database. Append empty values.
The gene LOC102638775 not found in the database. Append empty values.
The gene LOC102638785 not found in the database. Append empty values.
The gene LOC102638848 not found in the database. Append empty values.
The gene LOC102638850 not found in the database. Append empty values.
The gene LOC102638878 not found in the database. Append empty values.
The gene LOC102638884 not found in the database. Append empty values.
The gene LOC102638888 not found in the database. Append empty values.
The gene LOC102638890 not found in the database. Append empty values.
The gene LOC102638891 not found in the database. Append empty values.
The gene LOC102638935 not found in the database. Append empty values.
The gene LOC10263894

The gene LOC105242422 not found in the database. Append empty values.
The gene LOC105242423 not found in the database. Append empty values.
The gene LOC105242425 not found in the database. Append empty values.
The gene LOC105242426 not found in the database. Append empty values.
The gene LOC105242427 not found in the database. Append empty values.
The gene LOC105242430 not found in the database. Append empty values.
The gene LOC105242431 not found in the database. Append empty values.
The gene LOC105242432 not found in the database. Append empty values.
The gene LOC105242433 not found in the database. Append empty values.
The gene LOC105242435 not found in the database. Append empty values.
The gene LOC105242436 not found in the database. Append empty values.
The gene LOC105242439 not found in the database. Append empty values.
The gene LOC105242440 not found in the database. Append empty values.
The gene LOC105242441 not found in the database. Append empty values.
The gene LOC10524244

The gene LOC105242982 not found in the database. Append empty values.
The gene LOC105242983 not found in the database. Append empty values.
The gene LOC105242985 not found in the database. Append empty values.
The gene LOC105242987 not found in the database. Append empty values.
The gene LOC105242994 not found in the database. Append empty values.
The gene LOC105242997 not found in the database. Append empty values.
The gene LOC105243000 not found in the database. Append empty values.
The gene LOC105243004 not found in the database. Append empty values.
The gene LOC105243005 not found in the database. Append empty values.
The gene LOC105243006 not found in the database. Append empty values.
The gene LOC105243007 not found in the database. Append empty values.
The gene LOC105243008 not found in the database. Append empty values.
The gene LOC105243010 not found in the database. Append empty values.
The gene LOC105243011 not found in the database. Append empty values.
The gene LOC10524301

The gene LOC105243544 not found in the database. Append empty values.
The gene LOC105243545 not found in the database. Append empty values.
The gene LOC105243546 not found in the database. Append empty values.
The gene LOC105243547 not found in the database. Append empty values.
The gene LOC105243548 not found in the database. Append empty values.
The gene LOC105243550 not found in the database. Append empty values.
The gene LOC105243551 not found in the database. Append empty values.
The gene LOC105243552 not found in the database. Append empty values.
The gene LOC105243553 not found in the database. Append empty values.
The gene LOC105243554 not found in the database. Append empty values.
The gene LOC105243555 not found in the database. Append empty values.
The gene LOC105243556 not found in the database. Append empty values.
The gene LOC105243557 not found in the database. Append empty values.
The gene LOC105243561 not found in the database. Append empty values.
The gene LOC10524356

The gene LOC105244122 not found in the database. Append empty values.
The gene LOC105244123 not found in the database. Append empty values.
The gene LOC105244124 not found in the database. Append empty values.
The gene LOC105244127 not found in the database. Append empty values.
The gene LOC105244128 not found in the database. Append empty values.
The gene LOC105244130 not found in the database. Append empty values.
The gene LOC105244131 not found in the database. Append empty values.
The gene LOC105244133 not found in the database. Append empty values.
The gene LOC105244134 not found in the database. Append empty values.
The gene LOC105244136 not found in the database. Append empty values.
The gene LOC105244137 not found in the database. Append empty values.
The gene LOC105244138 not found in the database. Append empty values.
The gene LOC105244139 not found in the database. Append empty values.
The gene LOC105244140 not found in the database. Append empty values.
The gene LOC10524414

The gene LOC105244544 not found in the database. Append empty values.
The gene LOC105244546 not found in the database. Append empty values.
The gene LOC105244548 not found in the database. Append empty values.
The gene LOC105244549 not found in the database. Append empty values.
The gene LOC105244552 not found in the database. Append empty values.
The gene LOC105244554 not found in the database. Append empty values.
The gene LOC105244557 not found in the database. Append empty values.
The gene LOC105244560 not found in the database. Append empty values.
The gene LOC105244561 not found in the database. Append empty values.
The gene LOC105244562 not found in the database. Append empty values.
The gene LOC105244564 not found in the database. Append empty values.
The gene LOC105244566 not found in the database. Append empty values.
The gene LOC105244567 not found in the database. Append empty values.
The gene LOC105244568 not found in the database. Append empty values.
The gene LOC10524456

The gene LOC105245004 not found in the database. Append empty values.
The gene LOC105245005 not found in the database. Append empty values.
The gene LOC105245006 not found in the database. Append empty values.
The gene LOC105245007 not found in the database. Append empty values.
The gene LOC105245008 not found in the database. Append empty values.
The gene LOC105245009 not found in the database. Append empty values.
The gene LOC105245010 not found in the database. Append empty values.
The gene LOC105245011 not found in the database. Append empty values.
The gene LOC105245013 not found in the database. Append empty values.
The gene LOC105245014 not found in the database. Append empty values.
The gene LOC105245016 not found in the database. Append empty values.
The gene LOC105245017 not found in the database. Append empty values.
The gene LOC105245018 not found in the database. Append empty values.
The gene LOC105245020 not found in the database. Append empty values.
The gene LOC10524502

The gene LOC105245443 not found in the database. Append empty values.
The gene LOC105245444 not found in the database. Append empty values.
The gene LOC105245448 not found in the database. Append empty values.
The gene LOC105245450 not found in the database. Append empty values.
The gene LOC105245452 not found in the database. Append empty values.
The gene LOC105245453 not found in the database. Append empty values.
The gene LOC105245456 not found in the database. Append empty values.
The gene LOC105245457 not found in the database. Append empty values.
The gene LOC105245459 not found in the database. Append empty values.
The gene LOC105245460 not found in the database. Append empty values.
The gene LOC105245462 not found in the database. Append empty values.
The gene LOC105245463 not found in the database. Append empty values.
The gene LOC105245467 not found in the database. Append empty values.
The gene LOC105245470 not found in the database. Append empty values.
The gene LOC10524547

The gene LOC105245944 not found in the database. Append empty values.
The gene LOC105245946 not found in the database. Append empty values.
The gene LOC105245947 not found in the database. Append empty values.
The gene LOC105245948 not found in the database. Append empty values.
The gene LOC105245949 not found in the database. Append empty values.
The gene LOC105245951 not found in the database. Append empty values.
The gene LOC105245952 not found in the database. Append empty values.
The gene LOC105245953 not found in the database. Append empty values.
The gene LOC105245955 not found in the database. Append empty values.
The gene LOC105245957 not found in the database. Append empty values.
The gene LOC105245959 not found in the database. Append empty values.
The gene LOC105245960 not found in the database. Append empty values.
The gene LOC105245961 not found in the database. Append empty values.
The gene LOC105245962 not found in the database. Append empty values.
The gene LOC10524596

The gene LOC105246375 not found in the database. Append empty values.
The gene LOC105246377 not found in the database. Append empty values.
The gene LOC105246380 not found in the database. Append empty values.
The gene LOC105246383 not found in the database. Append empty values.
The gene LOC105246385 not found in the database. Append empty values.
The gene LOC105246386 not found in the database. Append empty values.
The gene LOC105246388 not found in the database. Append empty values.
The gene LOC105246389 not found in the database. Append empty values.
The gene LOC105246390 not found in the database. Append empty values.
The gene LOC105246393 not found in the database. Append empty values.
The gene LOC105246394 not found in the database. Append empty values.
The gene LOC105246395 not found in the database. Append empty values.
The gene LOC105246396 not found in the database. Append empty values.
The gene LOC105246397 not found in the database. Append empty values.
The gene LOC10524639

The gene LOC105246831 not found in the database. Append empty values.
The gene LOC105246832 not found in the database. Append empty values.
The gene LOC105246835 not found in the database. Append empty values.
The gene LOC105246836 not found in the database. Append empty values.
The gene LOC105246839 not found in the database. Append empty values.
The gene LOC105246840 not found in the database. Append empty values.
The gene LOC105246841 not found in the database. Append empty values.
The gene LOC105246843 not found in the database. Append empty values.
The gene LOC105246844 not found in the database. Append empty values.
The gene LOC105246845 not found in the database. Append empty values.
The gene LOC105246846 not found in the database. Append empty values.
The gene LOC105246847 not found in the database. Append empty values.
The gene LOC105246848 not found in the database. Append empty values.
The gene LOC105246849 not found in the database. Append empty values.
The gene LOC10524685

The gene LOC432842 not found in the database. Append empty values.
The gene LOC433198 not found in the database. Append empty values.
The gene LOC436100 not found in the database. Append empty values.
The gene LOC545466 not found in the database. Append empty values.
The gene LOC545966 not found in the database. Append empty values.
The gene LOC547334 not found in the database. Append empty values.
The gene LOC628147 not found in the database. Append empty values.
The gene LOC73899 not found in the database. Append empty values.
The gene Lace1 not found in the database. Append empty values.
The gene Large not found in the database. Append empty values.
The gene Ldoc1l not found in the database. Append empty values.
The gene Lect1 not found in the database. Append empty values.
The gene Lincenc1 not found in the database. Append empty values.
The gene Lincpint not found in the database. Append empty values.
The gene Lincred1 not found in the database. Append empty values.
The gene Lincr

The gene Pcnxl2 not found in the database. Append empty values.
The gene Pcnxl3 not found in the database. Append empty values.
The gene Pcnxl4 not found in the database. Append empty values.
The gene Pddc1 not found in the database. Append empty values.
The gene Peo1 not found in the database. Append empty values.
The gene Pet2 not found in the database. Append empty values.
The gene Phf5b-ps not found in the database. Append empty values.
The gene Phxr4 not found in the database. Append empty values.
The gene Pla2g16 not found in the database. Append empty values.
The gene Pomc-ps1 not found in the database. Append empty values.
The gene Ppap2a not found in the database. Append empty values.
The gene Ppap2b not found in the database. Append empty values.
The gene Ppap2c not found in the database. Append empty values.
The gene Ppapdc1a not found in the database. Append empty values.
The gene Ppapdc1b not found in the database. Append empty values.
The gene Ppapdc2 not found in the dat

The gene Taf4a not found in the database. Append empty values.
The gene Tbrg3 not found in the database. Append empty values.
The gene Tceb1 not found in the database. Append empty values.
The gene Tceb2 not found in the database. Append empty values.
The gene Tceb3 not found in the database. Append empty values.
The gene Tcra not found in the database. Append empty values.
The gene Tcrb not found in the database. Append empty values.
The gene Tcrg not found in the database. Append empty values.
The gene Tdpx-ps1 not found in the database. Append empty values.
The gene Tenc1 not found in the database. Append empty values.
Multiple gene biotype exists for Terc. Merge the result.
The gene Tex13 not found in the database. Append empty values.
The gene Tex40 not found in the database. Append empty values.
The gene Tfdp1-ps not found in the database. Append empty values.
The gene Tldc1 not found in the database. Append empty values.
The gene Tmem110 not found in the database. Append empty v

In [16]:
Mop_gene_df = pd.DataFrame.from_dict(Mop_gene_dict)
Mop_gene_df = Mop_gene_df.set_index('gene')

Mop_gene_df

Unnamed: 0_level_0,chr,start,end,gene_biotype,coding_strand
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0610005C13Rik,7,45567794,45575327,antisense,-1
0610007P14Rik,,,,,
0610009B22Rik,11,51685386,51688874,protein_coding,-1
0610009E02Rik,2,26445696,26459390,processed_transcript,1
0610009L18Rik,11,120348678,120351190,bidirectional_promoter_lncRNA,1
...,...,...,...,...,...
Zyx,6,42349630,42360213,protein_coding,1
Zzef1,11,72796226,72927120,protein_coding,1
Zzz3,3,152395473,152462826,protein_coding,1
a,2,154791402,155051012,protein_coding,1


In [18]:
Mop_gene_df.dropna()

Unnamed: 0_level_0,chr,start,end,gene_biotype,coding_strand
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0610005C13Rik,7,45567794,45575327,antisense,-1
0610009B22Rik,11,51685386,51688874,protein_coding,-1
0610009E02Rik,2,26445696,26459390,processed_transcript,1
0610009L18Rik,11,120348678,120351190,bidirectional_promoter_lncRNA,1
0610010F05Rik,11,23564961,23633639,protein_coding,-1
...,...,...,...,...,...
Zyg11b,4,108229724,108301096,protein_coding,-1
Zyx,6,42349630,42360213,protein_coding,1
Zzef1,11,72796226,72927120,protein_coding,1
Zzz3,3,152395473,152462826,protein_coding,1


# 4. Add formated column for downstream analysis

In [3]:
#pool_folder

'L:\\Shiwei\\DNA_MERFISH_analysis\\SMARTer_nuclei_MOp'

In [19]:
# load gene annotation (covering all genes from the SMART-seq) for chr locus
#pool_folder = r'L:\Shiwei\DNA_MERFISH_analysis\SMARTer_nuclei_MOp'
#clean_gene_df = pd.read_csv(os.path.join(pool_folder, "MOp_smart_sn_gene_chr_info_NEW_from_transcriptome.csv"), index_col=0)

clean_gene_df = Mop_gene_df

# Append a column for gene length
clean_gene_df['length']=clean_gene_df['end']-clean_gene_df['start']
clean_gene_df=clean_gene_df.dropna()

clean_gene_df['start'] = clean_gene_df['start'].map(lambda x: int(x))
clean_gene_df['end'] = clean_gene_df['end'].map(lambda x: int(x))
clean_gene_df['length'] = clean_gene_df['length'].map(lambda x: int(x))

print(len(clean_gene_df))
clean_gene_df.head()

24596


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_gene_df['start'] = clean_gene_df['start'].map(lambda x: int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_gene_df['end'] = clean_gene_df['end'].map(lambda x: int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_gene_df['length'] = clean_gene_df['length'].map(lambda x: int(x

Unnamed: 0_level_0,chr,start,end,gene_biotype,coding_strand,length
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0610005C13Rik,7,45567794,45575327,antisense,-1,7533
0610009B22Rik,11,51685386,51688874,protein_coding,-1,3488
0610009E02Rik,2,26445696,26459390,processed_transcript,1,13694
0610009L18Rik,11,120348678,120351190,bidirectional_promoter_lncRNA,1,2512
0610010F05Rik,11,23564961,23633639,protein_coding,-1,68678


In [9]:
_gene_df['chr']

'7'

In [20]:
# add a genomic_position column to facilitate downstream analysis
_chr_loc_list = []
for _gene_df in clean_gene_df.iloc():
    _chr_loc = 'chr' + _gene_df['chr'] + '_' + str(_gene_df['start']) + '_' + str(_gene_df['end'])
    _chr_loc_list.append(_chr_loc)
    
    
    
clean_gene_df_new = clean_gene_df.copy()
clean_gene_df_new['genomic_position'] = _chr_loc_list

clean_gene_df_new

Unnamed: 0_level_0,chr,start,end,gene_biotype,coding_strand,length,genomic_position
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0610005C13Rik,7,45567794,45575327,antisense,-1,7533,chr7_45567794_45575327
0610009B22Rik,11,51685386,51688874,protein_coding,-1,3488,chr11_51685386_51688874
0610009E02Rik,2,26445696,26459390,processed_transcript,1,13694,chr2_26445696_26459390
0610009L18Rik,11,120348678,120351190,bidirectional_promoter_lncRNA,1,2512,chr11_120348678_120351190
0610010F05Rik,11,23564961,23633639,protein_coding,-1,68678,chr11_23564961_23633639
...,...,...,...,...,...,...,...
Zyg11b,4,108229724,108301096,protein_coding,-1,71372,chr4_108229724_108301096
Zyx,6,42349630,42360213,protein_coding,1,10583,chr6_42349630_42360213
Zzef1,11,72796226,72927120,protein_coding,1,130894,chr11_72796226_72927120
Zzz3,3,152395473,152462826,protein_coding,1,67353,chr3_152395473_152462826


In [21]:
# save the queried Mop gene df

clean_gene_df_new.to_csv(os.path.join(pool_folder, "MOp_smart_sn_gene_chr_info_NEW_from_transcriptome_FORMAT.csv"), index=True)