# PhyloP_mean Analysis 

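Calculate the mean phyloP score across exons for mouse, per gene annotation entry (each row of the knownGene table is one transcript, identified by its ENSMUST ID).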

In [2]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

# work with .bw files
import pyBigWig

### Big Wig PhyloP Data

In [3]:
# JSON config file that holds the input/output file locations read by the
# cells below. NOTE: this is an absolute, machine-specific path and has to be
# adjusted when the notebook is run on another machine.
configfile = "/Users/philippasteinberg/Desktop/Project-AgeExpressionConstraint/analysis/phyloP/config.json"
with open(configfile) as f:
    # Parse the whole file content into a dict-like config object.
    config = json.loads(f.read())

In [4]:
# Open the bigWig track whose path is stored under "Mouse_PhyloP" in the
# config; the resulting handle is queried for per-exon statistics later on.
mouse_phylop_path = config["Mouse_PhyloP"]
phylo_scores = pyBigWig.open(mouse_phylop_path)

In [5]:
# Load the table stored under "knownGene" in the config. The file is
# tab-separated and has no header row, so columns are numbered 0..N-1.
known_gene_path = config["knownGene"]
mus_coor = pd.read_csv(known_gene_path, sep="\t", header=None)

  mus_coor = pd.read_csv(config["knownGene"], header = None, sep = "\t")


In [6]:
# Column labels for the header-less table, listed in positional order:
# position k in this list becomes the name of integer column k.
known_gene_columns = [
    "names",
    "chrom",
    "strand",
    "txStart",
    "txEnd",
    "cdsStart",
    "cdsEnd",
    "exonCount",
    "exonStarts",
    "exonEnds",
    "proteinID",
    "alignID",
]
mus_coor = mus_coor.rename(columns=dict(enumerate(known_gene_columns)))

In [7]:
# Preview the first five rows to confirm the column names were applied.
mus_coor.head(n=5)

Unnamed: 0,names,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,proteinID,alignID
0,ENSMUST00000193812.1,chr1,+,3073252,3074322,3073252,3073252,1,3073252,3074322,,uc287gdb.1
1,ENSMUST00000082908.1,chr1,+,3102015,3102125,3102015,3102015,1,3102015,3102125,,uc287gdc.1
2,ENSMUST00000162897.1,chr1,-,3205900,3216344,3205900,3205900,2,32059003213608,32073173216344,,uc287gdd.1
3,ENSMUST00000159265.1,chr1,-,3206522,3215632,3206522,3206522,2,32065223213438,32073173215632,,uc007aet.2
4,ENSMUST00000070533.4,chr1,-,3214481,3671498,3216021,3671348,3,321448134217013670551,321696834219013671498,Q5GH67,uc007aeu.1


## Calculate the mean phyloP score across all exons per gene annotation entry (transcript)

In [8]:
# Per-transcript result accumulators filled by the loop in the next cell:
# chromosome name, transcript ID and mean exon phyloP score.
chrs, gene, phyloP = [], [], []
# Exon start/end coordinates; the loop in the next cell reassigns these for
# every transcript it processes.
start, end = [], []

In [9]:
# Compute, for every transcript (row) in mus_coor, the mean of its per-exon
# mean phyloP scores. Results are collected in three parallel lists:
#   chrs   - chromosome name of the transcript
#   gene   - transcript ID (the "names" column)
#   phyloP - mean over the transcript's exon means, or None when no exon
#            has a phyloP score

# Reset the accumulators here so that re-running this cell never appends a
# second copy of every transcript to lists left over from a previous run.
chrs = []
gene = []
phyloP = []

# Chromosome names present in the bigWig file (name -> length). Transcripts on
# a chromosome the file does not contain cannot be scored.
known_chroms = phylo_scores.chroms()

transcripts = zip(
    mus_coor["names"],
    mus_coor["chrom"],
    mus_coor["exonStarts"],
    mus_coor["exonEnds"],
)
for transcript_id, chrom, exon_starts_raw, exon_ends_raw in transcripts:
    chrs.append(chrom)
    gene.append(transcript_id)

    # Exon coordinates are stored as comma-separated strings. Skipping empty
    # fields copes with a trailing comma without dropping the last exon when
    # the trailing comma is missing.
    exon_starts = [int(pos) for pos in exon_starts_raw.split(",") if pos]
    exon_ends = [int(pos) for pos in exon_ends_raw.split(",") if pos]

    if chrom not in known_chroms:
        # No phyloP track for this chromosome: keep the lists aligned.
        phyloP.append(None)
        continue

    # Use bigWig stats to get the mean phyloP score of every exon.
    # The chromosome must be the transcript's own: the previous version
    # indexed mus_coor["chrom"] with the comprehension variable (the exon
    # index), which looked up the chromosome of an unrelated row.
    exon_means = [
        phylo_scores.stats(chrom, exon_start, exon_end, exact=True)[0]
        for exon_start, exon_end in zip(exon_starts, exon_ends)
    ]

    # Drop exons without any phyloP score (reported as None). Compare against
    # None explicitly so that a genuine mean of 0.0 is kept.
    scored_exon_means = [value for value in exon_means if value is not None]

    if scored_exon_means:
        # Mean across exons for this transcript.
        phyloP.append(np.mean(scored_exon_means))
    else:
        # Add None to keep phyloP the same length as chrs and gene.
        phyloP.append(None)

In [11]:
# Assemble the per-transcript results into a data frame: chromosome,
# transcript ID (with and without version suffix) and mean exon phyloP score.
mus_gene_mean = {
    "Chromosome name": chrs,
    "Transcript stable ID version": gene,
    "Mouse exon phyloP mean": phyloP,
}
mus_gene_mean_df = pd.DataFrame(mus_gene_mean)

# The unversioned ID is everything before the first "." of the versioned ID.
versioned_ids = mus_gene_mean_df["Transcript stable ID version"]
mus_gene_mean_df["Transcript stable ID"] = versioned_ids.str.split('.').str[0]

# Put the columns into their final order.
column_order = [
    "Chromosome name",
    "Transcript stable ID",
    "Transcript stable ID version",
    "Mouse exon phyloP mean",
]
mus_gene_mean_df = mus_gene_mean_df[column_order]
mus_gene_mean_df.head()

Unnamed: 0,Chromosome name,Transcript stable ID,Transcript stable ID version,Mouse exon phyloP mean
0,chr1,ENSMUST00000193812,ENSMUST00000193812.1,0.230201
1,chr1,ENSMUST00000082908,ENSMUST00000082908.1,0.027273
2,chr1,ENSMUST00000162897,ENSMUST00000162897.1,0.777432
3,chr1,ENSMUST00000159265,ENSMUST00000159265.1,0.778998
4,chr1,ENSMUST00000070533,ENSMUST00000070533.4,2.710427


In [12]:
# Write the result table to the CSV path stored under "PhyloP_mean" in the
# config. to_csv returns None when given a path, so `save` ends up as None.
phylop_mean_path = config["PhyloP_mean"]
save = mus_gene_mean_df.to_csv(phylop_mean_path)