In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import arviz as az
import pymc as pm
import corner
import scipy.stats as st
from statistics import mean
from collections import Counter

In [2]:
file_path = 'Datasets/mt_genes_metadata.csv'

# Gene metadata table; only its protein-coding rows are kept in pro_code.
met = pd.read_csv(file_path)

pro_code=met[met['gene_type']=='protein_coding']

# Gene panel used by every later cell. The 13 names are listed explicitly;
# pro_code['gene_name'] is not used to build the panel.
intpro=['MT-ND1','MT-ND2','MT-ND3','MT-ND4','MT-ND4L','MT-ND5','MT-ND6', 'MT-CO1', 'MT-CO2','MT-CO3', 'MT-ATP6', 'MT-ATP8', 'MT-CYB']
intpro

['MT-ND1',
 'MT-ND2',
 'MT-ND3',
 'MT-ND4',
 'MT-ND4L',
 'MT-ND5',
 'MT-ND6',
 'MT-CO1',
 'MT-CO2',
 'MT-CO3',
 'MT-ATP6',
 'MT-ATP8',
 'MT-CYB']

In [3]:
def _load_gene_counts(csv_path):
    """Read one per-sample CSV table and return it without its first column.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of the file except the first one.
        NOTE(review): the first column is presumably a saved row index or
        cell identifier -- confirm against the files.
    """
    raw = pd.read_csv(csv_path)
    # Non-inplace drop, so re-running the cell never mutates a frame twice.
    return raw.drop(columns=raw.columns[0])


# Donor 1, CD4: full table and the gene-panel columns only.
file_path = 'Datasets/Donor1_CD4_Genes.csv'
p14 = _load_gene_counts(file_path)
pat1_4int = p14[intpro]

# Donor 1, CD8
file_path = 'Datasets/Donor1_CD8_Genes.csv'
p18 = _load_gene_counts(file_path)
pat1_8int = p18[intpro]

# Donor 2, CD4
file_path = 'Datasets/Donor2_CD4_Genes.csv'
p24 = _load_gene_counts(file_path)
pat2_4int = p24[intpro]

# Donor 2, CD8
file_path = 'Datasets/Donor2_CD8_Genes.csv'
p28 = _load_gene_counts(file_path)
pat2_8int = p28[intpro]

# Row-wise stack of the four full tables (row labels repeat across samples).
total=pd.concat([p14,p18,p24,p28])

# Same stack restricted to the gene panel.
total_int=pd.concat([pat1_4int,pat1_8int, pat2_4int,pat2_8int])

# Total number of rows across the four samples.
totaln=len(total_int)


In [4]:
# Column-wise sums over the pooled table: one total per gene column.
total_sampto = total.sum(axis=0)
# Grand total: the sum of all per-column totals.
totalgene = total_sampto.sum()

In [5]:
# Number of rows in each per-sample gene-panel table
p14n, p18n, p24n, p28n = (
    len(panel) for panel in (pat1_4int, pat1_8int, pat2_4int, pat2_8int)
)

In [6]:
# Per-row totals: the sum across all columns of each row, one Series per sample
p14_allsampto, p18_allsampto, p24_allsampto, p28_allsampto = (
    counts.sum(axis="columns") for counts in (p14, p18, p24, p28)
)

Normalising the data, then computing the unbiased estimate of the negative binomial proportion $p$

In [43]:
r=1000


def _nb_proportion_estimates(counts, totals, genes, r):
    """Average the per-row proportion estimate (r-1)/(x+r-1) for each gene.

    For every row the gene count is rescaled to x = count / ((total - count) / r),
    i.e. the value the count would take if the remaining counts of that row
    summed to ``r``. The estimate (r-1)/(x+r-1) is then averaged over rows.
    NOTE(review): the notebook heading describes this as the unbiased estimate
    of the negative binomial proportion -- confirm the parametrisation.

    Parameters
    ----------
    counts : pandas.DataFrame
        Table with one column per gene and one row per observation.
    totals : pandas.Series
        Per-row totals of ``counts``, sharing its index.
    genes : iterable of str
        Column names to process.
    r : int or float
        Normalisation target for the remaining (non-gene) counts of a row.

    Returns
    -------
    dict
        Maps each gene name to the mean of its per-row estimates.

    Raises
    ------
    statistics.StatisticsError
        If ``counts`` has no rows.
    """
    estimates = {}
    for gene in genes:
        gene_counts = counts[gene]
        norm_cons = (totals - gene_counts) / r  # individual normalisation constant
        x = gene_counts / norm_cons
        uem = (r - 1) / (x + r - 1)
        # statistics.mean over the numpy scalars keeps the exact-rounding
        # average and NaN propagation of a plain Python loop.
        estimates[gene] = mean(list(uem.to_numpy()))
    return estimates


uem14_list=_nb_proportion_estimates(p14,p14_allsampto,intpro,r)
uem18_list=_nb_proportion_estimates(p18,p18_allsampto,intpro,r)
uem24_list=_nb_proportion_estimates(p24,p24_allsampto,intpro,r)
uem28_list=_nb_proportion_estimates(p28,p28_allsampto,intpro,r)
uem28_list

{'MT-ND1': 0.9973631963764747,
 'MT-ND2': 0.9988064883326756,
 'MT-ND3': 0.997476549325719,
 'MT-ND4': 0.9986610119965768,
 'MT-ND4L': 0.9968742389123535,
 'MT-ND5': 0.9981162122324965,
 'MT-ND6': 0.9994244858081606,
 'MT-CO1': 0.9941385071004302,
 'MT-CO2': 0.9929643507462625,
 'MT-CO3': 0.9947905945989968,
 'MT-ATP6': 0.9991827989556595,
 'MT-ATP8': 0.9958575966868964,
 'MT-CYB': 0.9949783189737844}

χ² goodness-of-fit test. H0: the normalised counts follow a negative binomial distribution; H1: they do not.

In [60]:
#p1_4
# Chi-square style statistic per gene for donor 1 / CD4: every normalised
# count is compared with the value expected under the estimated proportion.
r=1000  # normalisation constant; fixed, so set once instead of once per row
df14=p14n-1
cv=st.chi2.ppf(0.99,df14)  # critical value at the 0.99 quantile
chi1_p14_ts=[]
chi1_p14_res=[]
row_totals=p14_allsampto.to_numpy()
for j in intpro:
    p=uem14_list[j]
    e=r*(1-p)/p  # expected value depends only on the gene: computed once
    ts=0
    # Positional iteration over numpy scalars: same arithmetic and same
    # accumulation order as indexing row by row, without label lookups.
    for row_total,gene_count in zip(row_totals,p14[j].to_numpy()):
        norm_cons=(row_total-gene_count)/r  # individual normalisation constant
        o=gene_count/norm_cons
        ts+=(e-o)**2/e
    chi1_p14_ts.append(ts)
    chi1_p14_res.append(ts-cv)  # positive: statistic exceeds the critical value
chi_p14_res=dict(zip(intpro,chi1_p14_res))
chi_p14_res

{'MT-ND1': -259.1006299814303,
 'MT-ND2': -994.5354100216814,
 'MT-ND3': -1110.4332543286637,
 'MT-ND4': -1385.0219285706173,
 'MT-ND4L': 851.3411864570539,
 'MT-ND5': -1156.954331634691,
 'MT-ND6': -1561.9014019043182,
 'MT-CO1': 3559.863627739884,
 'MT-CO2': 4969.570977377631,
 'MT-CO3': 3287.834732136768,
 'MT-ATP6': -1839.3653492497158,
 'MT-ATP8': 3207.6709603345557,
 'MT-CYB': 1849.5343145344264}

In [61]:
#p1_8
# Chi-square style statistic per gene for donor 1 / CD8: every normalised
# count is compared with the value expected under the estimated proportion.
r=1000  # normalisation constant; fixed, so set once instead of once per row
df18=p18n-1
cv=st.chi2.ppf(0.99,df18)  # critical value at the 0.99 quantile
chi1_p18_ts=[]
chi1_p18_res=[]
row_totals=p18_allsampto.to_numpy()
for j in intpro:
    p=uem18_list[j]
    e=r*(1-p)/p  # expected value depends only on the gene: computed once
    ts=0
    # Positional iteration over numpy scalars: same arithmetic and same
    # accumulation order as indexing row by row, without label lookups.
    for row_total,gene_count in zip(row_totals,p18[j].to_numpy()):
        norm_cons=(row_total-gene_count)/r  # individual normalisation constant
        o=gene_count/norm_cons
        ts+=(e-o)**2/e
    chi1_p18_ts.append(ts)
    chi1_p18_res.append(ts-cv)  # positive: statistic exceeds the critical value
chi_p18_res=dict(zip(intpro,chi1_p18_res))
chi_p18_res

{'MT-ND1': -1086.2564879612235,
 'MT-ND2': -1227.267873724294,
 'MT-ND3': -1285.5864712820774,
 'MT-ND4': -1399.139664297534,
 'MT-ND4L': -462.6046495251087,
 'MT-ND5': -927.546691428279,
 'MT-ND6': -1259.37664232384,
 'MT-CO1': 960.0339049903091,
 'MT-CO2': 823.0536060287031,
 'MT-CO3': -51.17586855621357,
 'MT-ATP6': -1433.6842835822417,
 'MT-ATP8': 470.0422335468329,
 'MT-CYB': -36.365668770027696}

In [62]:
#p2_4
# Chi-square style statistic per gene for donor 2 / CD4: every normalised
# count is compared with the value expected under the estimated proportion.
r=1000  # normalisation constant; fixed, so set once instead of once per row
df24=p24n-1
cv=st.chi2.ppf(0.99,df24)  # critical value at the 0.99 quantile
chi1_p24_ts=[]
chi1_p24_res=[]
row_totals=p24_allsampto.to_numpy()
for j in intpro:
    p=uem24_list[j]
    e=r*(1-p)/p  # expected value depends only on the gene: computed once
    ts=0
    # Positional iteration over numpy scalars: same arithmetic and same
    # accumulation order as indexing row by row, without label lookups.
    for row_total,gene_count in zip(row_totals,p24[j].to_numpy()):
        norm_cons=(row_total-gene_count)/r  # individual normalisation constant
        o=gene_count/norm_cons
        ts+=(e-o)**2/e
    chi1_p24_ts.append(ts)
    chi1_p24_res.append(ts-cv)  # positive: statistic exceeds the critical value
chi_p24_res=dict(zip(intpro,chi1_p24_res))
chi_p24_res

{'MT-ND1': -1705.0639372465807,
 'MT-ND2': -2278.378735221855,
 'MT-ND3': -1845.2322835038221,
 'MT-ND4': -2501.2593313618245,
 'MT-ND4L': -1602.5596948634711,
 'MT-ND5': -2206.707468257585,
 'MT-ND6': -1875.4958358095714,
 'MT-CO1': 1955.0110015023083,
 'MT-CO2': 626.2349330615343,
 'MT-CO3': -583.5652924087162,
 'MT-ATP6': -2515.9541966190227,
 'MT-ATP8': -338.7471345521785,
 'MT-CYB': -213.1905640435939}

In [63]:
#p2_8
# Chi-square style statistic per gene for donor 2 / CD8: every normalised
# count is compared with the value expected under the estimated proportion.
r=1000  # normalisation constant; fixed, so set once instead of once per row
df28=p28n-1
cv=st.chi2.ppf(0.99,df28)  # critical value at the 0.99 quantile
chi1_p28_ts=[]
chi1_p28_res=[]
row_totals=p28_allsampto.to_numpy()
for j in intpro:
    p=uem28_list[j]
    e=r*(1-p)/p  # expected value depends only on the gene: computed once
    ts=0
    # Positional iteration over numpy scalars: same arithmetic and same
    # accumulation order as indexing row by row, without label lookups.
    for row_total,gene_count in zip(row_totals,p28[j].to_numpy()):
        norm_cons=(row_total-gene_count)/r  # individual normalisation constant
        o=gene_count/norm_cons
        ts+=(e-o)**2/e
    chi1_p28_ts.append(ts)
    chi1_p28_res.append(ts-cv)  # positive: statistic exceeds the critical value
chi_p28_res=dict(zip(intpro,chi1_p28_res))
chi_p28_res

{'MT-ND1': -476.8398241138193,
 'MT-ND2': -760.0681305097527,
 'MT-ND3': -583.9310585132043,
 'MT-ND4': -743.3713364365121,
 'MT-ND4L': -470.84439198819655,
 'MT-ND5': -655.6376203431214,
 'MT-ND6': -252.0406293457654,
 'MT-CO1': 709.0037794292825,
 'MT-CO2': 278.1469399556938,
 'MT-CO3': -89.20066488222665,
 'MT-ATP6': -961.1270513844944,
 'MT-ATP8': -101.73688722536463,
 'MT-CYB': 225.3741148597535}

In [64]:
# Turn each per-sample result dict into a one-column frame indexed by gene
# name, then place the four columns side by side.
chi_p14_resdf=pd.Series(chi_p14_res).to_frame('P1_4')
chi_p18_resdf=pd.Series(chi_p18_res).to_frame('P1_8')
chi_p24_resdf=pd.Series(chi_p24_res).to_frame('P2_4')
chi_p28_resdf=pd.Series(chi_p28_res).to_frame('P2_8')
overall_chi_res=pd.concat(
    [chi_p14_resdf,chi_p18_resdf,chi_p24_resdf,chi_p28_resdf],
    axis='columns',
)

In [65]:
# Each entry is (test statistic - critical value) for one gene and sample,
# so a positive entry means the statistic exceeded the critical value.
overall_chi_res

Unnamed: 0,P1_4,P1_8,P2_4,P2_8
MT-ND1,-259.10063,-1086.256488,-1705.063937,-476.839824
MT-ND2,-994.53541,-1227.267874,-2278.378735,-760.068131
MT-ND3,-1110.433254,-1285.586471,-1845.232284,-583.931059
MT-ND4,-1385.021929,-1399.139664,-2501.259331,-743.371336
MT-ND4L,851.341186,-462.60465,-1602.559695,-470.844392
MT-ND5,-1156.954332,-927.546691,-2206.707468,-655.63762
MT-ND6,-1561.901402,-1259.376642,-1875.495836,-252.040629
MT-CO1,3559.863628,960.033905,1955.011002,709.003779
MT-CO2,4969.570977,823.053606,626.234933,278.14694
MT-CO3,3287.834732,-51.175869,-583.565292,-89.200665


The cells below are unrefined, exploratory work.

In [71]:
#p1_4
# Exploratory variant of the donor 1 / CD4 statistic: it uses the raw gene
# counts as observations and a per-row r (row total minus the gene count)
# instead of counts normalised to a fixed r.
# NOTE(review): uem14t_list is not defined in any cell visible here -- confirm
# it is created before this cell when the notebook runs on a fresh kernel.
# NOTE(review): this cell rebinds the notebook-level names df14, cv and r.
df14=p14n-2
cv=st.chi2.ppf(0.95,df14)  # critical value at the 0.95 quantile
chi2_p14_ts=[]
chi2_p14_res=[]
for j in intpro:
    ts=0
    p=uem14t_list[j]
    for i in range(p14n):
        # Rows whose total is zero contribute nothing to the statistic.
        if p14_allsampto[i]==0:
            ts+=0
        else:
            r=p14_allsampto[i]-pat1_4int[j][i]  # row total excluding this gene
            o=pat1_4int[j][i]  # observed raw value for this gene and row
            e=r*(1-p)/p  # expected value; zero when r is zero, which makes the next division degenerate
            ts+=(e-o)**2/e
    chi2_p14_ts.append(ts)
    chi2_p14_res.append(ts-cv)  # statistic minus critical value
chi2_p14_res

[4111.197925329632,
 3043.734773332208,
 2419.2040819217004,
 1987.8232946176636,
 6403.878663851136,
 2852.984004617039,
 1548.3527769126326,
 13955.610254847617,
 15100.751960680309,
 11456.024761625824,
 984.9264501561092,
 11442.159314060757,
 9233.692961764658]