In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy import stats

Dataset

In [2]:
# Load the SNP/trait table from the CSV file sitting next to the notebook
# and show it as the cell output.
data = pd.read_csv(filepath_or_buffer="snp_data.csv")
data

Unnamed: 0,ID,trait,snp1_A/G,snp2_C/T,snp3_G/T
0,APID0001,-0.909573,1.0,2.0,1.0
1,APID0002,-1.042890,0.0,1.0,2.0
2,APID0003,-0.564743,1.0,1.0,0.0
3,APID0004,0.281898,1.0,0.0,1.0
4,APID0005,-1.119414,0.0,2.0,1.0
...,...,...,...,...,...
663,APID0664,-1.395477,2.0,2.0,0.0
664,APID0665,0.484364,0.0,0.0,0.0
665,APID0666,0.273132,1.0,1.0,2.0
666,APID0667,-0.888352,1.0,1.0,0.0


In [3]:
# Discard every row that has a missing value in any column, then show
# what is left. Note that `data` is rebound to the filtered frame.
data = data.dropna(axis="index", how="any")
data

Unnamed: 0,ID,trait,snp1_A/G,snp2_C/T,snp3_G/T
0,APID0001,-0.909573,1.0,2.0,1.0
1,APID0002,-1.042890,0.0,1.0,2.0
2,APID0003,-0.564743,1.0,1.0,0.0
3,APID0004,0.281898,1.0,0.0,1.0
4,APID0005,-1.119414,0.0,2.0,1.0
...,...,...,...,...,...
663,APID0664,-1.395477,2.0,2.0,0.0
664,APID0665,0.484364,0.0,0.0,0.0
665,APID0666,0.273132,1.0,1.0,2.0
666,APID0667,-0.888352,1.0,1.0,0.0


SNP1 - snp1_A/G


In [4]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 1 genotype code is 0 (A2A2).
Mean_snp_1_A2A2 = data.loc[data['snp1_A/G'].eq(0), 'trait'].mean()
Mean_snp_1_A2A2

-0.03616904406639003

In [5]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 1 genotype code is 1 (A1A2).
Mean_snp_1_A1A2 = data.loc[data['snp1_A/G'].eq(1), 'trait'].mean()
Mean_snp_1_A1A2

-0.0918692706354515

In [6]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 1 genotype code is 2 (A1A1).
Mean_snp_1_A1A1 = data.loc[data['snp1_A/G'].eq(2), 'trait'].mean()
Mean_snp_1_A1A1

-0.0820235386440678

In [7]:
# a/d model for SNP 1.
# The origin is the midpoint between the two homozygote means (A2A2 and A1A1);
# a and d are the A1A1 and A1A2 means expressed as deviations from that origin.
O_snp1 = (Mean_snp_1_A2A2 + Mean_snp_1_A1A1) / 2
a_snp1 = Mean_snp_1_A1A1 - O_snp1  # a for SNP 1: A1A1 mean minus origin
d_snp1 = Mean_snp_1_A1A2 - O_snp1  # d for SNP 1: A1A2 mean minus origin

# Report the three quantities, one per line.
for label, value in (
    ('Origin for snp1:', O_snp1),
    ('a for snp1:', a_snp1),
    ('d for snp1:', d_snp1),
):
    print(label, value)

Origin for snp1: -0.05909629135522892
a for snp1: -0.022927247288838888
d for snp1: -0.03277297928022258


SNP2 - snp2_C/T

In [8]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 2 genotype code is 0 (A2A2).
Mean_snp_2_A2A2 = data.loc[data['snp2_C/T'].eq(0), 'trait'].mean()
Mean_snp_2_A2A2

0.13440007876146787

In [9]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 2 genotype code is 1 (A1A2).
Mean_snp_2_A1A2 = data.loc[data['snp2_C/T'].eq(1), 'trait'].mean()
Mean_snp_2_A1A2

-0.11927977410029497

In [10]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 2 genotype code is 2 (A1A1).
Mean_snp_2_A1A1 = data.loc[data['snp2_C/T'].eq(2), 'trait'].mean()
Mean_snp_2_A1A1

-0.3438396321782178

In [11]:
# a/d model for SNP 2.
# The origin is the midpoint between the two homozygote means (A2A2 and A1A1);
# a and d are the A1A1 and A1A2 means expressed as deviations from that origin.
O_snp2 = (Mean_snp_2_A2A2 + Mean_snp_2_A1A1) / 2
a_snp2 = Mean_snp_2_A1A1 - O_snp2  # a for SNP 2: A1A1 mean minus origin
d_snp2 = Mean_snp_2_A1A2 - O_snp2  # d for SNP 2: A1A2 mean minus origin

# Report the three quantities, one per line.
for label, value in (
    ('Origin for snp2:', O_snp2),
    ('a for snp2:', a_snp2),
    ('d for snp2:', d_snp2),
):
    print(label, value)

Origin for snp2: -0.10471977670837496
a for snp2: -0.23911985546984282
d for snp2: -0.014559997391920015


In [16]:
# NOTE(review): hard-coded value; nothing in this notebook shows how it was
# obtained. Presumably an allele frequency for SNP 2 (it is used as p in
# a + d * (1 - 2p) in the next cell) — confirm which allele it refers to and
# whether it should be derived from `data` instead.
p_snp2 = 0.412

In [18]:
# Allele substitution value for SNP 2: a + d * (1 - 2p), using the a and d
# deviations and the p value set in the earlier cells.
alpha_snp2 = a_snp2 + (1 - 2 * p_snp2) * d_snp2
print('allele substitution for snp2:', alpha_snp2)

allele substitution for snp2: -0.24168241501082074


SNP3 - snp3_G/T


In [None]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 3 genotype code is 0 (A2A2).
Mean_snp_3_A2A2 = data.loc[data['snp3_G/T'].eq(0), 'trait'].mean()
Mean_snp_3_A2A2

0.07890513753424659

In [None]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 3 genotype code is 1 (A1A2).
Mean_snp_3_A1A2 = data.loc[data['snp3_G/T'].eq(1), 'trait'].mean()
Mean_snp_3_A1A2

-0.15881398111801243

In [None]:
# Genotype coding: A2A2=0, A1A2=1, A1A1=2
# Mean trait value over the rows whose SNP 3 genotype code is 2 (A1A1).
Mean_snp_3_A1A1 = data.loc[data['snp3_G/T'].eq(2), 'trait'].mean()
Mean_snp_3_A1A1

-0.10262010512820512

In [None]:
# a/d model for SNP 3.
# The origin is the midpoint between the two homozygote means (A2A2 and A1A1);
# a and d are the A1A1 and A1A2 means expressed as deviations from that origin.
O_snp3 = (Mean_snp_3_A2A2 + Mean_snp_3_A1A1) / 2  # Origin for SNP 3
a_snp3 = (Mean_snp_3_A1A1 - O_snp3)  # a for SNP 3: A1A1 mean minus origin
d_snp3 = (Mean_snp_3_A1A2 - O_snp3)  # d for SNP 3: A1A2 mean minus origin

# Labels say snp3: the previous copy-pasted "snp2" labels made these results
# indistinguishable from the SNP 2 output.
print('Origin for snp3:', O_snp3)
print('a for snp3:', a_snp3)
print('d for snp3:', d_snp3)

Origin for snp2: -0.011857483796979262
a for snp2: -0.09076262133122585
d for snp2: -0.14695649732103316
