# Study 01 - Registered report: A coding-independent function of gene and pseudogene mRNAs regulates tumour biology
## Protocol 4 to 6

In [115]:
import os
import pandas as pd
import scipy
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.api import qqplot
import statsmodels
import numpy as np
from patsy import dmatrices

### Protocol 4: Western blot of cells transfected with siRNA

Confirmatory analysis plan <br />
This replication attempt will perform the following statistical analysis listed below.<br />
■Statistical Analysis:<br />
○Note: at the time of analysis, we will perform the Shapiro–Wilk test and generate a quantile–quantile plot to assess the normality of the data. We will also perform Levene’s test to assess homoscedasticity. If the data appear skewed we will perform the appropriate transformation inorder to proceed with the proposed statistical analysis. If this is not possible we will perform the equivalent non-parametric test.<br />


In [116]:
#read in data
df4=pd.read_csv("./data/protocol4_dataextracted.csv")
print(df4)

       Unnamed: 0  Repeat 1  Repeat 2  Repeat 3  Repeat 4  Repeat 5  Average   
0   Untransfected     1.190     1.092     0.636     1.105     1.434   1.0914  \
1          Si-LUC     1.000     1.000     1.000     1.000     1.000   1.0000   
2         Si-Pten     0.810     0.278     0.298     0.250     0.000   0.3272   
3       Si-Ptenp1     0.663     0.611     0.632     0.575     0.777   0.6516   
4  Si-Pten/Ptenp1     0.169     0.217     0.126     0.256     0.031   0.1598   

   St deviation  
0      0.289254  
1      0.000000  
2      0.295546  
3      0.077064  
4      0.087096  


In [117]:
df4_untrans = df4.loc[df4['Unnamed: 0'] == 'Untransfected']
df4_untrans = df4_untrans.iloc[:,1:6]
print("df4_untrans \t",df4_untrans)
df4_Si_LUC = df4.loc[df4['Unnamed: 0'] == 'Si-LUC']
df4_Si_LUC = df4_Si_LUC.iloc[:,1:6]
print("df4_Si_LUC \t",df4_Si_LUC)
df4_Si_Pten = df4.loc[df4['Unnamed: 0'] == 'Si-Pten']
df4_Si_Pten = df4_Si_Pten.iloc[:,1:6]
print("df4_Si_Pten \t",df4_Si_Pten)
df4_Si_Ptenp1 = df4.loc[df4['Unnamed: 0'] == 'Si-Ptenp1']
df4_Si_Ptenp1 = df4_Si_Ptenp1.iloc[:,1:6]
print("df4_Si_Ptenp1 \t",df4_Si_Ptenp1)
df4_Si_Pten_Ptenp1 = df4.loc[df4['Unnamed: 0'] == 'Si-Pten/Ptenp1']
df4_Si_Pten_Ptenp1 = df4_Si_Pten_Ptenp1.iloc[:,1:6]
print("df4_Si_Pten_Ptenp1",df4_Si_Pten_Ptenp1)

df4_untrans 	    Repeat 1  Repeat 2  Repeat 3  Repeat 4  Repeat 5
0      1.19     1.092     0.636     1.105     1.434
df4_Si_LUC 	    Repeat 1  Repeat 2  Repeat 3  Repeat 4  Repeat 5
1       1.0       1.0       1.0       1.0       1.0
df4_Si_Pten 	    Repeat 1  Repeat 2  Repeat 3  Repeat 4  Repeat 5
2      0.81     0.278     0.298      0.25       0.0
df4_Si_Ptenp1 	    Repeat 1  Repeat 2  Repeat 3  Repeat 4  Repeat 5
3     0.663     0.611     0.632     0.575     0.777
df4_Si_Pten_Ptenp1    Repeat 1  Repeat 2  Repeat 3  Repeat 4  Repeat 5
4     0.169     0.217     0.126     0.256     0.031


In [118]:
#Shapiro-Wilk test for normality
print("Shapiro untransfected: ",scipy.stats.shapiro(df4_untrans.iloc[0]))
print("Shapiro Si_LUC: ",scipy.stats.shapiro(df4_Si_LUC.iloc[0]))
print("Shapiro Si_Pten: ",scipy.stats.shapiro(df4_Si_Pten.iloc[0]))
print("Shapiro Si_Ptenp1: ",scipy.stats.shapiro(df4_Si_Ptenp1.iloc[0]))
print("Shapiro Si_Pten_Ptenp1: ",scipy.stats.shapiro(df4_Si_Pten_Ptenp1.iloc[0]))

Shapiro untransfected:  ShapiroResult(statistic=0.9171846508979797, pvalue=0.5119431018829346)
Shapiro Si_LUC:  ShapiroResult(statistic=1.0, pvalue=1.0)
Shapiro Si_Pten:  ShapiroResult(statistic=0.8656184673309326, pvalue=0.24913133680820465)
Shapiro Si_Ptenp1:  ShapiroResult(statistic=0.9072771072387695, pvalue=0.4514017403125763)
Shapiro Si_Pten_Ptenp1:  ShapiroResult(statistic=0.9694178700447083, pvalue=0.8715044260025024)




p value greater than the chosen alpha level --> null hypothesis cannot be rejected

In [119]:
print(df4.iloc[0,1:6])

Repeat 1     1.19
Repeat 2    1.092
Repeat 3    0.636
Repeat 4    1.105
Repeat 5    1.434
Name: 0, dtype: object


In [122]:
levene_rows = []
normalisers = {}
for row_name in [0,1,2,3,4]:
    qqplot(df4[row_name])
    levene_rows.append(data[row_name])
    normalisers[row_name] = StandardScaler().fit([[datapoint] for datapoint in data[row_name]])
print(f"Levene: {levene(*levene_rows[:-1])}")

KeyError: 0

○Two-way ANOVA of normalized PTEN levels in siLuc, siPTEN, siPTENP1, or siPTEN/PTENP1 siRNA transfected cells with the following planned comparisons using the Bonferroni correction:<br />
1. siLuc compared to siPTEN.<br />
2. siLuc compared to siPTENP1.<br />
3. siLuc compared to siPTEN/PTENP1.<br />
4. siPTEN/PTENP1 compared to siPTEN.<br />
5. siPTEN/PTENP1 compared to siPTENP1.<br />

### Protocol 5: Quantitative PCR following PTEN 3′UTR transfection

Confirmatory analysis plan<br />
This replication attempt will perform the following statistical analysis listed below.<br />
■Statistical Analysis:<br />
○Note: at the time of analysis, we will perform the Shapiro–Wilk test and generate a quantile–quantileplot to assess the normality of the data. We will also perform Levene’s test to assess homoscedasticity.If the data appear skewed we will perform the appropriate transformation in order to proceed with theproposed statistical analysis. If this is not possible we will perform the equivalent non-parametric test. <br />

○Unpaired two-tailedt-test ofPTENP1mRNA levels of pCMV transfected cells compared topCMV/PTEN3′UTR transfected cells.

### Protocol 6: Cell growth assay following PTEN 3′UTR transfection

Confirmatory analysis plan<br />
This replication attempt will perform the following statistical analysis listed below.<br />
■Statistical Analysis:<br />
○Note: at the time of analysis, we will perform the Shapiro–Wilk test and generate a quantile–quantile plot to assess the normality of the data. We will also perform Levene’s test to assesshomoscedasticity. If the data appear skewed we will perform the appropriate transformation inorder to proceed with the proposed statistical analysis. If this is not possible we will perform theequivalent non-parametric test.<br />

○Unpaired two-tailed t-test of Day 5 absorbance of pCMV transfected cells compared to pCMV/PTEN3′UTR transfected cells

In [62]:
df=pd.read_csv("./study_34/data/Study_34_Protocol_2.csv")
print(df)
df = df[df.mouse_id!='ctrl'] #exclude control
print(df)

                             Contents mouse_id  treatment strain  cohort   
0                 Nras D12 Chrt1 M#57       57  G12V/D38A  BL6WT     1.0  \
1                 Nras D12 Chrt1 M#58       58  G12V/D38A  BL6WT     1.0   
2                 Nras D12 Chrt1 M#60       60  G12V/D38A  BL6WT     1.0   
3                 Nras D12 Chrt1 M#62        2       G12V  BL6WT     1.0   
4                 Nras D12 Chrt1 M#63       63       G12V  BL6WT     1.0   
5                 Nras D12 Chrt1 M#72       72  G12V/D38A    CD4     1.0   
6                 Nras D12 Chrt1 M#73       73  G12V/D38A    CD4     1.0   
7                 Nras D12 Chrt1 M#74       74       G12V    CD4     1.0   
8                Nras D12 Chrt2 M#100      100       G12V    CD4     2.0   
9                Nras D12 Chrt2 M#101      101       G12V    CD4     2.0   
10               Nras D12 Chrt2 M#102      102  G12V/D38A    CD4     2.0   
11               Nras D12 Chrt2 M#103      103  G12V/D38A    CD4     2.0   
12  Nras D12

In [63]:
y, X = dmatrices('percent_positive ~ strain + treatment', data=df, 
                 return_type='dataframe')

In [64]:
print(y)

    percent_positive
0               0.01
1               0.44
2               0.01
3               0.09
4               0.08
5               0.19
6               0.19
7               0.48
8               0.01
9               0.05
10              0.01
11              0.15
14              0.53
15              0.12
16              0.20
17              0.01
18              0.01
19              0.03
20              0.00
21              0.01


In [65]:
print(X)

    Intercept  strain[T.CD4]  treatment[T.G12V/D38A]
0         1.0            0.0                     1.0
1         1.0            0.0                     1.0
2         1.0            0.0                     1.0
3         1.0            0.0                     0.0
4         1.0            0.0                     0.0
5         1.0            1.0                     1.0
6         1.0            1.0                     1.0
7         1.0            1.0                     0.0
8         1.0            1.0                     0.0
9         1.0            1.0                     0.0
10        1.0            1.0                     1.0
11        1.0            1.0                     1.0
14        1.0            0.0                     1.0
15        1.0            0.0                     1.0
16        1.0            0.0                     0.0
17        1.0            0.0                     0.0
18        1.0            0.0                     0.0
19        1.0            1.0                  

In [66]:
# Performing two-way ANOVA
model = ols(
    'percent_positive ~ C(treatment) + C(strain)', data=df).fit()
sm.stats.anova_lm(model, typ=2)
print(model.summary())
anova_table = sm.stats.anova_lm(model, typ=2)

                            OLS Regression Results                            
Dep. Variable:       percent_positive   R-squared:                       0.065
Model:                            OLS   Adj. R-squared:                 -0.045
Method:                 Least Squares   F-statistic:                    0.5944
Date:                Fri, 05 May 2023   Prob (F-statistic):              0.563
Time:                        14:41:22   Log-Likelihood:                 8.6150
No. Observations:                  20   AIC:                            -11.23
Df Residuals:                      17   BIC:                            -8.243
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

In [67]:
sm.stats.anova_lm(model, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
C(treatment),0.02738,1.0,0.940741,0.345686
C(strain),0.00722,1.0,0.24807,0.624819
Residual,0.49478,17.0,,


In [68]:
# Performing two-way ANOVA#
#model = ols(
#    'percent_positive ~ C(treatment) + C(strain) + C(treatment):C(strain)', data=data).fit()
#sm.stats.anova_lm(model, typ=2)

Planned comparisons with the Bonferroni correction.
a. The percent of Nras-positive cells in wild-type mice injected with NrasG12V compared to the
percent of Nras-positive cells in wild-type mice injected with NrasG12V/D38A.

In [69]:
#get df only WT
df_wt = df[df.strain=='BL6WT'] #exclude control
print(df_wt)

               Contents mouse_id  treatment strain  cohort  percent_positive   
0   Nras D12 Chrt1 M#57       57  G12V/D38A  BL6WT     1.0              0.01  \
1   Nras D12 Chrt1 M#58       58  G12V/D38A  BL6WT     1.0              0.44   
2   Nras D12 Chrt1 M#60       60  G12V/D38A  BL6WT     1.0              0.01   
3   Nras D12 Chrt1 M#62        2       G12V  BL6WT     1.0              0.09   
4   Nras D12 Chrt1 M#63       63       G12V  BL6WT     1.0              0.08   
14  Nras D12 Chrt2 M#65       65  G12V/D38A  BL6WT     2.0              0.53   
15  Nras D12 Chrt2 M#66       66  G12V/D38A  BL6WT     2.0              0.12   
16  Nras D12 Chrt2 M#67       67       G12V  BL6WT     2.0              0.20   
17  Nras D12 Chrt2 M#68       68       G12V  BL6WT     2.0              0.01   
18  Nras D12 Chrt2 M#70       70       G12V  BL6WT     2.0              0.01   

    positive_count  negative_count      Area  
0                1           10588  2.482741  
1               57       

In [70]:
#Performing one-way ANOVA
model_a = ols(
    'percent_positive ~ C(treatment)', data=df).fit()
sm.stats.anova_lm(model_a, typ=1)
print(model_a.summary())
anova_table_a = sm.stats.anova_lm(model_a, typ=1)

                            OLS Regression Results                            
Dep. Variable:       percent_positive   R-squared:                       0.052
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.9818
Date:                Fri, 05 May 2023   Prob (F-statistic):              0.335
Time:                        14:41:23   Log-Likelihood:                 8.4701
No. Observations:                  20   AIC:                            -12.94
Df Residuals:                      18   BIC:                            -10.95
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

In [71]:
sm.stats.anova_lm(model_a, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
C(treatment),0.02738,1.0,0.981753,0.3349
Residual,0.502,18.0,,


In [72]:
#With Bonferroni correction with standard alpha from statsmodels
statsmodels.stats.multitest.multipletests([0.3349], alpha=0.05, method='bonferroni', is_sorted=False, returnsorted=False)

(array([False]), array([0.3349]), 0.050000000000000044, 0.05)

b. The percent of Nras-positive cells in wild-type mice injected with NrasG12V compared to the
percent of Nras-positive cells in CD4−/− mice injected with NrasG12V.

In [73]:
#get df treatment NrasG12V
df_g12v = df[df.treatment=='G12V'] #exclude control
print(df_g12v)

                Contents mouse_id treatment strain  cohort  percent_positive   
3    Nras D12 Chrt1 M#62        2      G12V  BL6WT     1.0              0.09  \
4    Nras D12 Chrt1 M#63       63      G12V  BL6WT     1.0              0.08   
7    Nras D12 Chrt1 M#74       74      G12V    CD4     1.0              0.48   
8   Nras D12 Chrt2 M#100      100      G12V    CD4     2.0              0.01   
9   Nras D12 Chrt2 M#101      101      G12V    CD4     2.0              0.05   
16   Nras D12 Chrt2 M#67       67      G12V  BL6WT     2.0              0.20   
17   Nras D12 Chrt2 M#68       68      G12V  BL6WT     2.0              0.01   
18   Nras D12 Chrt2 M#70       70      G12V  BL6WT     2.0              0.01   
20   Nras D12 Chrt2 M#97       97      G12V    CD4     2.0              0.00   
21   Nras D12 Chrt2 M#98       98      G12V    CD4     2.0              0.01   

    positive_count  negative_count      Area  
3               11           12155  2.333911  
4               10       

In [74]:
#Performing one-way ANOVA
model_b = ols(
    'percent_positive ~ C(strain)', data=df).fit()
sm.stats.anova_lm(model_b, typ=1)
print(model_b.summary())
anova_table_b = sm.stats.anova_lm(model_b, typ=1)

                            OLS Regression Results                            
Dep. Variable:       percent_positive   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                 -0.041
Method:                 Least Squares   F-statistic:                    0.2489
Date:                Fri, 05 May 2023   Prob (F-statistic):              0.624
Time:                        14:41:26   Log-Likelihood:                 8.0764
No. Observations:                  20   AIC:                            -12.15
Df Residuals:                      18   BIC:                            -10.16
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.1500      0.054  

In [75]:
sm.stats.anova_lm(model_b, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
C(strain),0.00722,1.0,0.248889,0.6239
Residual,0.52216,18.0,,


In [76]:
#With Bonferroni correction with standard alpha from statsmodels
statsmodels.stats.multitest.multipletests([0.6239], alpha=0.05, method='bonferroni', is_sorted=False, returnsorted=False)

(array([False]), array([0.6239]), 0.050000000000000044, 0.05)

In [77]:
f = open('./environment.yml', 'r')
content = f.read()
print(content)

name: reproducibility_hackathon
channels:
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_gnu
  - bzip2=1.0.8=h7f98852_4
  - ca-certificates=2022.12.7=ha878542_0
  - ld_impl_linux-64=2.40=h41732ed_0
  - libexpat=2.5.0=hcb278e6_1
  - libffi=3.4.2=h7f98852_5
  - libgcc-ng=12.2.0=h65d4601_19
  - libgomp=12.2.0=h65d4601_19
  - libnsl=2.0.0=h7f98852_0
  - libsqlite=3.40.0=h753d276_1
  - libuuid=2.38.1=h0b41bf4_0
  - libzlib=1.2.13=h166bdaf_4
  - ncurses=6.3=h27087fc_1
  - openssl=3.1.0=hd590300_3
  - pip=23.1.2=pyhd8ed1ab_0
  - python=3.11.3=h2755cc3_0_cpython
  - readline=8.2=h8228510_1
  - setuptools=67.7.2=pyhd8ed1ab_0
  - tk=8.6.12=h27826a3_0
  - wheel=0.40.0=pyhd8ed1ab_0
  - xz=5.2.6=h166bdaf_0
  - pip:
      - anyio==3.6.2
      - argon2-cffi==21.3.0
      - argon2-cffi-bindings==21.2.0
      - arrow==1.2.3
      - asttokens==2.2.1
      - attrs==23.1.0
      - backcall==0.2.0
      - beautifulsoup4==4.12.2
      - bleach==6.0.0
   