In [1]:
import pandas as pd
import numpy as np

In [2]:
data_abund = pd.read_csv("9606_abund.txt", sep="\t")

In [3]:
data_dom = pd.read_csv("9606_gn_dom.txt", sep="\t")

In [4]:
data_abund = data_abund.rename(columns={"#Taxid": "Taxid"})
data_dom = data_dom.rename(columns={"#Gn": "Gn"})

In [5]:
data_abund.head(10)

Unnamed: 0,Taxid,Ensembl_protein,Gn,Mean-copy-number
0,9606,ENSP00000263100,A1BG,885.188
1,9606,ENSP00000282641,A1CF,19.016
2,9606,ENSP00000282641,A1CF,19.016
3,9606,ENSP00000282641,A1CF,19.016
4,9606,ENSP00000323929,A2M,1114.564
5,9606,ENSP00000323929,A2M,1114.564
6,9606,ENSP00000323929,A2M,1114.564
7,9606,ENSP00000323929,A2M,1114.564
8,9606,ENSP00000299698,A2ML1,90.762
9,9606,ENSP00000299698,A2ML1,90.762


In [6]:
data_dom.head(10)

Unnamed: 0,Gn,Domain,Start,End,Eval
0,A1BG,Ig,127,201,0.38
1,A1BG,Ig,217,300,3e-15
2,A1BG,Ig,31,110,8.2e-06
3,A1BG,Ig,403,490,0.0019
4,A1BG,SpaA,327,352,44.0
5,A1CF,DND1_DSRM,447,523,2.3e-24
6,A1CF,RRM,138,199,4.4e-07
7,A1CF,RRM,233,296,6.7e-11
8,A1CF,RRM,58,124,2.4e-16
9,A2M,A2M,738,828,4.5e-31


## A1

### How many protein/copy-number pairs are in the file? (Single numerical value)

In [7]:
data_abund.shape[0]

53642

### How many unique copy number values are there in the file?

In [8]:
data_abund[["Mean-copy-number", "Gn"]].drop_duplicates().shape[0]

19567

In [76]:
find_isoforms = data_abund.drop_duplicates().groupby(by=["Gn", "Mean-copy-number"])['Ensembl_protein'].nunique().reset_index(name="uniq_values")

In [77]:
# what proteins have more than one Ensembl number
find_isoforms[find_isoforms['uniq_values'] > 1]['Gn'].to_list()

['CSH1', 'DAZ2', 'NPIPA3', 'NPIPB4', 'ZNF84']

In [78]:
# it is likely these proteins have isoforms

### How many pairs of protein and copy number values are in the file? (Single numerical value)

In [9]:
data_abund.drop_duplicates().shape[0]

19572

## A2. Compute the mean and standard deviation of copy numbers for all proteins (considering unique pairs only) first as a single number for all proteins (two numerical values) and then for each protein separately (Table in tsv/csv).

In [10]:
#drop duplicates
unique_copy_num = data_abund.drop_duplicates().copy()
unique_copy_num.shape

(19572, 4)

In [11]:
unique_copy_num.isna().sum()

Taxid               0
Ensembl_protein     0
Gn                  0
Mean-copy-number    0
dtype: int64

In [12]:
unique_copy_num["Mean-copy-number"].describe()

count     19572
unique    16241
top       0.000
freq         39
Name: Mean-copy-number, dtype: object

In [14]:
#convert format to numerical
unique_copy_num["Mean-copy-number"] = pd.to_numeric(unique_copy_num["Mean-copy-number"], errors="coerce")
mean_value = unique_copy_num["Mean-copy-number"].mean()

In [15]:
std_value = unique_copy_num["Mean-copy-number"].std()

In [16]:
print(f"Mean copy number: {mean_value:.3f}")
print(f"Standard Deviation copy number: {std_value:.3f}")

Mean copy number: 79.826
Standard Deviation copy number: 362.170


In [17]:
mean_std_for_each_protein = unique_copy_num.groupby("Gn")["Mean-copy-number"].agg(["mean", "std"]).reset_index()

In [18]:
mean_std_for_each_protein = mean_std_for_each_protein.fillna(0)

In [19]:
mean_std_for_each_protein.count()

Gn      18992
mean    18992
std     18992
dtype: int64

In [20]:
mean_std_for_each_protein.isna().sum()

Gn      0
mean    0
std     0
dtype: int64

In [21]:
mean_std_for_each_protein.sort_values("mean", ascending=False).head(10)

Unnamed: 0,Gn,mean,std
531,ALB,22306.386,0.0
6853,HBA2,14178.655,0.0
6854,HBB,13538.518,0.0
8428,LALBA,12454.998,0.0
16847,TMSB4X,11622.015,0.0
7434,IGLC1,7235.736,0.0
7435,IGLJ1,7235.736,0.0
7437,IGLL5,7235.736,0.0
3682,CSN1S1,7160.29,0.0
6037,GAPDH,6999.011,0.0


In [22]:
mean_std_for_each_protein.sort_values("mean", ascending=False).to_csv("A2task_mean_std_proteins_abund.csv", index=False)

### A3. Calculate the percentile rank (in terms of average copy number ranks) for each protein. (i.e. for protein X, where is it in the ranks from top (0%) to bottom (100%) in terms of abundance) (Table in csv/tsv). Please also give the top ten proteins (highest abundance) as a list with the associated numerical values.

In [25]:
mean_std_for_each_protein["percentile_rank"] = mean_std_for_each_protein["mean"].rank(pct=True, ascending=True) * 100

In [26]:
mean_std_for_each_protein.sort_values("percentile_rank", ascending=False).to_csv("A3task_mean_std_rank.csv")

In [27]:
top_10 = mean_std_for_each_protein.sort_values("percentile_rank", ascending=False).head(10)

In [28]:
top_10

Unnamed: 0,Gn,mean,std,percentile_rank
531,ALB,22306.386,0.0,100.0
6853,HBA2,14178.655,0.0,99.994735
6854,HBB,13538.518,0.0,99.989469
8428,LALBA,12454.998,0.0,99.984204
16847,TMSB4X,11622.015,0.0,99.978939
7434,IGLC1,7235.736,0.0,99.968408
7435,IGLJ1,7235.736,0.0,99.968408
7437,IGLL5,7235.736,0.0,99.968408
3682,CSN1S1,7160.29,0.0,99.957877
6037,GAPDH,6999.011,0.0,99.952612


In [26]:
top_10[["Gn", "mean"]].values.tolist()

[['ALB', 22306.386],
 ['HBA2', 14178.655],
 ['HBB', 13538.518],
 ['LALBA', 12454.998],
 ['TMSB4X', 11622.015],
 ['IGLC1', 7235.736],
 ['IGLJ1', 7235.736],
 ['IGLL5', 7235.736],
 ['CSN1S1', 7160.29],
 ['GAPDH', 6999.011]]

# Analyse protein domains

### B1. What is the domain with the highest average abundance (i.e. across all copies of the domain in all proteins) and what is the value of the average abundance, and how many times was the domain seen? (single string value and two numerical values)

In [29]:
data_dom.head(5)

Unnamed: 0,Gn,Domain,Start,End,Eval
0,A1BG,Ig,127,201,0.38
1,A1BG,Ig,217,300,3e-15
2,A1BG,Ig,31,110,8.2e-06
3,A1BG,Ig,403,490,0.0019
4,A1BG,SpaA,327,352,44.0


In [30]:
data_dom.isna().sum()

Gn        0
Domain    0
Start     0
End       0
Eval      0
dtype: int64

In [31]:
data_dom.shape

(65884, 5)

In [32]:
data_dom.drop_duplicates().shape

(65877, 5)

In [33]:
data_dom = data_dom.drop_duplicates()

In [34]:
proteins_domains = data_dom.merge(mean_std_for_each_protein[["Gn", "mean", "std"]], on="Gn", how="left")

In [35]:
# domain average abundance (i.e. across all copies of the domain in all proteins)
count_domain = proteins_domains.groupby(["Gn","Domain","mean","std"])["Domain"].size().reset_index(name="count_domain")

In [36]:
count_domain

Unnamed: 0,Gn,Domain,mean,std,count_domain
0,A1BG,Ig,885.188,0.0,4
1,A1BG,SpaA,885.188,0.0,1
2,A1CF,DND1_DSRM,19.016,0.0,1
3,A1CF,RRM,19.016,0.0,3
4,A2M,A2M,1114.564,0.0,1
...,...,...,...,...,...
32509,ZZEF1,CUB,17.443,0.0,1
32510,ZZEF1,EF-hand,17.443,0.0,2
32511,ZZEF1,ZZ,17.443,0.0,2
32512,ZZZ3,Myb_DNA-binding,7.258,0.0,1


In [37]:
most_prevalent_domain = count_domain.groupby("Domain").agg({
    "mean": "mean",
    "count_domain": "sum"
}).reset_index().sort_values("mean", ascending=False).head(1)

In [38]:
domain_name = most_prevalent_domain["Domain"].iloc[0]
avg_abundance = most_prevalent_domain["mean"].iloc[0]
times_seen = most_prevalent_domain["count_domain"].iloc[0]

print(f"domain with the highest average abundance"
      f"(i.e. across all copies of the domain in all proteins): {domain_name}, {round(avg_abundance, 2)}, {times_seen}")

domain with the highest average abundance(i.e. across all copies of the domain in all proteins): Serum_albumin, 5947.1, 11


In [39]:
# top 10 abundant domains
count_domain.groupby("Domain").agg({
    "mean": "mean",
    "count_domain": "sum"
}).reset_index().sort_values("mean", ascending=False).head(10)

Unnamed: 0,Domain,mean,count_domain
5284,Serum_albumin,5947.10225,11
942,Casein_kappa,4953.42,1
373,ApoC-I,4531.031,1
3018,Keratin_2_tail,3745.121,1
371,ApoA-II,3707.405,3
2592,Gp_dh_N,3523.1575,2
2591,Gp_dh_C,3523.1575,2
5823,Transthyretin,3512.367,1
941,Casein,2989.446,1
5765,Thymosin,2785.3874,5


### B2. Compute the mean and standard deviation of domain average abundance for each protein domain (i.e. by summing abundance values of all versions of these domains) by combining these two files also, compute the percentile rank values as above (One table)

### To calculate Std we need to use propagation of error formula:

$
\sigma = \sqrt{\sigma_1^2 + \sigma_2^2 + \dots + \sigma_n^2}
$

In [40]:
std_count = (
    count_domain.assign(std_squared=lambda d: d["std"]**2)
    .groupby("Domain")["std_squared"]
    .sum()
    .apply(np.sqrt)
    .reset_index()
    .rename(columns={"std_squared": "domain_abundance_std"})
)

In [41]:
domain_summary = count_domain.groupby("Domain").agg({
    "mean": "mean",
    "count_domain": "sum"
}).reset_index().sort_values("mean", ascending=False)

In [42]:
avg_abund_each_domain = domain_summary.merge(std_count, on="Domain")

### mean and standard deviation of domain average abundance for each protein domain:

In [43]:
avg_abund_each_domain

Unnamed: 0,Domain,mean,count_domain,domain_abundance_std
0,Serum_albumin,5947.10225,11,0.0
1,Casein_kappa,4953.42000,1,0.0
2,ApoC-I,4531.03100,1,0.0
3,Keratin_2_tail,3745.12100,1,0.0
4,ApoA-II,3707.40500,3,0.0
...,...,...,...,...
6414,DUF4562,0.00100,1,0.0
6415,DUF5540,0.00000,1,0.0
6416,DUF4796,0.00000,1,0.0
6417,DUF4521,0.00000,1,0.0


In [50]:
# write to .csv file:
# avg_abund_each_domain.to_csv("avg_abundance_protein_domains.csv", index=False)

### compute the percentile rank values as above (One table)

In [46]:
avg_abund_each_domain["percentile_rank"] = avg_abund_each_domain["mean"].rank(pct=True, ascending=True) * 100

In [47]:
avg_abund_each_domain_ranks = avg_abund_each_domain.sort_values("percentile_rank", ascending=False)

In [48]:
avg_abund_each_domain_ranks = avg_abund_each_domain_ranks[["Domain", "mean", "domain_abundance_std",
    "count_domain", "percentile_rank"
                                                          ]]
avg_abund_each_domain_ranks.head(10)

Unnamed: 0,Domain,mean,domain_abundance_std,count_domain,percentile_rank
0,Serum_albumin,5947.10225,0.0,11,100.0
1,Casein_kappa,4953.42,0.0,1,99.984421
2,ApoC-I,4531.031,0.0,1,99.968842
3,Keratin_2_tail,3745.121,0.0,1,99.953264
4,ApoA-II,3707.405,0.0,3,99.937685
5,Gp_dh_N,3523.1575,0.0,2,99.914317
6,Gp_dh_C,3523.1575,0.0,2,99.914317
7,Transthyretin,3512.367,0.0,1,99.890949
8,Casein,2989.446,0.0,1,99.87537
9,Thymosin,2785.3874,25.710403,5,99.859791


In [49]:
# write to .csv file:
avg_abund_each_domain_ranks.to_csv("B2task_avg_abund_protein_domains_ranks.csv", index=False)