# 19302 human sequence proteome of Ensembl 92 canonical transcripts 

This notebook contains the code to generate the 19302-sequence human proteome of Ensembl 92 canonical transcripts (downloaded using Biomart). The canonical transcripts have been further filtered to be unique, i.e. transcripts located on scaffolds are excluded and only those on standard chromosomes are kept.

With this approach, we aim to minimize the presence of replicated IDs, specifically of symbols mapped to more than one ENSG-ENST-ENSP trio. If any replicate remains, that protein will not be analyzed in the functional validation. 

This is the proteome used for degrons functional analysis.

## Import libraries

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to the
# helper scripts imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import gzip
import os
import sys

import pandas as pd

## my modules ##
sys.path.append("../scripts/Utils/")    # modules folder
from fasta_utils import readFasta_header_gzip

## Define variables and paths

In [10]:
# --- folders -----------------------------------------------------------
# Project folders, relative to the notebook location.
data_path = "../data/"
results_path = "../results/"
logs_path = "../logs/"
tests_path = "../tests/"
# Absolute data folder on the cluster.
cluster_data_path = "/workspace/projects/degrons/data/"
# Sub-folders, concatenated after data_path when building full paths.
external_data_path = "external/"
others_data_path = "others/"

# --- file names --------------------------------------------------------
# Canonical-transcript table and the ENST-only list derived from it.
biomart1_file = "ensembl_canonical_transcripts_uniq.tsv"
cantranscripts_uniq_enst = "ensembl_canonical_transcripts_uniq_ENSTs.txt"
# Proteome downloaded from Biomart (path relative to the external folder).
biomart2_file = "biomart/biomart92_proteome_cantranscripts_uniq.txt.gz"
# Preprocessed proteome: plain FASTA name and its gzip-compressed counterpart.
proteome_file = "biomart92_proteome_cantranscripts_uniq_prepro.fasta"
proteome_file_gz = "biomart92_proteome_cantranscripts_uniq_prepro.fasta.gz"

In [1]:
# Paths used in section 1: the canonical-transcript table that is read and
# the ENST-only list that is written. This cell needs `os`, which must be
# imported in the imports cell at the top of the notebook.
base = "../"

data = "data/"

# Both files live in the same folder, so the folder component is shared.
external_dir = "external/"

uniq_can_transcripts_path = os.path.join(base, data, external_dir, "ensembl_canonical_transcripts_uniq.tsv")
uniq_can_transcripts_enst_path = os.path.join(base, data, external_dir, "ensembl_canonical_transcripts_uniq_ENSTs.txt")

## 1. Generate a list of canonical ENSTs belonging to standard chromosomes (Biomart input)

*ENST = Ensembl Transcript ID

In [4]:
# Load the tab-separated table of unique canonical transcripts.
canonical_transcripts_uniq = pd.read_csv(uniq_can_transcripts_path, sep = "\t")

In [5]:
# Preview the loaded table (columns: ensg, enst, symbol, chr).
canonical_transcripts_uniq

Unnamed: 0,ensg,enst,symbol,chr
0,ENSG00000013503,ENST00000228347,POLR3B,12
1,ENSG00000136044,ENST00000551662,APPL2,12
2,ENSG00000136051,ENST00000620430,WASHC4,12
3,ENSG00000151967,ENST00000445224,SCHIP1,3
4,ENSG00000166535,ENST00000299698,A2ML1,12
...,...,...,...,...
19297,ENSG00000125780,ENST00000381458,TGM3,20
19298,ENSG00000268104,ENST00000598581,SLC6A14,X
19299,ENSG00000204227,ENST00000374656,RING1,6
19300,ENSG00000040275,ENST00000265295,SPDL1,5


In [8]:
# List the distinct chromosome labels, to confirm that only standard
# chromosomes (1-22, X, Y) are present.
canonical_transcripts_uniq.chr.unique()

array(['12', '3', '1', '4', '5', '8', '15', '14', '22', '17', '21', '6',
       '13', '2', '11', '7', '18', '19', '20', '16', '9', '10', 'X', 'Y'],
      dtype=object)

In [11]:
# Write only the ENST column, without index or header, to get the plain ID
# list that is uploaded to Biomart in section 2.
canonical_transcripts_uniq["enst"].to_csv(uniq_can_transcripts_enst_path, index = False, header = False)

## 2. Processed Biomart 92 proteome containing standard canonical transcripts only

- List of canonical transcripts, ENSTs only: `../data/external/ensembl_canonical_transcripts_uniq_ENSTs.txt`.
- Biomart specifications:
    - Database: Ensembl Genes 92
    - Dataset: Human genes (GRCh38.p12)
    - Filters: GENE -> Input external references ID list -> Transcript stable ID -> upload list of canonical transcripts belonging to standard chromosomes (`../data/external/ensembl_canonical_transcripts_uniq_ENSTs.txt`)
    - Attributes (in Sequences): Gene stable ID, Transcript stable ID, Protein stable ID, Gene name, Peptide. 
- Resulting file: `../data/external/biomart/biomart92_proteome_cantranscripts_uniq.txt.gz`.

In [4]:
# Read the gzipped Biomart download into `proteome`, which is indexed by the
# full FASTA header in the cells below. According to the note that follows,
# the reader also strips the trailing asterisk from every sequence.
proteome = readFasta_header_gzip(data_path+external_data_path+biomart2_file)

# Expected number of read sequences: 19302 (Count functionality in Biomart)

Number of retrieved sequences: 19302



Note that the peptides downloaded from Biomart contain an asterisk at the end of the sequence, marking the position translated from a STOP codon (I am almost sure Biomart performs an *in silico* translation of the transcripts). These asterisks are of no interest for downstream analysis, so I remove them with the function `readFasta_header_gzip`.

In [5]:
# Spot-check one entry: the displayed sequence should not end with an
# asterisk if the STOP-codon markers have been removed.

proteome["ENSG00000006611|ENST00000005226|ENSP00000005226|USH1C|Q9Y6N9"]

'MDRKVAREFRHKVDFLIENDAEKDYLYDVLRMYHQTMDVAVLVGDLKLVINEPSRLPLFDAIRPLIPLKHQVEYDQLTPRRSRKLKEVRLDRLHPEGLGLSVRGGLEFGCGLFISHLIKGGQADSVGLQVGDEIVRINGYSISSCTHEEVINLIRTKKTVSIKVRHIGLIPVKSSPDEPLTWQYVDQFVSESGGVRGSLGSPGNRENKEKKVFISLVGSRGLGCSISSGPIQKPGIFISHVKPGSLSAEVGLEIGDQIVEVNGVDFSNLDHKEAVNVLKSSRSLTISIVAAAGRELFMTDRERLAEARQRELQRQELLMQKRLAMESNKILQEQQEMERQRRKEIAQKAAEENERYRKEMEQIVEEEEKFKKQWEEDWGSKEQLLLPKTITAEVHPVPLRKPKSFGWFYRYDGKFPTIRKKGKDKKKAKYGSLQDLRKNKKELEFEQKLYKEKEEMLEKEKQLKINRLAQEVSETEREDLEESEKIQYWVERLCQTRLEQISSADNEISEMTTGPPPPPPSVSPLAPPLRRFAGGLHLHTTDLDDIPLDMFYYPPKTPSALPVMPHPPPSNPPHKVPAPPVLPLSGHVSASSSPWVQRTPPPIPIPPPPSVPTQDLTPTRPLPSALEEALSNHPFRTGDTGNPVEDWEAKNHSGKPTNSPVPEQSFPPTPKTFCPSPQPPRGPGVSTISKPVMVHQEPNFIYRPAVKSEVLPQEMLKRMVVYQTAFRQDFRKYEEGFDPYSMFTPEQIMGKDVRLLRIKKEGSLDLALEGGVDSPIGKVVVSAVYERGAAERHGGIVKGDEIMAINGKIVTDYTLAEAEAALQKAWNQGGDWIDLVVAVCPPKEYDDELASLPSSVAESPQPVRKLLEDRAAVHRHGFLLQLEPTDLLLKSKRGNQIHR'

In [9]:
# Save the preprocessed proteome as a gzip-compressed FASTA, which is the
# file the next cell reads back. Each record is a ">"-prefixed header line
# followed by its sequence line.
# Mode "xt" (exclusive creation, text) only writes when the file is missing,
# so re-running the notebook never overwrites an existing proteome.
try:
    with gzip.open(data_path+others_data_path+proteome_file_gz, "xt") as f:

        for header in proteome.keys():

            f.write(">"+header+"\n")
            f.write(proteome[header]+"\n")

except FileExistsError:
    print("Preprocessed proteome already exists; skipping the write step")
        
        

In [12]:
# Read the saved (gzipped) preprocessed proteome back to check that it was
# written properly.
# NOTE: this rebinds `proteome`; from here on it holds the saved file's
# content instead of the Biomart download loaded earlier.

proteome = readFasta_header_gzip(data_path+others_data_path+proteome_file_gz)

Number of retrieved sequences: 19302



In [13]:
# Same entry that was inspected before saving; the sequence should be unchanged.
proteome["ENSG00000006611|ENST00000005226|ENSP00000005226|USH1C|Q9Y6N9"] # correct

'MDRKVAREFRHKVDFLIENDAEKDYLYDVLRMYHQTMDVAVLVGDLKLVINEPSRLPLFDAIRPLIPLKHQVEYDQLTPRRSRKLKEVRLDRLHPEGLGLSVRGGLEFGCGLFISHLIKGGQADSVGLQVGDEIVRINGYSISSCTHEEVINLIRTKKTVSIKVRHIGLIPVKSSPDEPLTWQYVDQFVSESGGVRGSLGSPGNRENKEKKVFISLVGSRGLGCSISSGPIQKPGIFISHVKPGSLSAEVGLEIGDQIVEVNGVDFSNLDHKEAVNVLKSSRSLTISIVAAAGRELFMTDRERLAEARQRELQRQELLMQKRLAMESNKILQEQQEMERQRRKEIAQKAAEENERYRKEMEQIVEEEEKFKKQWEEDWGSKEQLLLPKTITAEVHPVPLRKPKSFGWFYRYDGKFPTIRKKGKDKKKAKYGSLQDLRKNKKELEFEQKLYKEKEEMLEKEKQLKINRLAQEVSETEREDLEESEKIQYWVERLCQTRLEQISSADNEISEMTTGPPPPPPSVSPLAPPLRRFAGGLHLHTTDLDDIPLDMFYYPPKTPSALPVMPHPPPSNPPHKVPAPPVLPLSGHVSASSSPWVQRTPPPIPIPPPPSVPTQDLTPTRPLPSALEEALSNHPFRTGDTGNPVEDWEAKNHSGKPTNSPVPEQSFPPTPKTFCPSPQPPRGPGVSTISKPVMVHQEPNFIYRPAVKSEVLPQEMLKRMVVYQTAFRQDFRKYEEGFDPYSMFTPEQIMGKDVRLLRIKKEGSLDLALEGGVDSPIGKVVVSAVYERGAAERHGGIVKGDEIMAINGKIVTDYTLAEAEAALQKAWNQGGDWIDLVVAVCPPKEYDDELASLPSSVAESPQPVRKLLEDRAAVHRHGFLLQLEPTDLLLKSKRGNQIHR'

## 3. Check replicates in the headers IDs

To do this, we generate a dataframe with the information contained in the protein headers:

In [14]:
# Every header is a "|"-separated string of identifiers; split each one into
# its individual fields, producing one list of IDs per protein.
list_ids = [header.split("|") for header in proteome.keys()]

In [15]:
# One column per header field, named in the order the fields appear in the
# header (gene ID, transcript ID, protein ID, gene symbol, UniProt accession).
colnames = ["ENSG", "ENST", "ENSP", "SYMBOL", "UNIPROT"]
df_ids = pd.DataFrame(list_ids, columns = colnames)
df_ids

Unnamed: 0,ENSG,ENST,ENSP,SYMBOL,UNIPROT
0,ENSG00000004059,ENST00000000233,ENSP00000000233,ARF5,P84085
1,ENSG00000006611,ENST00000005226,ENSP00000005226,USH1C,Q9Y6N9
2,ENSG00000002587,ENST00000002596,ENSP00000002596,HS3ST1,O14792
3,ENSG00000001617,ENST00000002829,ENSP00000002829,SEMA3F,Q13275
4,ENSG00000004799,ENST00000005178,ENSP00000005178,PDK4,Q16654
...,...,...,...,...,...
19297,ENSG00000070182,ENST00000644917,ENSP00000495909,SPTB,
19298,ENSG00000185811,ENST00000644005,ENSP00000496453,IKZF1,
19299,ENSG00000145808,ENST00000638972,ENSP00000491408,ADAMTS19,Q8TE59
19300,ENSG00000102974,ENST00000646076,ENSP00000494538,CTCF,


In [16]:
def check_reps(df, colname, nickname):
    """
    Report which values of a column occur more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column to inspect.
    colname : str
        Name of the column whose values are counted.
    nickname : str
        Label used for the column in the printed summary line.

    Returns
    -------
    pandas.Series
        Occurrence counts, sorted in ascending order and indexed by value,
        of the values found two or more times. Empty if nothing is repeated.
    """

    occurrences = df[colname].value_counts(ascending=True)
    repeated = occurrences.loc[occurrences >= 2]
    n_repeated = len(repeated)
    print(f'Found {n_repeated} {nickname} replicates')

    return repeated

In [17]:
# Run the replicate check on every ID column of the header table, printing
# each result and separating consecutive reports with a blank line.
id_columns = ["ENSG", "ENST", "ENSP", "SYMBOL", "UNIPROT"]
for position, column in enumerate(id_columns):
    if position > 0:
        print()
    print(check_reps(df_ids, column, column))

Found 0 ENSG replicates
Series([], Name: ENSG, dtype: int64)

Found 0 ENST replicates
Series([], Name: ENST, dtype: int64)

Found 0 ENSP replicates
Series([], Name: ENSP, dtype: int64)

Found 8 SYMBOL replicates
TBCE        2
ATXN7       2
ABCF2       2
POLR2J3     2
FAM212B     2
PRSS50      2
TXNRD3NB    2
DIABLO      2
Name: SYMBOL, dtype: int64

Found 70 UNIPROT replicates
P84243     2
Q6F5E7     2
Q9UBD0     2
Q8IVW1     2
Q9BQ83     2
          ..
Q9ULZ0     6
Q0WX57     7
P68431    10
Q5JQC4    12
P62805    14
Name: UNIPROT, Length: 70, dtype: int64
