In [1]:
  ! pip install biopython

Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 4.4 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79


Obtain the raw data

In [2]:
# Download the Mus musculus site table into the current working directory
# (wget keeps the remote file name, Mus_musculus_info.csv).
# NOTE(review): the URL follows the repository's `main` branch, so the file
# contents can change between runs — consider pinning a commit.
! wget 'https://github.com/sonluongvu/Palm_structure/raw/main/Mus_musculus/Mus_musculus_info.csv'

--2022-01-24 23:43:37--  https://github.com/sonluongvu/Palm_structure/raw/main/Mus_musculus/Mus_musculus_info.csv
Resolving github.com (github.com)... 52.69.186.44
Connecting to github.com (github.com)|52.69.186.44|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/sonluongvu/Palm_structure/main/Mus_musculus/Mus_musculus_info.csv [following]
--2022-01-24 23:43:37--  https://raw.githubusercontent.com/sonluongvu/Palm_structure/main/Mus_musculus/Mus_musculus_info.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 248585 (243K) [text/plain]
Saving to: ‘Mus_musculus_info.csv’


2022-01-24 23:43:38 (7.62 MB/s) - ‘Mus_musculus_info.csv’ saved [248585/248585]



Open the raw data

In [3]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align import MultipleSeqAlignment
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.Align.Applications import MuscleCommandline

In [4]:
# The download step saves the CSV into the current working directory, so read
# it from there rather than assuming the notebook runs under Colab's /content.
df_path = 'Mus_musculus_info.csv'
df = pd.read_csv(df_path)

# Show (rows, columns) as a quick sanity check on the load.
df.shape

(254, 4)

Import libraries

Extract peptides spanning 100 aa upstream and 100 aa downstream of each palmitoylated cysteine (C)

In [5]:
# One SeqRecord per input row in each list: the full window around the site,
# the part before the site, and the part after it.
peptide_record = []
upstream_peptide_record = []
downstream_peptide_record = []

for pos, original_seq, seq_id in zip(df['pos'], df['protein_seq'], df['id']):
  # Index of the site inside protein_seq: `pos` shifted to 0-based plus a fixed
  # offset of 200.
  # NOTE(review): presumably protein_seq carries 200 extra leading characters
  # (padding) ahead of the real sequence — confirm against how the CSV was built.
  site = pos - 1 + 200

  # Window bounds: 100 positions before the site up to 100 positions after it
  # (the slice end is exclusive, hence +101).
  start = site - 100
  end = site + 101

  # Full window (site included), then the two flanks with the site left out.
  peptide_record.append(SeqRecord(Seq(original_seq[start:end]), id = seq_id))
  upstream_peptide_record.append(SeqRecord(Seq(original_seq[start:site]), id = seq_id))
  downstream_peptide_record.append(SeqRecord(Seq(original_seq[site + 1:end]), id = seq_id))

Create the sequence alignment

In [6]:
# No aligner is run here: the extracted records are placed as-is into one
# MultipleSeqAlignment per window type (full, upstream, downstream).
alignment = MultipleSeqAlignment(peptide_record)
upstream_alignment = MultipleSeqAlignment(upstream_peptide_record)
downstream_alignment = MultipleSeqAlignment(downstream_peptide_record)

# Save each alignment as FASTA in the working directory. The last call is left
# as the cell's final expression so its return value is displayed.
AlignIO.write(alignment, handle = 'alignments.fasta', format = 'fasta')
AlignIO.write(upstream_alignment, handle = 'upstream_alignments.fasta', format = 'fasta')
AlignIO.write(downstream_alignment, handle = 'downstream_alignments.fasta', format = 'fasta')

1

In [7]:
# Wrap the full-window alignment in Biopython's summary helper.
alignment_info = AlignInfo.SummaryInfo(alignment)
# Position-specific score matrix computed with the method's default arguments.
# The next cell reads it as alignment_pssm[position][residue_letter].
# NOTE(review): presumably each entry is the count of that residue in that
# alignment column — confirm against the Biopython AlignInfo documentation.
alignment_pssm = alignment_info.pos_specific_score_matrix()

Create dataframe from the pssm

In [8]:
# Columns exported to the CSV: the 20 standard amino-acid letters. Any other
# symbol that may occur in the alignment is not exported.
aa_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

# Take the number of positions from the alignment itself instead of repeating
# the window size (201) chosen in the extraction cell, so the two cannot drift
# apart if the window is changed.
n_positions = alignment.get_alignment_length()

# Build every column in one constructor call: one row per alignment position,
# one column per amino acid, in aa_list order.
pssm_df = pd.DataFrame({
  aa: [alignment_pssm[position][aa] for position in range(n_positions)]
  for aa in aa_list
})
pssm_df.to_csv('pssm.csv', index = False)