In [1]:
# The three protein sequences analysed throughout this notebook, written as
# upper-case one-letter residue codes.
seq1 = "RATPTRWPVGCFNRPWTKWSYDEALDGIKAAGYAWTGLLTASKPSLHHATATPEYLAALKQKSRHAA"
seq2 = "AAAVMMGLAAIGAAIGIGILGGKFLEGAARQPDLIPLLRTQFFIVMGLVDAIPMIAVGLGLYVMFAVA"
seq3 = "AADVSAAVGATGQSGMTYRLGLSWDWDKSWWQTSTGRLTGYWDAGYTYWEGGDEGAGKHSLSFAPVFVYEFAGDSIKPFIEAGIGVAAFSGTRVGDQNLGSSLNFEDRIGAGLKFANGQSVGVRAIHYSNAGLKQPNDGIESYSLFYKIPI"

# Amino Acid Composition

In [2]:
import pandas as pd

def composition(sequence):
    """Return the percentage of each of the 20 residue codes in ``sequence``.

    Parameters
    ----------
    sequence : str
        Sequence written in upper-case one-letter residue codes.

    Returns
    -------
    dict
        Maps every residue code in the fixed 20-letter list to
        ``100 * count / len(sequence)`` rounded to two decimals. An empty
        sequence yields ``0.0`` for every residue instead of raising
        ``ZeroDivisionError``.
    """
    residues = ["G", "A", "V", "L", "I", "P", "F", "Y", "W", "S", "T", "C", "M", "N", "Q", "D", "E", "K", "R", "H"]

    length = len(sequence)
    if length == 0:
        # Nothing to count; return all-zero percentages rather than dividing by zero.
        return {residue: 0.0 for residue in residues}

    # Named `percentages` so the local no longer shadows the function's own name.
    percentages = {residue: round(100 * sequence.count(residue) / length, 2) for residue in residues}
    return percentages

# Per-residue percentage composition of each sequence.
comp1, comp2, comp3 = map(composition, (seq1, seq2, seq3))

# Residues on the rows, one labelled column per sequence.
df = pd.DataFrame([comp1, comp2, comp3], index=["Seq. 1", "Seq. 2", "Seq. 3"]).T
df

Unnamed: 0,Seq. 1,Seq. 2,Seq. 3
G,5.97,14.71,15.23
A,17.91,19.12,10.6
V,1.49,8.82,5.3
L,8.96,13.24,5.96
I,1.49,11.76,5.3
P,7.46,4.41,2.65
F,1.49,5.88,5.3
Y,4.48,1.47,5.3
W,5.97,0.0,3.97
S,5.97,0.0,9.93


# Molecular Weight

In [3]:
def mol_weight(sequence):
    """Return the approximate molecular weight of ``sequence``.

    The weight is the sum of the per-residue table values minus 18 (the
    ``water`` term) for each of the ``len(sequence) - 1`` junctions between
    consecutive residues.

    Parameters
    ----------
    sequence : str
        Sequence written in upper-case one-letter residue codes.

    Returns
    -------
    int
        The computed weight; ``0`` for an empty sequence.

    Raises
    ------
    KeyError
        If ``sequence`` contains a character missing from the weight table.
    """
    weights = {"A": 85,"C": 115,"D": 130,"E": 145,"F": 160,"G": 70,"W": 200,"H": 150,"I": 125,"K": 145,"L": 125,"M": 143,"N": 130,"Y": 175,"P": 110,"Q": 140,"R": 170,"S": 100,"T": 115,"V": 110}

    if not sequence:
        # Without this guard the water term is 18 * (0 - 1) = -18, which made
        # an empty sequence report a weight of 18.
        return 0

    residue_total = sum(weights[residue] for residue in sequence)
    water = 18 * (len(sequence) - 1)
    return residue_total - water


# Report the molecular weight of each sequence, numbered from 1.
# enumerate() replaces the hand-maintained counter.
for count, sequence in enumerate([seq1, seq2, seq3], start=1):
    print(f"Sequence {count} - {mol_weight(sequence)}")

Sequence 1 - 7127
Sequence 2 - 6529
Sequence 3 - 15453


# Classification

In [4]:
# Reference compositions used by classify(): each residue code maps to a
# (Group A value, Group B value) pair that is compared against the value
# composition() returns for the same residue (presumably percentages, since
# the two are subtracted directly -- confirm against the data source).
# The key order is the order in which classify() accumulates its deviations.
standards = {
    "A": (8.47, 8.95),
    "D": (5.97, 5.91),
    "C": (1.39, 0.47),
    "E": (6.32, 4.78),
    "T": (5.79, 6.54),
    "F": (3.91, 3.68),
    "G": (7.82, 8.54),
    "H": (2.26, 1.25),
    "I": (5.71, 4.77),
    "V": (7.02, 6.76),
    "K": (5.76, 4.93),
    "L": (8.48, 8.78),
    "M": (2.21, 1.56),
    "N": (4.54, 5.74),
    "W": (1.44, 1.24),
    "P": (4.63, 3.74),
    "Q": (3.82, 4.75),
    "R": (4.93, 5.24),
    "S": (5.94, 8.05),
    "Y": (3.58, 4.13),
}


def classify(sequence):
    """Print which reference group ``sequence`` is closer to, with both deviations.

    For every residue in ``standards`` the absolute difference between the
    sequence's composition value and each group's reference value is added to
    that group's running total. The group with the smaller total is reported;
    a tie is reported as Group A. Both totals are then printed rounded to two
    decimals. Nothing is returned.
    """
    observed = composition(sequence)

    dev_a = 0
    dev_b = 0
    for aa, (ref_a, ref_b) in standards.items():
        dev_a += abs(observed[aa] - ref_a)
        dev_b += abs(observed[aa] - ref_b)

    print(
        "The given protein belongs to Group A."
        if dev_a <= dev_b
        else "The given protein belongs to Group B."
    )
    print(f"Group A deviation = {round(dev_a,2)}")
    print(f"Group B deviation = {round(dev_b,2)}")

# Classify each sequence in turn, numbered from 1, with a blank line after
# each report. enumerate() replaces the hand-maintained counter.
for count, sequence in enumerate([seq1, seq2, seq3], start=1):
    print(f"Sequence {count}")
    classify(sequence)
    print("")

Sequence 1
The given protein belongs to Group A.
Group A deviation = 55.85
Group B deviation = 58.53

Sequence 2
The given protein belongs to Group A.
Group A deviation = 74.52
Group B deviation = 76.84

Sequence 3
The given protein belongs to Group B.
Group A deviation = 38.34
Group B deviation = 32.6



# Residue Pair Preference

In [5]:
def dipeptide_composition_1(sequence):
    """Score each ordered residue pair against the summed abundance of its two residues.

    Adjacent residue pairs are counted along the sequence. Each count is then
    scaled as ``100 * count / ((len(sequence) / 100) * (comp[a] + comp[b]))``,
    where ``comp`` is ``composition(sequence)``, and rounded to two decimals.
    Pairs whose two composition values sum to zero are given ``0``.

    Returns a DataFrame with the first residue of the pair on the rows and the
    second residue on the columns.
    """
    alphabet = ["G", "A", "V", "L", "I", "P", "F", "Y", "W", "S", "T", "C", "M", "N", "Q", "D", "E", "K", "R", "H"]
    pair_counts = {first: dict.fromkeys(alphabet, 0) for first in alphabet}

    # Walk the sequence as overlapping (residue, next residue) pairs.
    for first, second in zip(sequence, sequence[1:]):
        pair_counts[first][second] += 1

    freq = composition(sequence)
    scale = len(sequence) / 100

    table = {}
    for first in alphabet:
        row = {}
        for second in alphabet:
            pooled = freq[first] + freq[second]
            if pooled != 0:
                row[second] = round(100 * pair_counts[first][second] / (scale * pooled), 2)
            else:
                row[second] = 0
        table[first] = row

    # The outer keys become columns, so flip to put the first residue on the rows.
    return pd.DataFrame(table).T

# Pair tables from dipeptide_composition_1, one per sequence.
dip_comp_1_1, dip_comp_1_2, dip_comp_1_3 = map(dipeptide_composition_1, (seq1, seq2, seq3))

In [6]:
def dipeptide_composition_2(sequence):
    """Return each ordered residue pair's share of all adjacent pairs, in percent.

    Parameters
    ----------
    sequence : str
        Sequence written in upper-case one-letter residue codes.

    Returns
    -------
    pandas.DataFrame
        20 x 20 table with the first residue of the pair on the rows and the
        second on the columns. Each cell is
        ``100 * pair_count / (len(sequence) - 1)`` rounded to two decimals.
        A sequence with fewer than two residues has no pairs and yields an
        all-zero table.

    Raises
    ------
    KeyError
        If ``sequence`` contains a character outside the 20 residue codes.
    """
    residues = ["G", "A", "V", "L", "I", "P", "F", "Y", "W", "S", "T", "C", "M", "N", "Q", "D", "E", "K", "R", "H"]
    dipeptide_counts = {residue1: {residue2: 0 for residue2 in residues} for residue1 in residues}

    # Walk the sequence as overlapping (residue, next residue) pairs.
    for first, second in zip(sequence, sequence[1:]):
        dipeptide_counts[first][second] += 1

    total_pairs = len(sequence) - 1
    if total_pairs < 1:
        # A single residue previously divided by zero, and an empty sequence
        # divided by -1 (producing -0.0 cells); both simply have no pairs.
        dipeptide_composition = {residue1: {residue2: 0.0 for residue2 in residues} for residue1 in residues}
    else:
        dipeptide_composition = {
            residue1: {
                residue2: round(100 * dipeptide_counts[residue1][residue2] / total_pairs, 2)
                for residue2 in residues
            }
            for residue1 in residues
        }

    # The outer keys become columns, so transpose to put the first residue on the rows.
    return pd.DataFrame(dipeptide_composition).transpose()

# Pair tables from dipeptide_composition_2, one per sequence.
dip_comp_2_1, dip_comp_2_2, dip_comp_2_3 = map(dipeptide_composition_2, (seq1, seq2, seq3))

In [7]:
def dipeptide_composition_3(sequence):
    """Score each ordered residue pair against the product of its two residues' abundances.

    Adjacent residue pairs are counted along the sequence. Each count is then
    scaled as
    ``100 * count / ((len(sequence)**2 / 100**2) * (comp[a] * comp[b]))``,
    where ``comp`` is ``composition(sequence)``, and rounded to two decimals.
    Pairs for which the product of the two composition values is zero are
    given ``0``.

    Returns a DataFrame with the first residue of the pair on the rows and the
    second residue on the columns.
    """
    alphabet = ["G", "A", "V", "L", "I", "P", "F", "Y", "W", "S", "T", "C", "M", "N", "Q", "D", "E", "K", "R", "H"]
    pair_counts = {first: dict.fromkeys(alphabet, 0) for first in alphabet}

    # Walk the sequence as overlapping (residue, next residue) pairs.
    for first, second in zip(sequence, sequence[1:]):
        pair_counts[first][second] += 1

    freq = composition(sequence)
    scale = len(sequence) ** 2 / 100 ** 2

    table = {}
    for first in alphabet:
        row = {}
        for second in alphabet:
            joint = freq[first] * freq[second]
            if joint != 0:
                row[second] = round(100 * pair_counts[first][second] / (scale * joint), 2)
            else:
                row[second] = 0
        table[first] = row

    # The outer keys become columns, so flip to put the first residue on the rows.
    return pd.DataFrame(table).T

# Pair tables from dipeptide_composition_3, one per sequence.
dip_comp_3_1, dip_comp_3_2, dip_comp_3_3 = map(dipeptide_composition_3, (seq1, seq2, seq3))

In [8]:
from heapq import nlargest

def find_top_ten(dip_comp, n=10):
    """Return the labels of the ``n`` highest-valued cells of a pair table.

    Parameters
    ----------
    dip_comp : pandas.DataFrame
        Table with string row and column labels. Each cell is identified by
        its row label concatenated with its column label.
    n : int, default 10
        How many labels to return. The default keeps the original top-ten
        behaviour.

    Returns
    -------
    list of str
        Cell labels ordered from the largest value downwards; shorter than
        ``n`` when the table has fewer than ``n`` cells.
    """
    # Single-step .loc[row, col] lookup instead of chained .loc[row][col].
    values = {
        row + col: dip_comp.loc[row, col]
        for row in dip_comp.index
        for col in dip_comp.columns
    }
    return nlargest(n, values, key=values.get)

In [9]:
# Highest-valued residue pairs in the dipeptide_composition_3 table for seq3.
find_top_ten(dip_comp_3_3)

['MT', 'KH', 'IH', 'HY', 'WD', 'PN', 'QP', 'RL', 'GM', 'IE']

# Average Values

In [10]:
def hydrophobicity(sequence):
    """Return the mean per-residue hydrophobicity table value of ``sequence``.

    Parameters
    ----------
    sequence : str
        Sequence written in upper-case one-letter residue codes.

    Returns
    -------
    float
        Sum of the table values for every residue divided by
        ``len(sequence)``, rounded to two decimals. An empty sequence returns
        ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If ``sequence`` contains a character missing from the table.
    """
    data = {'A': 13.85, 'D': 11.61, 'C': 15.37, 'E': 11.38, 'F': 13.93, 'G': 13.34, 'H': 13.82, 'I': 15.28, 'K': 11.58, 'L': 14.13, 'M': 13.86, 'N': 13.02, 'P': 12.35, 'Q': 12.61, 'R': 13.10, 'S': 13.39, 'T': 12.70, 'V': 14.56, 'W': 15.48, 'Y': 13.88}

    if not sequence:
        # No residues to average over; avoid dividing by zero.
        return 0.0

    # Plain left-to-right accumulation keeps results identical to earlier runs.
    total = 0
    for residue in sequence:
        total += data[residue]

    return round(total / len(sequence), 2)

In [11]:
def helical_contact_area(sequence):
    """Return the summed per-residue helical contact area table value of ``sequence``.

    Every residue contributes its table value; the total is rounded to two
    decimals. A character missing from the table raises ``KeyError``.
    """
    area_by_residue = {'A': 20.0, 'D': 26.0, 'C': 25.0, 'E': 33.0, 'F': 46.0, 'G': 13.0, 'H': 37.0, 'I': 39.0, 'K': 46.0, 'L': 35.0, 'M': 43.0, 'N': 28.0, 'P': 22.0, 'Q': 36.0, 'R': 55.0, 'S': 20.0, 'T': 28.0, 'V': 33.0, 'W': 61.0, 'Y': 46.0}

    # All table values are whole numbers, so the sum is exact.
    return round(sum(area_by_residue[aa] for aa in sequence), 2)

In [12]:
def total_nonbonded_energy(sequence):
    """Return the summed per-residue non-bonded energy table value of ``sequence``.

    Every residue contributes its table value; the total is rounded to two
    decimals. A character missing from the table raises ``KeyError``.
    """
    energy_by_residue = {'A': 1.9, 'D': 1.52, 'C': 2.04, 'E': 1.54, 'F': 1.86, 'G': 1.9, 'H': 1.76, 'I': 1.95, 'K': 1.37, 'L': 1.97, 'M': 1.96, 'N': 1.56, 'P': 1.7, 'Q': 1.52, 'R': 1.48, 'S': 1.75, 'T': 1.77, 'V': 1.98, 'W': 1.87, 'Y': 1.69}

    contributions = [energy_by_residue[aa] for aa in sequence]

    # Add strictly left to right so the floating-point result is unchanged.
    running = 0
    for value in contributions:
        running = running + value

    return round(running, 2)

In [13]:
# One row per sequence, one column per property. Rows are labelled directly
# instead of being keyed by the sequence string: with the dict-keyed form, two
# identical sequences collapsed into a single entry and the later column
# assignment failed with a length mismatch.
properties = pd.DataFrame(
    [
        [hydrophobicity(seq), helical_contact_area(seq), total_nonbonded_energy(seq)]
        for seq in (seq1, seq2, seq3)
    ],
    index=["Seq. 1", "Seq. 2", "Seq. 3"],
    columns=["Average Hydrophobicity", "Helical Contact Area", "Total Non-Bonded Energy"],
)
properties

Unnamed: 0,Average Hydrophobicity,Helical Contact Area,Total Non-Bonded Energy
Seq. 1,13.35,2156.0,117.74
Seq. 2,13.77,2067.0,126.66
Seq. 3,13.42,4616.0,267.75
