In [48]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import torch

# Load Dataset


In [24]:
# Read the three input tables (protein NPX values, peptide abundances and
# clinical UPDRS records) from CSV files in the working directory.
train_proteins, train_peptides, train_clinical = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_proteins.csv',
        'train_peptides.csv',
        'train_clinical_data.csv',
    )
)

## Basic Information
UPDRS is a rating instrument used to measure the severity and progression of Parkinson’s disease in patients. When a patient visits the clinic, the clinic will record how the patient scored on 4 parts of the UPDRS test. This data can be found in train_clinical. The ratings for the first 4 segments of UPDRS are available as updrs_1, updrs_2, updrs_3 and updrs_4 in train_clinical. Our goal is to train a model to predict these UPDRS ratings.

The clinic will also record the patient's NPX (Normalized Protein eXpression) value for all the proteins relevant to Parkinson's disease during each visit. NPX is a value representing the protein concentration in the sample. This data is available in the train_proteins DataFrame.

Proteins are long molecules made up of multiple peptides. The clinic will record the Peptide Abundance of each peptide in proteins relevant to Parkinson's disease. It shows the peptide concentration, similar to NPX for proteins. This data can be found in the train_peptides DataFrame.



In [25]:
# Report (rows, columns) for each raw table as a quick sanity check of the load.
shape_report = (
    ('clinical dataset shape:  ', train_clinical),
    ('proteins dataset shape:  ', train_proteins),
    ('peptides dataset shape:  ', train_peptides),
)
for label, frame in shape_report:
    print(label, frame.shape)



clinical dataset shape:   (2615, 8)
proteins dataset shape:   (232741, 5)
peptides dataset shape:   (981834, 6)


In [26]:
# Preview the first five clinical records (head() defaults to 5 rows).
train_clinical.head()


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [27]:
# Preview the first five protein measurements (head() defaults to 5 rows).
train_proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [28]:
# Preview the first five peptide measurements (head() defaults to 5 rows).
train_peptides.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


# Grouping and Merging


In [29]:
def prepare_dataset(train_proteins, train_peptides):
    """Build one wide row of protein and peptide measurements per visit.

    Each input table is averaged per (visit_id, entity) pair and reshaped so
    that every entity becomes its own column. The wide protein table is then
    left-joined with the wide peptide table on visit_id, so only visits that
    have protein rows appear in the result.

    Args:
        train_proteins: DataFrame with 'visit_id', 'UniProt' and 'NPX' columns.
        train_peptides: DataFrame with 'visit_id', 'Peptide' and
            'PeptideAbundance' columns.

    Returns:
        DataFrame with a 'visit_id' column plus one column per UniProt value
        and one per Peptide value; entity/visit combinations with no
        measurement are NaN.
    """
    def _to_wide(long_df, entity_col, value_col):
        # Averaging per (visit, entity) guarantees a single value per cell,
        # which pivot requires.
        averaged = long_df.groupby(['visit_id', entity_col])[value_col].mean().reset_index()
        wide = averaged.pivot(index='visit_id', columns=entity_col, values=value_col)
        # Drop the columns-axis label left by pivot and expose visit_id as a
        # regular column again.
        return wide.rename_axis(columns=None).reset_index()

    protein_wide = _to_wide(train_proteins, 'UniProt', 'NPX')
    peptide_wide = _to_wide(train_peptides, 'Peptide', 'PeptideAbundance')

    return protein_wide.merge(peptide_wide, on=['visit_id'], how='left')

# Combine the protein and peptide tables into one wide frame keyed by visit_id,
# then show it as the cell output.
mix_dataset = prepare_dataset(
    train_proteins=train_proteins,
    train_peptides=train_peptides,
)
mix_dataset

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.30
1,10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80
2,10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,...,289888.0,8615.27,8770410.0,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,...,173259.0,4767.63,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,
1110,942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,...,185428.0,5554.53,,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60
1111,942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,...,137611.0,6310.09,,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30


In [30]:
# Summary statistics per numeric column; the `count` row exposes how many
# visits actually have a value for each protein/peptide (i.e. missingness).
mix_dataset.describe()

Unnamed: 0,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
count,764.0,1112.0,1100.0,1032.0,1047.0,942.0,1113.0,1050.0,1110.0,1079.0,...,1100.0,994.0,1022.0,977.0,1091.0,1111.0,1089.0,1105.0,1030.0,865.0
mean,11641.264435,511164.9,26505.529157,27305.934884,17688.295406,3004.990691,126151.780054,50773.474638,195599.363694,145382.047368,...,215246.833636,9015.134433,3937256.0,67866.452927,611077.9,92581.223041,125937.993436,471554.596652,47068.709311,21072.04823
std,2817.00353,235735.7,10705.15254,8446.187506,7166.325369,1142.159575,72748.393517,21382.028764,79739.704279,58028.089713,...,50727.227704,3127.061637,1677710.0,61176.002825,504742.2,30138.957433,38696.448657,131150.715245,13689.667117,10360.5938
min,873.778,59718.2,591.103,8945.34,2811.12,336.517,10717.4,5806.84,29740.9,8358.08,...,12164.3,258.249,162464.0,884.26,7494.66,869.898,991.452,11371.2,6362.49,868.903
25%,9736.8575,349059.0,19941.075,21123.65,12920.05,2189.0875,70560.6,37008.975,142054.5,103983.0,...,186658.25,6899.805,2895622.0,30997.9,292950.5,70508.15,100691.0,384902.0,37752.375,14249.9
50%,11546.4,483442.5,26529.7,26624.0,17399.6,2865.46,116900.0,50375.8,185616.0,136452.0,...,217430.5,8604.345,3671010.0,52576.9,470245.0,88918.6,123588.0,463382.0,45503.15,20390.9
75%,13383.025,648557.2,33222.8,32459.275,22077.05,3593.1475,164947.0,63446.7,239731.5,177451.0,...,246423.5,10612.775,4710635.0,85369.6,759238.0,110140.0,149597.0,549455.0,54748.35,27031.9
max,21361.8,1806980.0,66252.4,65347.9,49695.6,9352.64,538862.0,137369.0,766591.0,427084.0,...,409939.0,27670.5,13855500.0,712856.0,3984710.0,251526.0,264224.0,948416.0,107220.0,70020.8


# Split dataset into Train and Test


In [43]:
def split_dataset(dataset, test_ratio=0.20, seed=None):
    """Randomly partition the rows of ``dataset`` into train and test subsets.

    Every row is assigned to the test subset independently with probability
    ``test_ratio``, so the realised test fraction is only approximately
    ``test_ratio``.

    Args:
        dataset: DataFrame to split; it is indexed with a boolean mask of
            length ``len(dataset)``.
        test_ratio: Probability, in [0, 1], that a row lands in the test
            subset.
        seed: Optional seed. When given, a dedicated
            ``np.random.default_rng(seed)`` generator draws the random
            numbers, making the split reproducible and independent of NumPy's
            global random state. When ``None`` (the default) the global state
            is used via ``np.random.rand``, as in the original implementation.

    Returns:
        Tuple ``(train, test)`` holding the rows not selected and the rows
        selected for testing, respectively.

    Raises:
        ValueError: If ``test_ratio`` is not within [0, 1].
    """
    # Written as a negated chained comparison so that NaN is rejected as well.
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(dataset)
    if seed is None:
        # Legacy path: honours any earlier np.random.seed(...) call.
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.default_rng(seed).random(n_rows)

    test_mask = draws < test_ratio
    return dataset[~test_mask], dataset[test_mask]

# Seed NumPy's global RNG immediately before splitting so the train/test
# partition is identical on every Restart & Run All.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
# NOTE(review): rows are visits, so visits of the same patient can end up in
# both subsets — confirm whether a patient-level split is wanted.
train, test = split_dataset(mix_dataset)

In [45]:
# Display the training split (rows of mix_dataset not drawn into the test set).
train

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,202274.0,,4401830.0,77482.60,583075.0,76705.7,104260.0,530223.0,,7207.30
1,10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,...,201009.0,,5001750.0,36745.30,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80
2,10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,220728.0,,5424380.0,39016.00,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70
3,10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,...,188362.0,9433.71,3900280.0,48210.30,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
5,10138_36,13500.90,504239.0,8143.43,25413.7,22013.30,5230.44,128960.0,53157.2,219837.0,...,212239.0,10717.30,5509020.0,50965.70,284677.0,91630.4,133260.0,585615.0,59697.5,24827.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,8344_6,,166738.0,25668.60,21771.6,,2784.08,39135.4,24714.0,105742.0,...,198917.0,5591.53,3686840.0,16901.50,138884.0,55525.2,100245.0,354351.0,28996.2,25412.10
1107,8699_12,8194.96,258441.0,15176.50,,12719.40,5310.69,38205.6,18026.0,179373.0,...,245138.0,7080.13,5823240.0,1676.07,700364.0,76111.4,124881.0,382365.0,42910.8,47741.10
1108,8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,...,289888.0,8615.27,8770410.0,33599.10,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70
1110,942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,...,185428.0,5554.53,,64049.80,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60


In [46]:
# Display the held-out test split (rows of mix_dataset drawn into the test set).
test

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
4,10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09
6,10174_0,11852.40,474771.0,18406.40,28897.9,25497.90,5173.80,239423.0,53264.4,232766.0,...,273526.0,7963.54,4957300.0,144966.0,836762.0,218106.0,150196.0,490093.0,40410.9,20855.40
15,10715_0,,316400.0,20988.50,17683.7,9126.46,3038.81,86087.3,35985.9,115798.0,...,237152.0,4015.29,2554690.0,37151.9,374725.0,50345.0,97198.6,563730.0,30567.1,22225.60
22,10718_36,11056.60,455813.0,6425.16,20836.5,,,83669.4,50120.6,139273.0,...,221403.0,7113.56,3361220.0,49246.7,907132.0,58385.5,140381.0,494567.0,47336.0,29178.60
24,11459_0,13541.60,421862.0,27429.90,21302.7,21044.50,1786.04,110972.0,62374.6,177221.0,...,264438.0,,5231670.0,,1231820.0,105571.0,161194.0,725132.0,66307.7,28106.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,7886_24,,400514.0,26950.30,18594.5,14878.30,4096.41,53894.0,43646.5,123652.0,...,227601.0,6511.68,3629830.0,31546.5,181075.0,75477.3,152511.0,563899.0,62233.9,
1104,8344_54,10795.80,163812.0,28350.60,19465.0,9160.85,3521.52,28097.5,27075.7,105371.0,...,226660.0,5662.71,3969460.0,15542.0,237668.0,58392.4,112817.0,460942.0,41081.4,37209.00
1106,8699_0,7361.28,484747.0,21651.70,,14695.20,5319.25,121582.0,45768.0,264622.0,...,225768.0,7277.05,7226470.0,36474.2,629896.0,69699.3,103876.0,490302.0,54562.4,40784.10
1109,942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,...,173259.0,4767.63,374307.0,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,


# Readying for Training


We train a model to predict updrs_1, updrs_2, updrs_3, updrs_4.


In [52]:
# Model inputs: every column of mix_dataset except the visit_id key, plus
# visit_month as one extra feature name.
# NOTE(review): visit_month is not a column of mix_dataset as built by
# prepare_dataset; presumably it is joined in from train_clinical before
# training — confirm.
feat = [column for column in mix_dataset.columns if column != "visit_id"] + ["visit_month"]


In [None]:
# Names of the four UPDRS score columns in train_clinical to be predicted.
target = [
    "updrs_1",
    "updrs_2",
    "updrs_3",
    "updrs_4",
]

# Logistic Regression

In [51]:
class LogisticRegression(torch.nn.Module):
    """Single linear layer followed by an element-wise sigmoid.

    Args:
        n_inputs: Number of input features per sample.
        n_outputs: Number of output units.

    NOTE(review): the sigmoid confines every output to (0, 1), whereas the
    updrs_* values shown in train_clinical are raw scores such as 10.0 or
    41.0. Presumably the targets are rescaled, or this head is replaced,
    before training — confirm.
    """

    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)

    def forward(self, x):
        """Return ``sigmoid(linear(x))`` for the input tensor ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [None]:
# Size the input layer from the feature list instead of hard-coding 1196, so
# the model stays in sync if the set of protein/peptide columns changes.
n_inputs = len(feat)
# NOTE(review): `target` lists four UPDRS columns but only one output unit is
# configured — presumably one model is trained per target; confirm.
n_outputs = 1