# Parkinson's Disease Progression Prediction
### Project - SI 618: Data Manipulation and Analysis

Authors
1. Nowrin Mohamed - nowrin@umich.edu
2. Prithvijit Dasgupta - prithvid@umich.edu
3. Sachin Salim - sachinks@umich.edu

@misc{amp-parkinsons-disease-progression-prediction,
    author = {Leslie Kirsch and Sohier Dane and Stacey Adam and Victoria Dardov},
    title = {AMP®-Parkinson's Disease Progression Prediction},
    publisher = {Kaggle},
    year = {2023},
    url = {https://kaggle.com/competitions/amp-parkinsons-disease-progression-prediction}
}

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the four raw CSV tables from the relative `data/` directory.
# The files are read in the order listed and bound to the names on the left.
(
    supplemental_clinical_data,
    peptides_data,
    clinical_data,
    proteins_data,
) = map(
    pd.read_csv,
    (
        'data/supplemental_clinical_data.csv',
        'data/train_peptides.csv',
        'data/train_clinical_data.csv',
        'data/train_proteins.csv',
    ),
)

# Proteins

### Feature extraction

In [4]:
# Reshape the long protein table to wide form: one row per `visit_id`, one
# column per `UniProt` value, each cell holding the corresponding `NPX` value.
# Visit/protein pairs that are absent from the long table show up as NaN.
proteins_features = pd.pivot(
    proteins_data,
    index='visit_id',
    columns='UniProt',
    values='NPX',
)
proteins_features

UniProt,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,129048.0,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.40
10053_12,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,108114.0,...,,14408.40,,,28537.0,171733.0,65668.1,,9295.65,25697.80
10053_18,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,163776.0,...,317477.0,38667.20,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.70
10138_12,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,56725.0,...,557904.0,44556.90,155619.0,14647.90,36927.7,229232.0,106564.0,26077.7,21441.80,7642.42
10138_24,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,85767.1,...,,47836.70,177619.0,17061.10,25510.4,176722.0,59471.4,12639.2,15091.40,6168.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8699_24,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,138910.0,...,,25690.60,,6859.82,19106.7,121161.0,113872.0,14413.9,28225.50,8062.07
942_12,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,100519.0,...,45742.3,33518.60,94049.7,13415.70,21324.7,234094.0,82410.4,19183.7,17804.10,12277.00
942_24,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,99183.5,...,180475.0,29770.60,95949.9,11344.40,23637.6,256654.0,76931.9,19168.2,19215.90,14625.60
942_48,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,84875.1,...,197987.0,29283.80,121696.0,19169.80,16724.9,232301.0,96905.9,21120.9,14089.80,16418.50


### Filling nan

Fill each protein's missing NPX values with that protein's mean across all visits.

In [5]:
# Impute missing entries column by column: every NaN in a protein column is
# replaced by that column's mean. The frame is modified in place.
proteins_features.fillna(
    value=proteins_features.mean(axis=0),
    inplace=True,
)

# Clinical data

### Feature extraction

In [21]:
# Keep only the visit identifier and the four UPDRS score columns. The
# explicit copy lets later cells modify `clinical_features` without touching
# `clinical_data`.
clinical_features = clinical_data.loc[
    :, ['visit_id', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
].copy()
clinical_features

Unnamed: 0,visit_id,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,10.0,6.0,6,
1,55_3,10.0,7.0,19,
2,55_6,8.0,10.0,58,
3,55_9,8.0,9.0,17,0.0
4,55_12,10.0,10.0,2,0.0
...,...,...,...,...,...
2610,65043_48,7.0,6.0,51,0.0
2611,65043_54,4.0,8.0,55,1.0
2612,65043_60,6.0,6.0,0,1.0
2613,65043_72,3.0,9.0,58,1.0


### Filling nan

Fill missing values with the mean of each UPDRS score, truncated to an integer so the score columns can be stored as `int64`.

In [26]:
# Impute missing UPDRS scores with the per-column mean and store the scores
# as integers.
updrs_columns = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

# Means of the numeric columns only (`visit_id` is a string and is skipped).
# NOTE: astype(int) truncates the fractional part rather than rounding, so
# the fill value is the mean rounded toward zero.
mean_values = clinical_features.mean(numeric_only=True).astype(int)

# fillna with a Series matches on column labels, so only the UPDRS columns
# are filled; the frame is modified in place.
clinical_features.fillna(mean_values, inplace=True)

# With no NaNs left, the score columns can be cast to a true integer dtype.
clinical_features[updrs_columns] = clinical_features[updrs_columns].astype('int64')

display(clinical_features.head())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() would render a stray "None" below the report.
clinical_features.info()

Unnamed: 0,visit_id,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,10,6,6,1
1,55_3,10,7,19,1
2,55_6,8,10,58,1
3,55_9,8,9,17,0
4,55_12,10,10,2,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2615 entries, 0 to 2614
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   visit_id  2615 non-null   object
 1   updrs_1   2615 non-null   int64 
 2   updrs_2   2615 non-null   int64 
 3   updrs_3   2615 non-null   int64 
 4   updrs_4   2615 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 102.3+ KB


None

: 