In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, SGD, Adagrad
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from scipy.stats import uniform, randint
import pickle
import amp_pd_peptide
import warnings
warnings.filterwarnings("ignore")
from xgboost import XGBClassifier
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold
import re
import requests

In [2]:
def smape_plus_one(y_true, y_pred):
    """Return the SMAPE (in percent) of the inputs after shifting both by +1.

    The predictions are additionally rounded to the nearest integer after the
    shift. Entries whose denominator is zero contribute 0 to the mean.
    """
    actual = np.array(y_true) + 1
    forecast = np.round(np.array(y_pred) + 1)

    # Per-element ratios default to 0 so zero-denominator entries stay at 0
    ratios = np.zeros(len(actual))

    abs_error = np.abs(actual - forecast)
    half_scale = (np.abs(actual) + np.abs(forecast)) / 2

    usable = half_scale != 0
    ratios[usable] = abs_error[usable] / half_scale[usable]

    return 100 * np.mean(ratios)

# define settings

In [3]:
# Run mode of the notebook:
#   'CV'     -> cross-validate the models
#   'final'  -> fit the final models on all available rows
#   'submit' -> submission run (not exercised by the cells shown here)
point = 'final'

# First column index of the feature slice: 3 leaves visit_month out of the
# features, 2 keeps visit_month as a feature
visit = 3

# How missing UPDRS values are handled: "median" fills with ffill/bfill and the
# visit-month medians, "knn" fills with a KNN imputer, "nofill" drops NaN rows
fill_na = 'nofill'

# True to add the weighted UPDRS sum column (updrs_sum)
sum_col = True

# True to add feature columns holding the visit-month median UPDRS values
med_cols = False

In [4]:
# Set random seed so NumPy's global random state is the same on every run
np.random.seed(13)

# Load train datasets.
# Every competition CSV lives in the same Kaggle input folder, so the folder is
# spelled out once here instead of being repeated in each path.
INPUT_DIR = "/kaggle/input/amp-parkinsons-disease-progression-prediction"

train_clinical_df = pd.read_csv(f"{INPUT_DIR}/train_clinical_data.csv")
train_peptides_df = pd.read_csv(f"{INPUT_DIR}/train_peptides.csv")
train_proteins_df = pd.read_csv(f"{INPUT_DIR}/train_proteins.csv")
supplemental_clinical_data = pd.read_csv(f"{INPUT_DIR}/supplemental_clinical_data.csv")

# functions to get and create new protein features

import requests
import pandas as pd


# Collect the distinct UniProt IDs of the protein table and report how many there are
unique_protein_ids = pd.unique(train_proteins_df['UniProt'])
print(len(unique_protein_ids))

# Define function to retrieve UniProt annotations for a list of protein IDs
def get_uniprot_annotations(protein_ids, fetch_text=None):
    """Collect the Gene Ontology (GO) cross-references of UniProt entries.

    The flat-file text of every entry is scanned for cross-reference lines of
    the form ``DR   GO; <GO id>; <term>; <evidence>.`` and the GO id / term
    pairs are gathered per protein.

    Parameters
    ----------
    protein_ids : iterable of str
        UniProt accession IDs to look up.
    fetch_text : callable, optional
        ``fetch_text(protein_id) -> str`` returning the flat-file text of one
        entry. When omitted, the text is downloaded from uniprot.org with
        ``requests``. Passing a callable allows cached or offline sources.

    Returns
    -------
    dict
        ``{protein_id: {go_id: go_term}}``. Proteins without any GO line do not
        appear in the result.
    """
    def _download(protein_id):
        url = f'https://www.uniprot.org/uniprot/{protein_id}.txt'
        # A timeout keeps one stalled request from blocking the whole notebook
        return requests.get(url, timeout=30).text

    fetch = _download if fetch_text is None else fetch_text

    annotations = {}
    for protein_id in protein_ids:
        for line in fetch(protein_id).split('\n'):
            if not line.startswith('DR'):
                continue
            # After the 'DR' line code the fields are: database; id; term; evidence
            fields = [field.strip() for field in line[2:].split(';')]
            # Match on the database name itself (not a substring of the
            # accession) and skip malformed lines that are too short to parse
            if len(fields) < 3 or fields[0] != 'GO':
                continue
            go_id, go_term = fields[1], fields[2]
            annotations.setdefault(protein_id, {})[go_id] = go_term
    return annotations

# Query the annotations once per distinct protein ID
unique_annotations = get_uniprot_annotations(unique_protein_ids)

# Give every protein row its annotation dict (an empty dict when none was found)
train_proteins_df['UniProt_annotations'] = train_proteins_df['UniProt'].map(
    lambda uniprot_id: unique_annotations.get(uniprot_id, {})
)
train_proteins_df

# Count the distinct annotation dicts. Each dict is turned into a sorted tuple of
# its items so it becomes hashable; note that unique_annotations is rebound here
# from the per-protein dict to this set of tuples.
unique_annotations = {
    tuple(sorted(annotation.items()))
    for annotation in train_proteins_df['UniProt_annotations']
}
num_annotations = len(unique_annotations)
print(f"There are {num_annotations} unique UniProt annotations.")

# Define function to retrieve ProteomicsDB expression data
def get_proteomicsdb_expression(protein_id, fetch_text=None):
    """Return the ProteomicsDB expression level per tissue for one protein.

    The export is read as tab-separated text. Only lines with exactly three
    fields are used: the first field is taken as the tissue name and the second
    as the numeric level.

    Parameters
    ----------
    protein_id : str
        UniProt accession ID used to build the export URL.
    fetch_text : callable, optional
        ``fetch_text(protein_id) -> str`` returning the TSV text. When omitted,
        the text is downloaded from proteomicsdb.org with ``requests``.

    Returns
    -------
    dict
        ``{tissue: level}`` with ``level`` as float; empty when no usable row
        was found.
    """
    if fetch_text is None:
        url = f'https://www.proteomicsdb.org/proteomicsdb/data/export/entry_export_{protein_id}.tsv'
        # A timeout keeps one stalled request from blocking the whole notebook
        text = requests.get(url, timeout=30).text
    else:
        text = fetch_text(protein_id)

    expression = {}
    for line in text.split('\n'):
        fields = line.split('\t')
        if len(fields) != 3:
            continue
        try:
            level = float(fields[1].strip())
        except ValueError:
            # Header rows (or any row whose level is not numeric) are skipped
            # instead of aborting the whole retrieval
            continue
        expression[fields[0].strip()] = level
    return expression

# Distinct UniProt IDs to query
unique_proteins = train_proteins_df['UniProt'].unique()

# Expression data keyed by UniProt ID, retrieved once per distinct protein
expression_dict = {
    uniprot_id: get_proteomicsdb_expression(uniprot_id)
    for uniprot_id in unique_proteins
}

# Attach each row's expression dict by looking its UniProt ID up in expression_dict
train_proteins_df['ProteomicsDB_expression'] = train_proteins_df['UniProt'].apply(
    expression_dict.__getitem__
)
train_proteins_df

# prepare train dataset

In [5]:
# Per-visit-month medians of the UPDRS targets.
# updrs_1 medians come from the clinical data only; updrs_2..4 medians come from
# the clinical data stacked with the supplemental clinical data.
target_columns_clinical_data = ['updrs_1']
target_columns_clinical_and_supplemental_data = ['updrs_2', 'updrs_3', 'updrs_4']

target_visit_month_medians_clinical_data = (
    train_clinical_df
    .groupby('visit_month')[target_columns_clinical_data]
    .median()
)
target_visit_month_medians_clinical_and_supplemental_data = (
    pd.concat([train_clinical_df, supplemental_clinical_data], axis=0)
    .groupby('visit_month')[target_columns_clinical_and_supplemental_data]
    .median()
    # Month 5 is dropped: that visit month comes from the supplemental clinical data
    .drop(5)
)

# Put the medians of all four targets side by side (aligned on visit_month), then
# replace each value with the running maximum over the visit months up to and
# including it, and turn visit_month back into a regular column
target_visit_month_medians = (
    pd.concat(
        [
            target_visit_month_medians_clinical_data,
            target_visit_month_medians_clinical_and_supplemental_data,
        ],
        axis=1,
    )
    .expanding(min_periods=1)
    .max()
    .reset_index()
)
target_visit_month_medians


Unnamed: 0,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,0,4.5,4.0,18.0,0.0
1,3,4.5,5.0,19.0,0.0
2,6,6.0,6.0,21.0,0.0
3,9,6.0,6.0,21.0,0.0
4,12,6.0,6.0,21.0,0.0
5,18,6.0,6.0,21.0,0.0
6,24,6.0,6.0,21.0,0.0
7,30,7.0,6.0,22.0,0.0
8,36,7.0,6.0,22.0,0.0
9,42,7.0,7.0,23.0,0.0


In [6]:
# Pivot the long peptide table into one row per visit_id and one column per
# peptide, summing the abundances and filling missing combinations with 0.
# The aggregation is passed by name ('sum') instead of as np.sum: recent pandas
# versions warn about NumPy callables here and announce a behaviour change.
pivoted_pept_df = pd.pivot_table(
    train_peptides_df,
    values='PeptideAbundance',
    index='visit_id',
    columns='Peptide',
    aggfunc='sum',
    fill_value=0,
)
pivoted_pept_df.reset_index()

Peptide,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,6580710,31204.4,7735070,0.0,0.00,0.0,46620.3,236144.0,0.0,...,202274.0,0.00,4401830,77482.6,583075.0,76705.7,104260.0,530223.0,0.0,7207.30
1,10053_12,6333510,52277.6,5394390,0.0,0.00,0.0,57554.5,108298.0,45885.4,...,201009.0,0.00,5001750,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80
2,10053_18,7129640,61522.0,7011920,35984.7,17188.00,19787.3,36029.4,708729.0,5067790.0,...,220728.0,0.00,5424380,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70
3,10138_12,7404780,46107.2,10610900,0.0,20910.20,66662.3,55253.9,79575.5,6201210.0,...,188362.0,9433.71,3900280,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,13788300,56910.3,6906160,13785.5,11004.20,63672.7,36819.8,34160.9,2117430.0,...,206187.0,6365.15,3521800,69984.6,496737.0,80919.3,111799.0,0.0,56977.6,4903.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,6312970,44462.7,12455000,11051.3,1163.18,43279.8,67743.5,325328.0,4666550.0,...,289888.0,8615.27,8770410,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70
1109,942_12,11289900,46111.7,11297300,0.0,13894.10,53755.0,40289.3,565112.0,0.0,...,173259.0,4767.63,374307,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,0.00
1110,942_24,10161900,32145.0,12388000,25869.2,17341.80,48625.5,45223.9,84448.0,4684800.0,...,185428.0,5554.53,0,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60
1111,942_48,8248490,30563.4,11882600,0.0,19114.90,60221.4,46685.9,81282.9,5542110.0,...,137611.0,6310.09,0,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30


In [7]:
# Read the pre-built GO-grouped protein table (go_proteins_df_grouped.csv)
grouped_df = pd.read_csv(filepath_or_buffer="/kaggle/input/models/go_proteins_df_grouped.csv")
grouped_df

Unnamed: 0,visit_id,GO:0003094,GO:0120020,GO:0014911,GO:0045947,GO:0038020,GO:0070062,GO:0097114,GO:0061635,GO:1904237,...,GO:0014065,GO:0030017,GO:0044877,GO:0046836,GO:0048048,GO:0002034,GO:0030169,GO:0046904,GO:0032700,GO:0000015
0,10053_0,1981360,9856899,878672,65900,234204,465833739,5158030,58286,1280490,...,0,0,27795556,87443,1051980,1362436,0,376815,292483,0
1,10053_12,1844759,10696363,913336,65668,230542,460776415,5201730,62091,1248460,...,0,0,25899966,75270,738247,1294853,27432,240800,289648,0
2,10053_18,1638900,14997144,1009310,59986,281995,514930764,8381290,69448,1769390,...,31056,13393,36832846,220833,843926,1753353,7764,420975,291653,4487
3,10138_12,5661841,37386027,1521000,106564,400221,643537600,12307400,66891,1562330,...,17342,10758,40382404,155832,816347,2327903,27928,435038,296983,7524
4,10138_24,3212301,22452124,1362240,59471,423551,663870771,10068700,85381,1585900,...,19484,13478,52103292,158536,598882,1790444,18208,473383,475865,4399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,8699_24,4667083,35856331,1680950,113872,185405,655368264,9480780,85995,1737010,...,27348,13629,36786992,171568,530973,1723864,42963,341908,464751,6584
1109,942_12,936951,23424722,809703,82410,204100,547894586,7939170,62675,1137330,...,0,14370,40361745,219038,665474,1592590,25162,372429,290752,8590
1110,942_24,937573,24334809,654595,76931,210042,561516992,7161030,76887,1124410,...,0,11601,38154353,200262,643751,1190962,23646,256005,271587,7664
1111,942_48,1087069,32187994,912004,96905,198135,591755952,8644310,72340,1042130,...,0,12804,36974101,224349,492658,1545162,19440,296240,318640,7381


In [8]:
# define a function to get the corresponding UPDRS values
def get_updrs(row):
    """Return the median UPDRS values of the visit month closest to the row's.

    The lookup table is the module-level ``target_visit_month_medians``; the
    result is a Series with the keys updrs_1, updrs_2, updrs_3 and updrs_4.
    """
    # Absolute distance between every tabulated visit month and this row's month
    month_gap = (target_visit_month_medians['visit_month'] - row['visit_month']).abs()
    # The first entry of the argsort is the position of the smallest distance
    nearest = target_visit_month_medians.iloc[month_gap.argsort()[0]]

    score_names = ('updrs_1', 'updrs_2', 'updrs_3', 'updrs_4')
    return pd.Series({name: nearest[name] for name in score_names})

In [9]:
# Join the GO-grouped protein table with the clinical data on visit_id
df = pd.merge(grouped_df, train_clinical_df, on='visit_id')

# Move the two key columns to positions 1 and 2, right after visit_id
for position, key_col in enumerate(['patient_id', 'visit_month'], start=1):
    df.insert(position, key_col, df.pop(key_col))

# The medication-state column is not used further
df = df.drop(columns='upd23b_clinical_state_on_medication')

# Order the visits chronologically within each patient
df = df.sort_values(['patient_id', 'visit_month'])
df

Unnamed: 0,visit_id,patient_id,visit_month,GO:0003094,GO:0120020,GO:0014911,GO:0045947,GO:0038020,GO:0070062,GO:0097114,...,GO:0048048,GO:0002034,GO:0030169,GO:0046904,GO:0032700,GO:0000015,updrs_1,updrs_2,updrs_3,updrs_4
814,55_0,55,0,1940704,28883082,1156340,0,334859,687250139,15487900,...,1611130,2307020,43209,396955,507483,9988,10.0,6.0,15.0,
817,55_6,55,6,2041127,26587729,1218080,0,298466,632508976,13376000,...,1483200,2020728,42570,402313,567314,7565,8.0,10.0,34.0,
815,55_12,55,12,1987591,28499891,1229910,65762,404240,688565082,16794500,...,1547140,2382398,47865,405112,571323,10010,10.0,10.0,41.0,0.0
816,55_36,55,36,2074784,26864664,1090610,74976,435232,755808757,11734700,...,1725030,2659245,36168,483287,462881,10085,17.0,18.0,51.0,0.0
1067,942_6,942,6,941030,31964041,1455960,82335,188725,528161920,12340200,...,637932,1608430,5439,352676,326540,9284,8.0,2.0,21.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,64674_84,64674,84,3247722,17023042,1751820,62217,229489,424420898,4029010,...,578556,1418148,7539,324458,155880,0,11.0,15.0,45.0,4.0
1023,65043_0,65043,0,972199,36696770,942766,37584,365236,640816559,24488400,...,1805510,2192994,13489,380962,617721,11517,2.0,6.0,16.0,
1024,65043_12,65043,12,980092,33696834,1059030,40588,393845,645377678,21073800,...,1703120,2232901,59468,384273,665367,9704,4.0,7.0,14.0,0.0
1025,65043_24,65043,24,1014144,37208227,1049620,36150,469670,697632853,19897000,...,1667250,2710603,53897,416887,664180,9745,4.0,8.0,,0.0


In [10]:
# Row by row, look up the median UPDRS values of the closest visit month;
# the result has one row per row of df and the four updrs columns
updrs_df = df.apply(get_updrs, axis='columns')
updrs_df

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4
814,4.5,4.0,18.0,0.0
817,6.0,6.0,21.0,0.0
815,6.0,6.0,21.0,0.0
816,7.0,6.0,22.0,0.0
1067,6.0,6.0,21.0,0.0
...,...,...,...,...
1022,7.0,9.0,29.0,3.0
1023,4.5,4.0,18.0,0.0
1024,6.0,6.0,21.0,0.0
1025,6.0,6.0,21.0,0.0


In [11]:
# Optional imputation of missing UPDRS values, selected by fill_na.
# For any other value of fill_na (e.g. 'nofill') neither branch runs and the
# NaN values stay in df.
if fill_na == 'knn':
    
    print("filling NaN values with values from similar rows kNN")

    # Create a KNN imputer with k=5
    updrs_imputer = KNNImputer(n_neighbors=5)

    # Fit the imputer on the last four columns only (updrs_1..updrs_4 at this
    # point) and overwrite them with the imputed values
    df.iloc[:,-4:] = updrs_imputer.fit_transform(df.iloc[:,-4:])
    
elif fill_na == 'median':
    
    print("filling NaN values with the median values")
    
    # Group by patient_id
    grouped = df.groupby('patient_id')

    # Fill the NaN values for each group: forward-fill, then back-fill, within
    # one patient's rows, and write the filled values back into df.
    # NOTE(review): fillna(method=...) is deprecated in recent pandas in favour
    # of .ffill()/.bfill() - confirm against the pandas version in use.
    for patient_id, group in grouped:
        group.loc[:, ['updrs_1','updrs_2','updrs_3','updrs_4']] = group.loc[:, ['updrs_1','updrs_2','updrs_3','updrs_4']].fillna(method='ffill').fillna(method='bfill')
        df.loc[group.index, ['updrs_1','updrs_2','updrs_3','updrs_4']] = group[['updrs_1','updrs_2','updrs_3','updrs_4']]
    # Whatever is still missing is taken from updrs_df (the visit-month medians
    # looked up per row), matched on index and column name
    df.fillna(updrs_df, inplace=True)

    
# if we want to use a feature that is the median value
if med_cols:
    
    print("creating 4 new feature columns with the theoretical median values")
    
    df[['updrs_1_med','updrs_2_med','updrs_3_med','updrs_4_med']] = updrs_df
    
    # create a list of all columns in the desired order:
    # the four updrs columns are moved to the end, after the new *_med columns
    cols = list(df.columns)
    cols.remove('updrs_1')
    cols.remove('updrs_2')
    cols.remove('updrs_3')
    cols.remove('updrs_4')
    cols += ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

    # reindex the DataFrame with the new column order
    df = df.reindex(columns=cols)

# sort df by patient_id and by visit_month
df = df.sort_values(['patient_id','visit_month'])

# if we want to use the weighted sum of all the updrs values as feature
if sum_col:
    print("creating a new feature updrs_sum as the ponderated sum of all updrs values")
    
    # divide each updrs value by a fixed divisor (13, 13, 14, 6) and add them up;
    # the result is NaN whenever any of the four values is NaN.
    # NOTE(review): the divisors are presumably per-part question counts - confirm.
    df['updrs_sum'] = df['updrs_1']/13+df['updrs_2']/13+df['updrs_3']/14+df['updrs_4']/6
    # The insert position is computed while updrs_sum is still the last column
    # (before pop removes it), so updrs_sum ends up directly before updrs_1
    df.insert(len(df.columns)-5,'updrs_sum',df.pop('updrs_sum'))

df

creating a new feature updrs_sum as the ponderated sum of all updrs values


Unnamed: 0,visit_id,patient_id,visit_month,GO:0003094,GO:0120020,GO:0014911,GO:0045947,GO:0038020,GO:0070062,GO:0097114,...,GO:0002034,GO:0030169,GO:0046904,GO:0032700,GO:0000015,updrs_sum,updrs_1,updrs_2,updrs_3,updrs_4
814,55_0,55,0,1940704,28883082,1156340,0,334859,687250139,15487900,...,2307020,43209,396955,507483,9988,,10.0,6.0,15.0,
817,55_6,55,6,2041127,26587729,1218080,0,298466,632508976,13376000,...,2020728,42570,402313,567314,7565,,8.0,10.0,34.0,
815,55_12,55,12,1987591,28499891,1229910,65762,404240,688565082,16794500,...,2382398,47865,405112,571323,10010,4.467033,10.0,10.0,41.0,0.0
816,55_36,55,36,2074784,26864664,1090610,74976,435232,755808757,11734700,...,2659245,36168,483287,462881,10085,6.335165,17.0,18.0,51.0,0.0
1067,942_6,942,6,941030,31964041,1455960,82335,188725,528161920,12340200,...,1608430,5439,352676,326540,9284,,8.0,2.0,21.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,64674_84,64674,84,3247722,17023042,1751820,62217,229489,424420898,4029010,...,1418148,7539,324458,155880,0,5.880952,11.0,15.0,45.0,4.0
1023,65043_0,65043,0,972199,36696770,942766,37584,365236,640816559,24488400,...,2192994,13489,380962,617721,11517,,2.0,6.0,16.0,
1024,65043_12,65043,12,980092,33696834,1059030,40588,393845,645377678,21073800,...,2232901,59468,384273,665367,9704,1.846154,4.0,7.0,14.0,0.0
1025,65043_24,65043,24,1014144,37208227,1049620,36150,469670,697632853,19897000,...,2710603,53897,416887,664180,9745,,4.0,8.0,,0.0


In [12]:
# Add the per-visit peptide abundances to df (joined on visit_id)
df = df.merge(pivoted_pept_df, on='visit_id')

# When the median feature columns are in use, move them behind the peptide columns
if med_cols:
    median_feature_cols = ['updrs_1_med', 'updrs_2_med', 'updrs_3_med', 'updrs_4_med']
    df_updrs = df[median_feature_cols]
    df = df.drop(columns=median_feature_cols)
    df[median_feature_cols] = df_updrs

# When the weighted sum column is in use, move it to the last position
if sum_col:
    # The position is taken before pop removes the column, so it is the new end
    last_position = len(df.columns) - 1
    df.insert(last_position, 'updrs_sum', df.pop('updrs_sum'))

# Move the four targets to the very end; target values equal to 0 become -5
target_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
df_updrs = df[target_cols].replace(0, -5)
df = df.drop(columns=target_cols)
df[target_cols] = df_updrs

df

Unnamed: 0,visit_id,patient_id,visit_month,GO:0003094,GO:0120020,GO:0014911,GO:0045947,GO:0038020,GO:0070062,GO:0097114,...,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK,updrs_sum,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,1940704,28883082,1156340,0,334859,687250139,15487900,...,131155.0,165851.0,437305.0,46289.2,14898.4,,10.0,6.0,15.0,
1,55_6,55,6,2041127,26587729,1218080,0,298466,632508976,13376000,...,103512.0,144607.0,457891.0,40047.7,20703.9,,8.0,10.0,34.0,
2,55_12,55,12,1987591,28499891,1229910,65762,404240,688565082,16794500,...,136943.0,181763.0,452253.0,54725.1,21841.1,4.467033,10.0,10.0,41.0,-5.0
3,55_36,55,36,2074784,26864664,1090610,74976,435232,755808757,11734700,...,128593.0,203680.0,498621.0,52792.7,13973.7,6.335165,17.0,18.0,51.0,-5.0
4,942_6,942,6,941030,31964041,1455960,82335,188725,528161920,12340200,...,80001.2,79661.9,573300.0,48005.8,15674.1,,8.0,2.0,21.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,64674_84,64674,84,3247722,17023042,1751820,62217,229489,424420898,4029010,...,49250.4,64076.3,667993.0,38472.5,21949.1,5.880952,11.0,15.0,45.0,4.0
1064,65043_0,65043,0,972199,36696770,942766,37584,365236,640816559,24488400,...,156148.0,157548.0,336625.0,48423.2,10915.8,,2.0,6.0,16.0,
1065,65043_12,65043,12,980092,33696834,1059030,40588,393845,645377678,21073800,...,159828.0,161207.0,330337.0,45368.1,19023.2,1.846154,4.0,7.0,14.0,-5.0
1066,65043_24,65043,24,1014144,37208227,1049620,36150,469670,697632853,19897000,...,148032.0,192857.0,388125.0,65101.0,20790.1,,4.0,8.0,,-5.0


# Model to predict the weighted sum of the UPDRS scores

In [13]:
# Model for predicting the weighted UPDRS sum (updrs_sum)
if sum_col:

    if fill_na == 'nofill':

        # Without imputation, rows with a missing updrs_sum cannot serve as targets
        dfsum_x = df.dropna(subset=['updrs_sum']).reset_index(drop=True)

        # Extract features and target: updrs_sum is the fifth column from the
        # end, the features stop right before it
        X_sum = dfsum_x.iloc[:, visit:-5]
        y_sum = dfsum_x.iloc[:, -5]

    else:

        # Extract features and target
        X_sum = df.iloc[:, visit:-5]
        y_sum = df.iloc[:, -5]

    # Hyper-parameters are kept in one dict so that every CV fold can build its
    # own, independent estimator
    sum_model_params = dict(max_depth=8,
                            learning_rate=0.04,
                            colsample_bytree=0.7,
                            subsample=0.8,
                            n_estimators=250,
                            reg_alpha=0.15,
                            reg_lambda=0.15)
    model_sum = xgb.XGBRegressor(**sum_model_params)

    if point == "CV":

        # define the KFold cross-validation iterator with shuffle and random state
        kf = KFold(n_splits=5, shuffle=True, random_state=13)

        # fit one model per cross-validation fold and evaluate its performance
        rmse_mean = []
        y_preds = []
        y_tests = []
        models_sum = {}
        for i, (train_index, test_index) in enumerate(kf.split(X_sum)):
            # split the data into training and testing sets; KFold yields
            # positions, so the target is indexed positionally as well
            X_train, X_test = X_sum.values[train_index], X_sum.values[test_index]
            y_train, y_test = y_sum.iloc[train_index], y_sum.iloc[test_index]

            # A fresh estimator per fold: storing the same object for every fold
            # would leave only the last fit, whichever fold scored best
            fold_model = xgb.XGBRegressor(**sum_model_params)
            fold_model.fit(X_train, y_train)
            models_sum[i] = fold_model

            # evaluate the model on the testing set using RMSE
            y_pred = fold_model.predict(X_test)
            y_preds.append(y_pred)
            y_tests.append(y_test)
            rmse_mean.append(np.sqrt(mean_squared_error(y_test, y_pred)))

        # keep the model of the fold with the lowest RMSE
        best_index = rmse_mean.index(min(rmse_mean))
        model_sum = models_sum[best_index]
        print("best rmse:",rmse_mean[best_index])
        print(rmse_mean)

        # residual plot of the best fold (true minus predicted values)
        residuals = y_tests[best_index] - y_preds[best_index]
        plt.scatter(y_preds[best_index], residuals)
        plt.xlabel("Predicted Values")
        plt.ylabel("Residuals")
        plt.title("Residual Plot")
        plt.show()

    elif point == 'final':

        # fit the model on all available rows
        model_sum.fit(X_sum, y_sum)


# models for updrs columns

In [14]:
# Model for predicting updrs_1 month 0

if fill_na == 'nofill':
    # Without imputation, rows with a missing updrs_1 cannot serve as targets
    df_x = df.dropna(subset=['updrs_1']).reset_index(drop=True)

    # Extract features and target: the features stop before the last four
    # columns (updrs_1..updrs_4); updrs_1 is the fourth column from the end
    X_updrs_1_0 = df_x.iloc[:, visit:-4]
    y_updrs_1_0 = df_x.iloc[:, -4]
else:
    X_updrs_1_0 = df.iloc[:, visit:-4]
    y_updrs_1_0 = df.iloc[:, -4]

# Hyper-parameters are kept in one dict so that every CV fold can build its own,
# independent estimator
updrs_1_0_params = dict(max_depth=8,
                        learning_rate=0.01,
                        colsample_bytree=0.7,
                        subsample=0.8,
                        n_estimators=250,
                        reg_alpha=0.15,
                        reg_lambda=0.15)
model_updrs_1_0 = xgb.XGBRegressor(**updrs_1_0_params)

if point == 'CV':

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit one model per cross-validation fold and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for i, (train_index, test_index) in enumerate(kf.split(X_updrs_1_0)):
        # split the data into training and testing sets; KFold yields positions,
        # so the target is indexed positionally as well
        X_train, X_test = X_updrs_1_0.values[train_index], X_updrs_1_0.values[test_index]
        y_train, y_test = y_updrs_1_0.iloc[train_index], y_updrs_1_0.iloc[test_index]

        # A fresh estimator per fold: storing the same object for every fold
        # would leave only the last fit, whichever fold scored best
        fold_model = xgb.XGBRegressor(**updrs_1_0_params)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric;
        # negative predictions and negative targets are clipped to 0 first
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        score = smape_plus_one(y_test, y_pred)
        smape_plus_1_scores.append(score)

        models_batch[i] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_1_0 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residual plot of the best fold (true minus predicted values)
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == "final":

    # fit the model on all available rows
    model_updrs_1_0.fit(X_updrs_1_0, y_updrs_1_0)


In [15]:
# model to predict updrs_2 month 0

if fill_na == 'nofill':
    # Without imputation, rows with a missing updrs_2 cannot serve as targets
    df_x = df.dropna(subset=['updrs_2']).reset_index(drop=True)

    # Extract features and target: the features stop before the last four
    # columns (updrs_1..updrs_4); updrs_2 is the third column from the end
    X_updrs_2_0 = df_x.iloc[:, visit:-4]
    y_updrs_2_0 = df_x.iloc[:, -3]
else:
    X_updrs_2_0 = df.iloc[:, visit:-4]
    y_updrs_2_0 = df.iloc[:, -3]

# Hyper-parameters are kept in one dict so that every CV fold can build its own,
# independent estimator
updrs_2_0_params = dict(max_depth=8,
                        learning_rate=0.01,
                        colsample_bytree=0.7,
                        subsample=0.8,
                        n_estimators=250,
                        reg_alpha=0.15,
                        reg_lambda=0.15)
model_updrs_2_0 = xgb.XGBRegressor(**updrs_2_0_params)

if point == 'CV':

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit one model per cross-validation fold and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for i, (train_index, test_index) in enumerate(kf.split(X_updrs_2_0)):
        # split the data into training and testing sets; KFold yields positions,
        # so the target is indexed positionally as well
        X_train, X_test = X_updrs_2_0.values[train_index], X_updrs_2_0.values[test_index]
        y_train, y_test = y_updrs_2_0.iloc[train_index], y_updrs_2_0.iloc[test_index]

        # A fresh estimator per fold: storing the same object for every fold
        # would leave only the last fit, whichever fold scored best
        fold_model = xgb.XGBRegressor(**updrs_2_0_params)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric;
        # negative predictions and negative targets are clipped to 0 first
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        score = smape_plus_one(y_test, y_pred)
        smape_plus_1_scores.append(score)

        models_batch[i] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_2_0 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residual plot of the best fold (true minus predicted values)
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on all available rows
    model_updrs_2_0.fit(X_updrs_2_0, y_updrs_2_0)


In [16]:
# model to predict updrs_3 month 0

if fill_na == 'nofill':
    # Without imputation, rows with a missing updrs_3 cannot serve as targets
    df_x = df.dropna(subset=['updrs_3']).reset_index(drop=True)

    # Extract features and target: the features stop before the last four
    # columns (updrs_1..updrs_4); updrs_3 is the second column from the end
    X_updrs_3_0 = df_x.iloc[:, visit:-4]
    y_updrs_3_0 = df_x.iloc[:, -2]
else:
    X_updrs_3_0 = df.iloc[:, visit:-4]
    y_updrs_3_0 = df.iloc[:, -2]

# Hyper-parameters are kept in one dict so that every CV fold can build its own,
# independent estimator
updrs_3_0_params = dict(max_depth=8,
                        learning_rate=0.01,
                        colsample_bytree=0.7,
                        subsample=0.8,
                        n_estimators=250,
                        reg_alpha=0.15,
                        reg_lambda=0.15)
model_updrs_3_0 = xgb.XGBRegressor(**updrs_3_0_params)

if point == 'CV':

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit one model per cross-validation fold and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for i, (train_index, test_index) in enumerate(kf.split(X_updrs_3_0)):
        # split the data into training and testing sets; KFold yields positions,
        # so the target is indexed positionally as well
        X_train, X_test = X_updrs_3_0.values[train_index], X_updrs_3_0.values[test_index]
        y_train, y_test = y_updrs_3_0.iloc[train_index], y_updrs_3_0.iloc[test_index]

        # A fresh estimator per fold: storing the same object for every fold
        # would leave only the last fit, whichever fold scored best
        fold_model = xgb.XGBRegressor(**updrs_3_0_params)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric;
        # negative predictions and negative targets are clipped to 0 first
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        score = smape_plus_one(y_test, y_pred)
        smape_plus_1_scores.append(score)

        models_batch[i] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_3_0 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residual plot of the best fold (true minus predicted values)
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on all available rows
    model_updrs_3_0.fit(X_updrs_3_0, y_updrs_3_0)


In [17]:
# model to predict updrs_4 month 0

if fill_na == 'nofill':
    # Without imputation, rows with a missing updrs_4 cannot serve as targets
    df_x = df.dropna(subset=['updrs_4']).reset_index(drop=True)

    # Extract features and target: the features stop before the last four
    # columns (updrs_1..updrs_4); updrs_4 is the last column
    X_updrs_4_0 = df_x.iloc[:, visit:-4]
    y_updrs_4_0 = df_x.iloc[:, -1]
else:
    X_updrs_4_0 = df.iloc[:, visit:-4]
    y_updrs_4_0 = df.iloc[:, -1]

# Hyper-parameters are kept in one dict so that every CV fold can build its own,
# independent estimator
updrs_4_0_params = dict(max_depth=8,
                        learning_rate=0.01,
                        colsample_bytree=0.7,
                        subsample=0.8,
                        n_estimators=250,
                        reg_alpha=0.15,
                        reg_lambda=0.15)
model_updrs_4_0 = xgb.XGBRegressor(**updrs_4_0_params)

if point == 'CV':

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit one model per cross-validation fold and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for i, (train_index, test_index) in enumerate(kf.split(X_updrs_4_0)):
        # split the data into training and testing sets; KFold yields positions,
        # so the target is indexed positionally as well
        X_train, X_test = X_updrs_4_0.values[train_index], X_updrs_4_0.values[test_index]
        y_train, y_test = y_updrs_4_0.iloc[train_index], y_updrs_4_0.iloc[test_index]

        # A fresh estimator per fold: storing the same object for every fold
        # would leave only the last fit, whichever fold scored best
        fold_model = xgb.XGBRegressor(**updrs_4_0_params)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric;
        # negative predictions and negative targets are clipped to 0 first
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        score = smape_plus_one(y_test, y_pred)
        smape_plus_1_scores.append(score)

        models_batch[i] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_4_0 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residual plot of the best fold (true minus predicted values)
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on all available rows
    model_updrs_4_0.fit(X_updrs_4_0, y_updrs_4_0)


# Models to predict the UPDRS scores 6 months after the analysed visit

In [18]:
# prepare dataset of rows that have a visit of the same patient 6 months later
updrs_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
followup_cols = [f'{col}_6m' for col in updrs_cols]

# Re-key every visit by the month it is the "+6 months" follow-up of, and rename
# its scores to *_6m. A single join then replaces a per-row search through the
# whole frame, and the *_6m columns exist even when no visit pair is found.
followup_scores = (
    df[['patient_id', 'visit_month'] + updrs_cols]
    .assign(visit_month=lambda visits: visits['visit_month'] - 6)
    .rename(columns=dict(zip(updrs_cols, followup_cols)))
    # only the first follow-up per patient and month is used
    .drop_duplicates(subset=['patient_id', 'visit_month'])
)

# A left join keeps every row of df in its original order
df6 = df.merge(followup_scores, on=['patient_id', 'visit_month'], how='left')

# drop rows where all updrs 6m values are NaN
df6 = df6.dropna(subset=followup_cols, how='all').reset_index(drop=True)

df6

Unnamed: 0,visit_id,patient_id,visit_month,GO:0003094,GO:0120020,GO:0014911,GO:0045947,GO:0038020,GO:0070062,GO:0097114,...,YYWGGQYTWDMAK,updrs_sum,updrs_1,updrs_2,updrs_3,updrs_4,updrs_1_6m,updrs_2_6m,updrs_3_6m,updrs_4_6m
0,55_0,55,0,1940704,28883082,1156340,0,334859,687250139,15487900,...,14898.40,,10.0,6.0,15.0,,8.0,10.0,34.0,
1,55_6,55,6,2041127,26587729,1218080,0,298466,632508976,13376000,...,20703.90,,8.0,10.0,34.0,,10.0,10.0,41.0,-5.0
2,942_6,942,6,941030,31964041,1455960,82335,188725,528161920,12340200,...,15674.10,,8.0,2.0,21.0,,5.0,2.0,25.0,-5.0
3,4161_0,4161,0,2731385,38119406,1240670,67290,154185,621437295,12142400,...,8495.46,,6.0,1.0,-5.0,,1.0,2.0,-5.0,
4,4161_6,4161,6,2002491,33275938,1183140,0,99241,757839541,11279100,...,12285.10,,1.0,2.0,-5.0,,6.0,3.0,11.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,62792_6,62792,6,1713073,25351306,1052070,0,370322,617491182,15757100,...,14742.90,,-5.0,3.0,7.0,,-5.0,2.0,6.0,-5.0
133,64669_0,64669,0,2038298,22404967,891592,63783,269058,518661131,9532530,...,11820.80,,12.0,14.0,27.0,,17.0,20.0,33.0,
134,64669_6,64669,6,2117665,18680402,1105680,58798,311758,497453487,7043480,...,23054.10,,17.0,20.0,33.0,,12.0,9.0,17.0,-5.0
135,64674_0,64674,0,2217558,6748405,1286390,64857,308005,532133772,818945,...,0.00,,5.0,1.0,13.0,,11.0,7.0,20.0,


In [19]:
# Model for predicting updrs_1 month +6
#
# Trains an XGBoost regressor on the feature columns of df6 to predict 'updrs_1_6m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +6 month updrs_1 value are removed
    df6_x = df6.dropna(subset=['updrs_1_6m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +6 month targets)
    X_updrs_1_6 = df6_x.iloc[:, visit:-8]
    y_updrs_1_6 = df6_x['updrs_1_6m']
else:
    X_updrs_1_6 = df6.iloc[:, visit:-8]
    y_updrs_1_6 = df6['updrs_1_6m']

model_updrs_1_6 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_1_6)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_1_6.values[train_index], X_updrs_1_6.values[test_index]
        y_train, y_test = y_updrs_1_6.iloc[train_index], y_updrs_1_6.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_1_6)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_1_6 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_1_6.fit(X_updrs_1_6, y_updrs_1_6)


In [20]:
# Model for predicting updrs_2 month +6
#
# Trains an XGBoost regressor on the feature columns of df6 to predict 'updrs_2_6m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +6 month updrs_2 value are removed
    df6_x = df6.dropna(subset=['updrs_2_6m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +6 month targets)
    X_updrs_2_6 = df6_x.iloc[:, visit:-8]
    y_updrs_2_6 = df6_x['updrs_2_6m']
else:
    X_updrs_2_6 = df6.iloc[:, visit:-8]
    y_updrs_2_6 = df6['updrs_2_6m']

model_updrs_2_6 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_2_6)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_2_6.values[train_index], X_updrs_2_6.values[test_index]
        y_train, y_test = y_updrs_2_6.iloc[train_index], y_updrs_2_6.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_2_6)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_2_6 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_2_6.fit(X_updrs_2_6, y_updrs_2_6)


In [21]:
# Model for predicting updrs_3 month +6
#
# Trains an XGBoost regressor on the feature columns of df6 to predict 'updrs_3_6m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +6 month updrs_3 value are removed
    df6_x = df6.dropna(subset=['updrs_3_6m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +6 month targets)
    X_updrs_3_6 = df6_x.iloc[:, visit:-8]
    y_updrs_3_6 = df6_x['updrs_3_6m']
else:
    X_updrs_3_6 = df6.iloc[:, visit:-8]
    y_updrs_3_6 = df6['updrs_3_6m']

model_updrs_3_6 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_3_6)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_3_6.values[train_index], X_updrs_3_6.values[test_index]
        y_train, y_test = y_updrs_3_6.iloc[train_index], y_updrs_3_6.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_3_6)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_3_6 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_3_6.fit(X_updrs_3_6, y_updrs_3_6)


In [22]:
# Model for predicting updrs_4 month +6
#
# Trains an XGBoost regressor on the feature columns of df6 to predict 'updrs_4_6m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +6 month updrs_4 value are removed
    df6_x = df6.dropna(subset=['updrs_4_6m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +6 month targets)
    X_updrs_4_6 = df6_x.iloc[:, visit:-8]
    y_updrs_4_6 = df6_x['updrs_4_6m']
else:
    X_updrs_4_6 = df6.iloc[:, visit:-8]
    y_updrs_4_6 = df6['updrs_4_6m']

model_updrs_4_6 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_4_6)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_4_6.values[train_index], X_updrs_4_6.values[test_index]
        y_train, y_test = y_updrs_4_6.iloc[train_index], y_updrs_4_6.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_4_6)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_4_6 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

# `elif` (not a second `if`) so the two modes are mutually exclusive, as in the sibling cells
elif point == 'final':

    # fit the model on the whole data set
    model_updrs_4_6.fit(X_updrs_4_6, y_updrs_4_6)


# Models to predict +12 months from analysis

In [23]:
# prepare dataset with rows with a difference of 12 months
#
# For every visit, find the same patient's visit 12 months later and attach that
# later visit's updrs_1..updrs_4 as new trailing columns updrs_1_12m..updrs_4_12m.
followup_key_cols = ['patient_id', 'visit_month']
followup_score_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

# Follow-up scores, re-keyed to the month of the earlier visit they belong to.
followup_12m = (
    df[followup_key_cols + followup_score_cols]
    .dropna(subset=followup_key_cols)                         # merge would pair NaN keys with each other
    .drop_duplicates(subset=followup_key_cols, keep='first')  # first match wins for a repeated (patient, month)
    .rename(columns={col: f'{col}_12m' for col in followup_score_cols})
    .assign(visit_month=lambda visits: visits['visit_month'] - 12)
)

# One left merge replaces a per-row scan of the whole frame (quadratic in the
# number of visits). It keeps the sorted row order and always creates the four
# target columns, so the dropna below cannot fail when no +12 month pair exists.
df12 = (
    df.copy()
    .sort_values(['patient_id','visit_month'])
    .merge(followup_12m, on=followup_key_cols, how='left')
)

# drop rows where all updrs 12m values are NaN
df12 = (
    df12
    .dropna(subset=['updrs_1_12m', 'updrs_2_12m', 'updrs_3_12m', 'updrs_4_12m'], how='all')
    .reset_index(drop=True)
)
df12

Unnamed: 0,visit_id,patient_id,visit_month,GO:0003094,GO:0120020,GO:0014911,GO:0045947,GO:0038020,GO:0070062,GO:0097114,...,YYWGGQYTWDMAK,updrs_sum,updrs_1,updrs_2,updrs_3,updrs_4,updrs_1_12m,updrs_2_12m,updrs_3_12m,updrs_4_12m
0,55_0,55,0,1940704,28883082,1156340,0,334859,687250139,15487900,...,14898.40,,10.0,6.0,15.0,,10.0,10.0,41.0,-5.0
1,942_12,942,12,936951,23424722,809703,82410,204100,547894586,7939170,...,0.00,2.324176,5.0,2.0,25.0,-5.0,2.0,3.0,23.0,
2,1517_24,1517,24,2886877,23213864,1067990,57261,308177,527438313,11358100,...,26098.30,4.807692,19.0,11.0,28.0,3.0,20.0,17.0,31.0,8.0
3,1923_24,1923,24,8380762,36567403,984465,109431,293062,632706778,22364200,...,6290.76,,2.0,-5.0,1.0,,3.0,-5.0,1.0,
4,2660_0,2660,0,2465866,25134946,1125330,55047,238294,475294580,16005300,...,23491.10,,2.0,-5.0,-5.0,,-5.0,-5.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,64669_48,64669,48,3288493,21195402,1441500,0,337938,571594825,11464300,...,35447.70,6.269231,18.0,18.0,49.0,-5.0,15.0,15.0,38.0,-5.0
539,64674_0,64674,0,2217558,6748405,1286390,64857,308005,532133772,818945,...,0.00,,5.0,1.0,13.0,,9.0,9.0,18.0,
540,64674_12,64674,12,3339788,20331075,1580310,0,290296,459431991,4387500,...,28021.00,,9.0,9.0,18.0,,12.0,12.0,26.0,
541,65043_0,65043,0,972199,36696770,942766,37584,365236,640816559,24488400,...,10915.80,,2.0,6.0,16.0,,4.0,7.0,14.0,-5.0


In [24]:
# Model for predicting updrs_1 month +12
#
# Trains an XGBoost regressor on the feature columns of df12 to predict 'updrs_1_12m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +12 month updrs_1 value are removed
    df12_x = df12.dropna(subset=['updrs_1_12m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +12 month targets)
    X_updrs_1_12 = df12_x.iloc[:, visit:-8]
    y_updrs_1_12 = df12_x['updrs_1_12m']
else:
    X_updrs_1_12 = df12.iloc[:, visit:-8]
    y_updrs_1_12 = df12['updrs_1_12m']

model_updrs_1_12 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_1_12)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_1_12.values[train_index], X_updrs_1_12.values[test_index]
        y_train, y_test = y_updrs_1_12.iloc[train_index], y_updrs_1_12.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_1_12)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_1_12 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_1_12.fit(X_updrs_1_12, y_updrs_1_12)


In [25]:
# Model for predicting updrs_2 month +12
#
# Trains an XGBoost regressor on the feature columns of df12 to predict 'updrs_2_12m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +12 month updrs_2 value are removed
    df12_x = df12.dropna(subset=['updrs_2_12m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +12 month targets)
    X_updrs_2_12 = df12_x.iloc[:, visit:-8]
    y_updrs_2_12 = df12_x['updrs_2_12m']
else:
    X_updrs_2_12 = df12.iloc[:, visit:-8]
    y_updrs_2_12 = df12['updrs_2_12m']

model_updrs_2_12 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_2_12)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_2_12.values[train_index], X_updrs_2_12.values[test_index]
        y_train, y_test = y_updrs_2_12.iloc[train_index], y_updrs_2_12.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_2_12)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_2_12 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_2_12.fit(X_updrs_2_12, y_updrs_2_12)


In [26]:
# Model for predicting updrs_3 month +12
#
# Trains an XGBoost regressor on the feature columns of df12 to predict 'updrs_3_12m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +12 month updrs_3 value are removed
    df12_x = df12.dropna(subset=['updrs_3_12m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +12 month targets)
    X_updrs_3_12 = df12_x.iloc[:, visit:-8]
    y_updrs_3_12 = df12_x['updrs_3_12m']
else:
    X_updrs_3_12 = df12.iloc[:, visit:-8]
    y_updrs_3_12 = df12['updrs_3_12m']

model_updrs_3_12 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_3_12)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_3_12.values[train_index], X_updrs_3_12.values[test_index]
        y_train, y_test = y_updrs_3_12.iloc[train_index], y_updrs_3_12.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_3_12)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_3_12 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_3_12.fit(X_updrs_3_12, y_updrs_3_12)


In [27]:
# Model for predicting updrs_4 month +12
#
# Trains an XGBoost regressor on the feature columns of df12 to predict 'updrs_4_12m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +12 month updrs_4 value are removed
    df12_x = df12.dropna(subset=['updrs_4_12m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +12 month targets)
    X_updrs_4_12 = df12_x.iloc[:, visit:-8]
    y_updrs_4_12 = df12_x['updrs_4_12m']
else:
    X_updrs_4_12 = df12.iloc[:, visit:-8]
    y_updrs_4_12 = df12['updrs_4_12m']

model_updrs_4_12 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_4_12)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_4_12.values[train_index], X_updrs_4_12.values[test_index]
        y_train, y_test = y_updrs_4_12.iloc[train_index], y_updrs_4_12.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_4_12)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_4_12 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_4_12.fit(X_updrs_4_12, y_updrs_4_12)


# Models to predict +24 months from analysis

In [28]:
# prepare a dataset with a difference of 24 months
#
# For every visit, find the same patient's visit 24 months later and attach that
# later visit's updrs_1..updrs_4 as new trailing columns updrs_1_24m..updrs_4_24m.
followup_key_cols = ['patient_id', 'visit_month']
followup_score_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

# Follow-up scores, re-keyed to the month of the earlier visit they belong to.
followup_24m = (
    df[followup_key_cols + followup_score_cols]
    .dropna(subset=followup_key_cols)                         # merge would pair NaN keys with each other
    .drop_duplicates(subset=followup_key_cols, keep='first')  # first match wins for a repeated (patient, month)
    .rename(columns={col: f'{col}_24m' for col in followup_score_cols})
    .assign(visit_month=lambda visits: visits['visit_month'] - 24)
)

# One left merge replaces a per-row scan of the whole frame (quadratic in the
# number of visits). It keeps the sorted row order and always creates the four
# target columns, so the dropna below cannot fail when no +24 month pair exists.
df24 = (
    df.copy()
    .sort_values(['patient_id','visit_month'])
    .merge(followup_24m, on=followup_key_cols, how='left')
)

# drop rows where all updrs 24m values are NaN
df24 = (
    df24
    .dropna(subset=['updrs_1_24m', 'updrs_2_24m', 'updrs_3_24m', 'updrs_4_24m'], how='all')
    .reset_index(drop=True)
)
df24

Unnamed: 0,visit_id,patient_id,visit_month,GO:0003094,GO:0120020,GO:0014911,GO:0045947,GO:0038020,GO:0070062,GO:0097114,...,YYWGGQYTWDMAK,updrs_sum,updrs_1,updrs_2,updrs_3,updrs_4,updrs_1_24m,updrs_2_24m,updrs_3_24m,updrs_4_24m
0,55_12,55,12,1987591,28499891,1229910,65762,404240,688565082,16794500,...,21841.1,4.467033,10.0,10.0,41.0,-5.0,17.0,18.0,51.0,-5.0
1,942_24,942,24,937573,24334809,654595,76931,210042,561516992,7161030,...,21847.6,,2.0,3.0,23.0,,2.0,6.0,35.0,-5.0
2,1517_0,1517,0,3376458,15641293,829051,55379,288262,550756275,7283210,...,20695.0,3.926740,11.0,6.0,25.0,5.0,19.0,11.0,28.0,3.0
3,1517_36,1517,36,2108032,18129465,920823,73180,290192,533470407,10160500,...,13430.0,6.393773,20.0,17.0,31.0,8.0,19.0,18.0,39.0,10.0
4,1923_0,1923,0,11207579,36661397,1141030,64302,335329,658350329,21842900,...,0.0,,2.0,-5.0,-5.0,,2.0,-5.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,63889_36,63889,36,4442132,14515295,957408,73325,430556,602192714,7724060,...,0.0,2.298535,8.0,3.0,18.0,1.0,6.0,5.0,28.0,2.0
431,64674_0,64674,0,2217558,6748405,1286390,64857,308005,532133772,818945,...,0.0,,5.0,1.0,13.0,,12.0,12.0,26.0,
432,64674_24,64674,24,2583523,19028850,1540120,66883,296192,471802601,5370280,...,24711.6,,12.0,12.0,26.0,,11.0,17.0,46.0,1.0
433,65043_0,65043,0,972199,36696770,942766,37584,365236,640816559,24488400,...,10915.8,,2.0,6.0,16.0,,4.0,8.0,,-5.0


In [29]:
# Model for predicting updrs_1 month +24
#
# Trains an XGBoost regressor on the feature columns of df24 to predict 'updrs_1_24m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +24 month updrs_1 value are removed
    df24_x = df24.dropna(subset=['updrs_1_24m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +24 month targets)
    X_updrs_1_24 = df24_x.iloc[:, visit:-8]
    y_updrs_1_24 = df24_x['updrs_1_24m']
else:
    X_updrs_1_24 = df24.iloc[:, visit:-8]
    y_updrs_1_24 = df24['updrs_1_24m']

model_updrs_1_24 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_1_24)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_1_24.values[train_index], X_updrs_1_24.values[test_index]
        y_train, y_test = y_updrs_1_24.iloc[train_index], y_updrs_1_24.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_1_24)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_1_24 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_1_24.fit(X_updrs_1_24, y_updrs_1_24)
    

In [30]:
# Model for predicting updrs_2 month +24
#
# Trains an XGBoost regressor on the feature columns of df24 to predict 'updrs_2_24m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +24 month updrs_2 value are removed
    df24_x = df24.dropna(subset=['updrs_2_24m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +24 month targets)
    X_updrs_2_24 = df24_x.iloc[:, visit:-8]
    y_updrs_2_24 = df24_x['updrs_2_24m']
else:
    X_updrs_2_24 = df24.iloc[:, visit:-8]
    y_updrs_2_24 = df24['updrs_2_24m']

model_updrs_2_24 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_2_24)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_2_24.values[train_index], X_updrs_2_24.values[test_index]
        y_train, y_test = y_updrs_2_24.iloc[train_index], y_updrs_2_24.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_2_24)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_2_24 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_2_24.fit(X_updrs_2_24, y_updrs_2_24)


In [31]:
# Model for predicting updrs_3 month +24
#
# Trains an XGBoost regressor on the feature columns of df24 to predict 'updrs_3_24m'.
#   point == 'CV'    : 5-fold cross-validation scored with smape_plus_one; the model of
#                      the best-scoring fold is kept and its residuals are plotted.
#   point == 'final' : a single fit on all rows.

if fill_na == 'nofill':
    # nothing was imputed, so rows without a +24 month updrs_3 value are removed
    df24_x = df24.dropna(subset=['updrs_3_24m']).reset_index(drop=True)

    # Extract features and target: features run from column position `visit` up to
    # (not including) the last eight columns (updrs_1..4 and the four +24 month targets)
    X_updrs_3_24 = df24_x.iloc[:, visit:-8]
    y_updrs_3_24 = df24_x['updrs_3_24m']
else:
    X_updrs_3_24 = df24.iloc[:, visit:-8]
    y_updrs_3_24 = df24['updrs_3_24m']

model_updrs_3_24 = xgb.XGBRegressor(max_depth=8,
                             learning_rate=0.01,
                             colsample_bytree=0.7,
                             subsample= 0.8,
                             n_estimators=250,
                             reg_alpha=0.15,
                             reg_lambda=0.15)

if point == 'CV':

    # Local import: clone() hands every fold its own unfitted copy of the estimator.
    from sklearn.base import clone

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit a fresh model on each cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(X_updrs_3_24)):
        # split the data into training and testing sets (KFold yields row positions)
        X_train, X_test = X_updrs_3_24.values[train_index], X_updrs_3_24.values[test_index]
        y_train, y_test = y_updrs_3_24.iloc[train_index], y_updrs_3_24.iloc[test_index]

        # Re-fitting one shared estimator would overwrite the previous folds, so
        # every entry of models_batch would point at the last fit. A clone keeps
        # the hyper-parameters but none of the fitted state.
        fold_model = clone(model_updrs_3_24)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        # (negative predictions/targets are floored at 0 before scoring)
        y_pred = np.clip(fold_model.predict(X_test), 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        smape_plus_1_scores.append(smape_plus_one(y_test, y_pred))

        models_batch[fold_idx] = fold_model

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_3_24 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residuals of the best fold: true minus predicted values
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole data set
    model_updrs_3_24.fit(X_updrs_3_24, y_updrs_3_24)


In [32]:
# Model for predicting updrs_4 month +24

if fill_na == 'nofill':
    # Without imputation, rows whose target is missing are dropped; the index
    # is reset so that labels and positions coincide.
    df24_x = df24.dropna(subset=['updrs_4_24m']).reset_index(drop=True)

    # Extract features and target
    X_updrs_4_24 = df24_x.iloc[:, visit:-8]
    y_updrs_4_24 = df24_x.iloc[:, -1]
else:
    X_updrs_4_24 = df24.iloc[:, visit:-8]
    y_updrs_4_24 = df24.iloc[:, -1]

# Hyper-parameters live in one place so every CV fold can build its own,
# independent estimator from exactly the same settings.
params_updrs_4_24 = dict(max_depth=8,
                         learning_rate=0.01,
                         colsample_bytree=0.7,
                         subsample=0.8,
                         n_estimators=250,
                         reg_alpha=0.15,
                         reg_lambda=0.15)

model_updrs_4_24 = xgb.XGBRegressor(**params_updrs_4_24)

if point == 'CV':

    # define the KFold cross-validation iterator with shuffle and random state
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    # fit one model per cross-validation subset and evaluate its performance
    smape_plus_1_scores = []
    models_batch = {}
    y_preds = []
    y_tests = []
    i=0
    for train_index, test_index in kf.split(X_updrs_4_24):
        # split the data into training and testing sets
        X_train, X_test = X_updrs_4_24.values[train_index], X_updrs_4_24.values[test_index]
        # KFold yields positions, not index labels, so the target is indexed
        # positionally as well.
        y_train, y_test = y_updrs_4_24.iloc[train_index], y_updrs_4_24.iloc[test_index]

        # A fresh estimator per fold: refitting one shared object would make
        # every entry of models_batch point at the model of the last fold.
        fold_model = xgb.XGBRegressor(**params_updrs_4_24)
        fold_model.fit(X_train, y_train)

        # evaluate the model on the testing set using smape+1 metric
        y_pred = fold_model.predict(X_test)
        y_pred = np.clip(y_pred, 0, None)
        y_test = np.clip(y_test, 0, None)
        y_preds.append(y_pred)
        y_tests.append(y_test)
        score = smape_plus_one(y_test, y_pred)
        smape_plus_1_scores.append(score)

        models_batch[i] = fold_model
        i += 1

    # keep the model of the fold with the lowest smape+1
    best_index = smape_plus_1_scores.index(min(smape_plus_1_scores))
    model_updrs_4_24 = models_batch[best_index]
    print("best smape+1:",smape_plus_1_scores[best_index])
    print(smape_plus_1_scores)

    # residual plot for the best fold (true value minus prediction)
    residuals = y_tests[best_index] - y_preds[best_index]
    plt.scatter(y_preds[best_index], residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot")
    plt.show()

elif point == 'final':

    # fit the model on the whole training set
    model_updrs_4_24.fit(X_updrs_4_24, y_updrs_4_24)
    

# load test files

In [33]:
# Read the four example test tables in one pass; the targets on the left are
# bound in the same order as the paths listed below.
test_peptides, test_proteins, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv',
        '/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv',
    )
)

In [34]:
# get the GO terms for the proteins
def get_go_terms(proteins_df, protein_go_path='/kaggle/input/models/protein_go.txt'):
    """Return, for every row of ``proteins_df``, the GO ids of its protein.

    Parameters
    ----------
    proteins_df : pandas.DataFrame
        Must contain a ``'UniProt'`` column.
    protein_go_path : str, optional
        Whitespace-separated text file with one ``<protein id> <GO id>`` pair
        per line. Lines that do not consist of exactly two fields are ignored.

    Returns
    -------
    list of list of str
        One list of GO ids per row, in row order; an empty list for proteins
        that do not appear in the mapping file.

    Notes
    -----
    Only the protein -> GO id mapping is needed to build the result, so the
    GO ontology file (term names) is not read here.
    """
    # protein id -> list of GO ids, in file order
    protein_go = {}
    with open(protein_go_path) as f:
        for line in f:
            fields = line.split()
            if len(fields) == 2:
                protein_id, go_id = fields
                protein_go.setdefault(protein_id, []).append(go_id)

    # Look each protein up directly instead of materialising every row.
    return [protein_go.get(uni_prot, []) for uni_prot in proteins_df['UniProt']]

In [35]:
# group the proteins by the go terms
def group_proteins_df(proteins_df, go_terms_all):
    """Aggregate protein NPX values per visit and GO term.

    Parameters
    ----------
    proteins_df : pandas.DataFrame
        Must contain ``'visit_id'`` and ``'NPX'`` columns. It is modified in
        place: a ``'go_terms'`` column and one float column per GO term are
        added.
    go_terms_all : list of list of str
        GO ids for each row of ``proteins_df`` (same length and order).

    Returns
    -------
    pandas.DataFrame
        One row per ``visit_id`` with one column per GO term holding the sum
        of the NPX values of the rows annotated with that term. The same frame
        is also written to ``go_proteins_df_grouped.csv``.
    """
    proteins_df['go_terms'] = go_terms_all

    # Unique GO terms in first-seen order; a set would give a column order
    # that changes from one interpreter run to the next.
    go_columns = list(dict.fromkeys(
        go_term
        for go_terms in proteins_df['go_terms']
        for go_term in go_terms
    ))

    # Create the columns as float so that assigning NPX values below does not
    # have to upcast an integer column.
    for go_term in go_columns:
        proteins_df[go_term] = 0.0

    # Copy the row's NPX into every GO-term column the row is annotated with
    for index, npx, go_terms in zip(proteins_df.index, proteins_df['NPX'], proteins_df['go_terms']):
        for go_term in go_terms:
            proteins_df.at[index, go_term] = npx

    # Select the GO-term columns by name rather than by position, so the
    # result does not depend on how many other columns the input has.
    grouped_df = (
        proteins_df[['visit_id'] + go_columns]
        .groupby('visit_id')
        .sum()
        .reset_index(drop=False)
    )
    grouped_df.to_csv('go_proteins_df_grouped.csv', index=False)

    return grouped_df

In [36]:
def prepare_test_df(test_proteins,test_peptides,df):
    """Build a test feature frame laid out like the leading columns of ``df``.

    Parameters
    ----------
    test_proteins : pandas.DataFrame
        Protein measurements; passed to ``get_go_terms`` and
        ``group_proteins_df`` (the latter modifies it in place).
    test_peptides : pandas.DataFrame
        Must contain ``'visit_id'``, ``'Peptide'`` and ``'PeptideAbundance'``.
    df : pandas.DataFrame
        Reference frame (presumably the training frame - confirm against the
        caller). Its columns, minus a number of trailing columns that depends
        on the ``sum_col`` / ``med_cols`` settings, define the output layout.

    Returns
    -------
    pandas.DataFrame
        One row per visit present in both the protein and the peptide data.
        Reference columns missing from the test data are filled with 0, and
        ``'patient_id'`` / ``'visit_month'`` are derived from ``'visit_id'``.
    """
    # pivoted peptides, with visit_id as a regular column for the merge below
    pivoted_pept_df = pd.pivot_table(
        test_peptides,
        values='PeptideAbundance',
        index='visit_id',
        columns='Peptide',
        aggfunc='sum',
        fill_value=0,
    ).reset_index()

    # Prepare the per-visit GO-term aggregation of the proteins
    go_terms = get_go_terms(test_proteins)
    df2 = group_proteins_df(test_proteins, go_terms)

    df2 = df2.merge(pivoted_pept_df, on='visit_id')

    # Trailing columns of df to leave out: 4 always, plus 1 when sum_col is
    # set and 4 more when med_cols is set (5 / 8 / 9 / 4 for the four
    # combinations). Presumably these are targets and derived columns.
    n_trailing = 4 + (1 if sum_col else 0) + (4 if med_cols else 0)
    dfcols = df.iloc[:, :-n_trailing].columns

    # Columns present in the test data are copied; missing ones are filled
    # with 0 for every row.
    new_df = df2.reindex(columns=dfcols, fill_value=0)

    print("Test columns are the same as train columns?",set(dfcols) == set(new_df.columns),len(df.iloc[:,3:-5].columns),len(new_df.iloc[:, 3:].columns))

    # now we have a test dataframe ready to predict drugs usage and updrs

    # split the visit_id column into patient_id and visit_month columns
    new_df[['patient_id', 'visit_month']] = new_df['visit_id'].str.split('_', expand=True)
    new_df['visit_month'] = new_df['visit_month'].astype(int)

    return new_df

In [37]:
def pred_with_analysis(test_df):
    """Predict all UPDRS scores for visits that have protein/peptide data.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Feature frame containing ``'visit_id'`` and ``'visit_month'``; the
        columns from position ``visit`` onwards are fed to the models. When
        ``med_cols`` / ``sum_col`` are set, the median columns and
        ``'updrs_sum'`` are added to ``test_df`` in place before predicting.

    Returns
    -------
    pandas.DataFrame
        ``'visit_id'``, ``'visit_month'`` and one ``updrs_<n>_plus_<months>``
        column per score (1-4) and horizon (0, 6, 12, 24). Predictions are
        clipped at 0 and rounded down.
    """
    # Independent copy: the prediction columns must not be written into a
    # slice of test_df.
    data = test_df[['visit_id','visit_month']].copy()

    # Add median columns
    if med_cols:
        test_df[['updrs_1_med','updrs_2_med','updrs_3_med','updrs_4_med']] = test_df.apply(get_updrs, axis=1)

    # Add the predicted updrs_sum as a new feature column of the test dataframe
    if sum_col:
        test_df['updrs_sum'] = model_sum.predict(test_df.iloc[:, visit:])

    # test_df is not modified past this point, so all models share one slice
    features = test_df.iloc[:, visit:]

    # horizon in months -> models for updrs_1 .. updrs_4
    models_by_horizon = {
        0: (model_updrs_1_0, model_updrs_2_0, model_updrs_3_0, model_updrs_4_0),
        6: (model_updrs_1_6, model_updrs_2_6, model_updrs_3_6, model_updrs_4_6),
        12: (model_updrs_1_12, model_updrs_2_12, model_updrs_3_12, model_updrs_4_12),
        24: (model_updrs_1_24, model_updrs_2_24, model_updrs_3_24, model_updrs_4_24),
    }

    for horizon, horizon_models in models_by_horizon.items():
        for updrs_no, model in enumerate(horizon_models, start=1):
            column = 'updrs_' + str(updrs_no) + '_plus_' + str(horizon)
            data[column] = np.floor(np.clip(model.predict(features), 0, None))

    return data

In [38]:
def pred_data_without_analysis(test, data_with_analysis):
    """Fill UPDRS scores for visits that have no protein/peptide data.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``'visit_id'`` and ``'visit_month'``.
    data_with_analysis : pandas.DataFrame
        Visits already predicted by the models; their ``'visit_id'`` values
        are excluded here.

    Returns
    -------
    pandas.DataFrame
        One row per remaining visit with ``'visit_id'``, the visit's own
        ``'visit_month'`` and one ``updrs_<n>_plus_<months>`` column per score
        (1-4) and horizon (0, 6, 12, 24). The values come from ``get_updrs``
        evaluated at ``visit_month + horizon``. The frame is empty, but has
        all columns, when every visit is already covered.
    """
    data = test[['visit_id', 'visit_month']].drop_duplicates().reset_index(drop=True)
    # Independent copy so the new columns are not written into a filtered view
    data = data[~data['visit_id'].isin(data_with_analysis['visit_id'])].copy()

    for horizon in (0, 6, 12, 24):
        target_cols = ['updrs_' + str(n) + '_plus_' + str(horizon) for n in (1, 2, 3, 4)]

        if data.empty:
            # A row-wise apply on an empty frame does not yield the four
            # expected columns, so create them directly.
            for col in target_cols:
                data[col] = np.nan
            continue

        # Look the values up at the shifted month without altering the
        # visit_month that is returned to the caller.
        shifted = data.assign(visit_month=data['visit_month'] + horizon)
        data[target_cols] = shifted.apply(get_updrs, axis=1)

    return data

In [39]:
def convert_for_submition(data):
    """Reshape wide per-visit predictions into the long submission format.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'visit_id'`` (strings) and the sixteen
        ``updrs_<n>_plus_<months>`` columns (n in 1-4, months in 0/6/12/24).

    Returns
    -------
    pandas.DataFrame
        Columns ``'prediction_id'`` and ``'rating'``, sixteen rows per input
        row, where ``prediction_id`` is
        ``<visit_id>_updrs_<n>_plus_<months>_months``. Rows are ordered by
        input row, then score, then horizon.
    """
    rating_columns = [
        updrs + '_' + str(month)
        for updrs in ['updrs_1_plus','updrs_2_plus','updrs_3_plus','updrs_4_plus']
        for month in [0,6,12,24]
    ]

    # Collect all records first and build the frame once; appending to a
    # DataFrame row by row gets slower with every row added.
    records = []
    wide_rows = data[['visit_id'] + rating_columns].itertuples(index=False, name=None)
    for visit_id, *ratings in wide_rows:
        for column, rating in zip(rating_columns, ratings):
            records.append((visit_id + '_' + column + '_months', rating))

    return pd.DataFrame(records, columns=['prediction_id','rating'])

In [40]:
# check

def get_predictions(test_proteins,test_peptides,test,sample_submission):
    """Run the full prediction pipeline and fill in ``sample_submission``.

    Visits with protein/peptide data are predicted with the models; the
    remaining visits of ``test`` are filled via ``pred_data_without_analysis``.
    The ratings are then written into ``sample_submission`` by
    ``prediction_id``.

    Parameters
    ----------
    test_proteins, test_peptides, test : pandas.DataFrame
        The test tables of the current batch.
    sample_submission : pandas.DataFrame
        Must contain ``'prediction_id'``. Its ``'rating'`` column is
        overwritten in place (and created if absent).

    Returns
    -------
    pandas.DataFrame
        The same ``sample_submission`` object. Ids for which no prediction
        exists keep their previous rating and are reported on stdout.
    """
    data_with_analysis = prepare_test_df(test_proteins,test_peptides,df)
    data_with_analysis = pred_with_analysis(data_with_analysis)
    data_without_analysis = pred_data_without_analysis(test,data_with_analysis)
    data_together = pd.concat([data_with_analysis,data_without_analysis], axis=0)
    data = convert_for_submition(data_together)

    # One lookup table instead of a full scan per submission row; when an id
    # occurs more than once, the first rating wins.
    rating_by_id = data.drop_duplicates(subset='prediction_id').set_index('prediction_id')['rating']
    requested_ids = sample_submission['prediction_id']
    has_prediction = requested_ids.isin(rating_by_id.index)

    ratings = requested_ids.map(rating_by_id)
    if 'rating' in sample_submission.columns:
        # Keep the existing rating where there is nothing to write, so a
        # missing id is reported below instead of aborting the run.
        ratings = ratings.where(has_prediction, sample_submission['rating'])
    sample_submission['rating'] = ratings.astype(float)

    if set(data['prediction_id']) == set(sample_submission['prediction_id']):
        print("Everything OK")
    else:
        print(set(sample_submission['prediction_id'])-set(data['prediction_id']))

    # return sample_submission
    return sample_submission

# Smoke test: run the whole pipeline once on the example test files read in
# above; the returned submission frame is displayed as the cell output.
get_predictions(test_proteins,test_peptides,test,sample_submission)

Test columns are the same as train columns? True 3085 3085
Everything OK


Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,4.5,0
1,3342_0_updrs_1_plus_6_months,6.0,0
2,3342_0_updrs_1_plus_12_months,6.0,0
3,3342_0_updrs_1_plus_24_months,6.0,0
4,3342_0_updrs_2_plus_0_months,4.0,0
...,...,...,...
59,50423_6_updrs_3_plus_24_months,22.0,6
60,50423_6_updrs_4_plus_0_months,0.0,6
61,50423_6_updrs_4_plus_6_months,0.0,6
62,50423_6_updrs_4_plus_12_months,0.0,6


In [41]:
# Set up the competition API: `env` receives the predictions and `iter_test`
# is the iterable consumed by the submission loop in the next cell.
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test() 

In [42]:
# Each item of iter_test is one batch of test tables plus the submission
# template to fill in for that batch.
for test, test_peptides, test_proteins, sample_submission in iter_test:

    result = get_predictions(
        test_proteins,
        test_peptides,
        test,
        sample_submission,
    )

    # register the predictions for this batch
    env.predict(result)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Test columns are the same as train columns? True 3085 3085
Everything OK
Test columns are the same as train columns? True 3085 3085
Everything OK
