In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import mean_squared_error

In [2]:
def smap_error(y_true, y_pred):
    """
    Calculate symmetric mean absolute percentage error from given ground-truth and predictions
    Parameters
    ----------
    y_true: array-like of shape (n_samples)
        Array of ground-truth values
        
    y_pred: array-like of shape (n_samples)
        Array of prediction values
        
    Returns
    -------
    smape: float
        Symmetric mean absolute percentage error
    """

    smape = 100 / len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

    return smape


In [3]:
train_clinical_data = pd.read_csv('./train_clinical_data.csv')[
    ['visit_id', 'patient_id', 'visit_month', 'upd23b_clinical_state_on_medication', 'updrs_1', 'updrs_2','updrs_3', 'updrs_4']]

train_proteins = pd.read_csv('./train_proteins.csv')

In [4]:
df = train_clinical_data.merge(train_proteins, how='inner', on='visit_id')

In [5]:
df.columns

Index(['visit_id', 'patient_id_x', 'visit_month_x',
       'upd23b_clinical_state_on_medication', 'updrs_1', 'updrs_2', 'updrs_3',
       'updrs_4', 'visit_month_y', 'patient_id_y', 'UniProt', 'NPX'],
      dtype='object')

In [6]:
df

Unnamed: 0,visit_id,patient_id_x,visit_month_x,upd23b_clinical_state_on_medication,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_y,patient_id_y,UniProt,NPX
0,55_0,55,0,,10.0,6.0,15.0,,0,55,O00391,11254.3
1,55_0,55,0,,10.0,6.0,15.0,,0,55,O00533,732430.0
2,55_0,55,0,,10.0,6.0,15.0,,0,55,O00584,39585.8
3,55_0,55,0,,10.0,6.0,15.0,,0,55,O14498,41526.9
4,55_0,55,0,,10.0,6.0,15.0,,0,55,O14773,31238.0
...,...,...,...,...,...,...,...,...,...,...,...,...
223263,65043_48,65043,48,Off,7.0,6.0,13.0,0.0,48,65043,Q9UBX5,48796.4
223264,65043_48,65043,48,Off,7.0,6.0,13.0,0.0,48,65043,Q9UHG2,320821.0
223265,65043_48,65043,48,Off,7.0,6.0,13.0,0.0,48,65043,Q9UKV8,39046.7
223266,65043_48,65043,48,Off,7.0,6.0,13.0,0.0,48,65043,Q9Y646,20198.8


In [7]:
df_pivot = df.pivot(index='visit_id', values='NPX', columns=['UniProt'])
df_pivot.dropna(axis=1, inplace=True)
df_pivot = df_pivot.apply(lambda x: x/2000000)
df_updrs = df[['visit_id', 'updrs_1']].groupby(by='visit_id').mean().reset_index()
df_pivot = df_pivot.merge(df_updrs, how='left', on='visit_id')
df_pivot

Unnamed: 0,visit_id,O15240,P01009,P01011,P01023,P01024,P01042,P01834,P01876,P02647,...,P05090,P06396,P07602,P10909,P23142,P41222,Q12805,Q92520,Q9UHG2,updrs_1
0,10053_0,0.041501,6.41265,0.542385,0.502615,1.334870,0.270954,1.220400,0.937135,2.016325,...,2.043530,0.676360,0.365927,2.374890,0.411392,21.25585,0.525990,0.161248,0.088992,3.0
1,10053_12,0.098558,5.93560,0.567785,0.615495,1.680395,0.292185,1.015250,0.900035,2.555880,...,1.745710,0.695475,0.309386,2.111550,0.330529,17.05025,0.369123,0.248488,0.085866,4.0
2,10053_18,0.063253,7.15995,0.722215,0.666970,1.864430,0.485592,1.053370,0.754150,2.856590,...,2.548355,1.116200,0.439563,2.864005,0.485710,19.16230,0.421963,0.337276,0.122594,2.0
3,10138_12,0.078157,5.51245,0.862720,0.651295,2.138285,0.813535,2.035920,2.728050,11.412900,...,2.616540,1.267825,0.602195,3.764270,0.623115,17.07445,0.408174,0.300934,0.114616,3.0
4,10138_24,0.075584,7.07260,0.772220,0.610530,2.285155,0.650840,1.210760,1.516440,5.568300,...,1.926260,1.221685,0.229582,4.352380,0.637475,22.99140,0.299441,0.284986,0.088361,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,8699_24,0.038747,6.92095,0.868305,0.656155,2.888095,0.710965,1.763795,2.243155,12.360750,...,2.370965,0.945955,0.597460,3.457110,0.607180,25.26955,0.265487,0.263418,0.060581,11.0
1064,942_12,0.043424,5.05055,0.692590,0.541835,1.499435,0.388618,0.952285,0.434986,6.961700,...,2.644305,0.759390,0.509790,3.180950,0.577385,15.53545,0.332737,0.284921,0.117047,5.0
1065,942_24,0.057386,5.60235,0.940705,0.527140,1.953250,0.472583,1.005130,0.432258,7.876700,...,2.676395,0.737555,0.466618,2.710815,0.494719,14.62185,0.321875,0.272462,0.128327,2.0
1066,942_48,0.041121,5.99400,0.832830,0.546335,1.808685,0.490994,1.168680,0.505435,10.820950,...,3.052910,0.803555,0.499794,3.116665,0.333377,13.03355,0.246329,0.231614,0.116151,2.0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df_pivot[df_pivot.columns.difference(['visit_id', 'updrs_1'])], 
                                                    df_pivot[['updrs_1']], test_size=0.1, shuffle=True, random_state=42)

In [9]:
svr = svm.SVR()
svr.fit(X_train, y_train.squeeze())

In [10]:
y_pred = svr.predict(X_test)

In [122]:
smap_error(y_pred=np.array(y_pred), y_true=np.array(y_test))

7611.740560508906

In [11]:
mean_squared_error(y_test, y_pred)

19.67032960068015