In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import xgboost as xg

visit_id | ---UniProt (~218)--- | updrs_x

In [20]:
def read_data():
    train_clinical_data = pd.read_csv('data/train_clinical_data.csv')
    train_proteins = pd.read_csv('data/train_proteins.csv')
    return train_clinical_data, train_proteins

In [21]:
def data_transform(df):
    # Applying log transformation
    df.iloc[:, 1:-2] = df.iloc[:, 1:-2].apply(np.log)
    df = df.replace(float("-inf"), 0)
    

    #normalizing to (0,1)
    scaler = MinMaxScaler()
    scaler.fit(df.iloc[:, 1:-2])
    transformed_values = scaler.transform(df.iloc[:, 1:-2])
    df.iloc[:, 1:-2] = transformed_values

    #increasing the spread
    df.iloc[:, 1:-2] = df.iloc[:, 1:-2].apply(lambda x : np.power(x, 4)) 
    df = df.round(4)
     
    return df

In [61]:
def data_prep(train_clinical_data, train_proteins, label):
    
    df = train_clinical_data.merge(train_proteins, how='inner', on='visit_id')
    df.head()
    df_pivot = df.pivot(index=['visit_id', 'visit_month_x'], values='NPX', columns=['UniProt'])
    df_pivot.fillna(0, inplace=True)
    df_updrs = df[['visit_id', 'visit_month_x',label]].groupby(by='visit_id').mean().reset_index()
    df_pivot = df_pivot.merge(df_updrs, how='left', on='visit_id')

    df_pivot = data_transform(df_pivot)
    if label in ['updrs_3', 'updrs_4']:
        # print(f"{label} : {df_pivot[label].isna().sum()}")
        df_pivot.dropna(inplace = True)
        # print(f"{label} : {df_pivot[label].isna().sum()}")
    X_train, X_test, y_train, y_test = train_test_split(df_pivot[df_pivot.columns.difference(['visit_id', label])], 
                                                    df_pivot[[label]], test_size=0.1, shuffle=True, random_state=42)
    return X_train, X_test, y_train, y_test

In [23]:
def smap_error(y_true, y_pred):
    """
    Calculate symmetric mean absolute percentage error from given ground-truth and predictions
    Parameters
    ----------
    y_true: array-like of shape (n_samples)
        Array of ground-truth values
        
    y_pred: array-like of shape (n_samples)
        Array of prediction values
        
    Returns
    -------
    smape: float
        Symmetric mean absolute percentage error
    """

    smape = 100 / len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

    return smape

In [24]:
def train(model, X_train, y_train):
    model.fit(X_train, y_train.squeeze())
    return model

In [25]:
def predict(model, X_test):
    y_pred = model.predict(X_test)
    return y_pred

In [151]:
LABELS = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

def pipeline(train_clinical_data, train_proteins):
    
    model_params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
    }
    res_dict = {}
    for label in LABELS:
        X_train, X_test, y_train, y_test = data_prep(train_clinical_data, train_proteins, label)
        
        X_train, visit_month_train = X_train.iloc[:, :-1], np.array(X_train.iloc[:, -1])
        X_test, visit_month_test = X_test.iloc[:, :-1], np.array(X_test.iloc[:, -1])

        # cov_matrix = np.cov(X_train.T)
        # eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

        # # Sort eigenvalues in descending order
        # idx = eigenvalues.argsort()[::-1]
        # eigenvalues = eigenvalues[idx]

        # # Plot scree plot
        # print(eigenvalues)
        # plt.plot(eigenvalues)
        # plt.xlabel('Component Number')
        # plt.ylabel('Eigenvalue')
        # plt.title('Scree Plot')
        # plt.show()
        # return

        pca = PCA(n_components=3, svd_solver='full')
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

        X_train = np.concatenate((X_train, visit_month_train.reshape(-1, 1)), axis=1)
        X_test = np.concatenate((X_test, visit_month_test.reshape(-1, 1)), axis=1)
        
        # print(X_train.shape, X_test.shape, visit_month_test.shape, visit_month_train.shape)
        model = xg.XGBRegressor(**model_params)
        model = train(model, X_train, y_train)
        y_pred = predict(model, X_test)
        res_dict[label] = {
            "model" : model,
            "X_train" : X_train,
            "X_test" : X_test, 
            "y_train" : y_train, 
            "y_test" : y_test,
            "y_pred" : y_pred.reshape(-1, 1)
        }
    return res_dict

In [152]:
train_clinical_data, train_proteins = read_data()
results = pipeline(train_clinical_data, train_proteins)

Parameters: { "loss", "min_samples_split" } are not used.

Parameters: { "loss", "min_samples_split" } are not used.

Parameters: { "loss", "min_samples_split" } are not used.

Parameters: { "loss", "min_samples_split" } are not used.



In [153]:
mean_squared_error(results['updrs_1']['y_pred'], results['updrs_1']['y_test'])

18.547496881049963

In [154]:
y_pred = np.concatenate([results[i]['y_pred'] for i in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']])
y_test = np.concatenate([results[i]['y_test'] for i in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']])

In [155]:
smap_error(y_pred, y_test)

100.15719967488492

In [156]:
results['updrs_1']['X_test']

array([[-3.74038932e-01,  4.59683941e-03,  1.14186414e+00,
         4.80000000e+01],
       [-5.64668322e-01,  3.72745570e-01, -4.06226924e-01,
         1.20000000e+01],
       [ 5.05674204e-01, -3.04683714e-01, -5.54444391e-01,
         0.00000000e+00],
       [ 1.17196543e+00, -6.18189218e-01, -3.25753893e-01,
         0.00000000e+00],
       [-1.02062399e-01, -2.45280091e-01,  9.62547842e-02,
         3.60000000e+01],
       [ 8.38564497e-01, -7.67702892e-01,  9.50396429e-01,
         3.60000000e+01],
       [-9.07341694e-02,  2.99147694e-02, -2.04862267e-01,
         0.00000000e+00],
       [ 4.59598930e-01,  1.40150154e+00,  1.67558234e-01,
         4.80000000e+01],
       [ 2.04237115e+00,  5.37281922e-03,  5.80564965e-01,
         2.40000000e+01],
       [-6.29259382e-01,  3.30105548e-01,  4.29445706e-01,
         1.20000000e+01],
       [-1.29873246e+00,  2.70959880e-01, -3.20194368e-01,
         1.20000000e+01],
       [-9.64356192e-01,  1.72615782e-01, -1.07371560e-01,
      

## Testing


In [16]:
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')
sample_submission = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv')
test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')

In [17]:
test.head()

Unnamed: 0,visit_id,visit_month,patient_id,updrs_test,row_id,group_key
0,3342_0,0,3342,updrs_1,3342_0_updrs_1,0
1,3342_0,0,3342,updrs_2,3342_0_updrs_2,0
2,3342_0,0,3342,updrs_3,3342_0_updrs_3,0
3,3342_0,0,3342,updrs_4,3342_0_updrs_4,0
4,50423_0,0,50423,updrs_1,50423_0_updrs_1,0


In [18]:
sample_submission.head()

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0


In [19]:
test_proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,group_key
0,50423_0,0,50423,O00391,33127.9,0
1,50423_0,0,50423,O00533,490742.0,0
2,50423_0,0,50423,O00584,43615.3,0
3,50423_0,0,50423,O14773,16486.6,0
4,50423_0,0,50423,O14791,2882.42,0


In [25]:
train_proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [None]:
def preds_for_inference()

In [10]:
import amp_pd_peptide
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


NameError: name 'sample_prediction' is not defined

In [16]:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    ressss = pipeline(test, test_proteins)

You must call `predict()` successfully before you can continue with `iter_test()`


TypeError: cannot unpack non-iterable NoneType object

In [22]:
tup= next(iter_test)

You must call `predict()` successfully before you can continue with `iter_test()`


In [24]:
print(tup)

None
