In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import xgboost as xg

visit_id | ---UniProt (~218)--- | updrs_x

In [27]:
def read_data():
    train_clinical_data = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
    train_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
    return train_clinical_data, train_proteins

In [28]:
def data_transform(df):
    # Applying log transformation
    df.iloc[:, 1:-2] = df.iloc[:, 1:-2].apply(np.log)
    df = df.replace(float("-inf"), 0)
    

    #normalizing to (0,1)
    scaler = MinMaxScaler()
    scaler.fit(df.iloc[:, 1:-2])
    transformed_values = scaler.transform(df.iloc[:, 1:-2])
    df.iloc[:, 1:-2] = transformed_values

    #increasing the spread
    df.iloc[:, 1:-2] = df.iloc[:, 1:-2].apply(lambda x : np.power(x, 4)) 
    df = df.round(4)
     
    return df

In [29]:
def data_prep(train_clinical_data, train_proteins, label):
    
    df = train_clinical_data.merge(train_proteins, how='inner', on='visit_id')
    df.head()
    df_pivot = df.pivot(index=['visit_id', 'visit_month_x'], values='NPX', columns=['UniProt'])
    df_pivot.fillna(0, inplace=True)
    df_updrs = df[['visit_id', 'visit_month_x',label]].groupby(by='visit_id').mean().reset_index()
    df_pivot = df_pivot.merge(df_updrs, how='left', on='visit_id')

    df_pivot = data_transform(df_pivot)
    if label in ['updrs_3', 'updrs_4']:
        print(f"{label} : {df_pivot[label].isna().sum()}")
        df_pivot.dropna(inplace = True)
        print(f"{label} : {df_pivot[label].isna().sum()}")
    X_train, X_test, y_train, y_test = train_test_split(df_pivot[df_pivot.columns.difference(['visit_id', label])], 
                                                    df_pivot[[label]], test_size=0.1, shuffle=True, random_state=42)
    return X_train, X_test, y_train, y_test

In [30]:
def train(model, X_train, y_train):
    model.fit(X_train, y_train.squeeze())
    return model

In [31]:
def predict(model, X_test):
    y_pred = model.predict(X_test)
    return y_pred

In [34]:

def pipeline(train_clinical_data, train_proteins):
    LABELS = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
    model_params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
    res_dict = {}
    for label in LABELS:
        X_train, X_test, y_train, y_test = data_prep(train_clinical_data, train_proteins, label)
        model = xg.XGBRegressor(**model_params)
        model = train(model, X_train, y_train)
        y_pred = predict(model, X_test)
        res_dict[label] = {
            "model" : model,
            "X_train" : X_train,
            "X_test" : X_test, 
            "y_train" : y_train, 
            "y_test" : y_test,
            "y_pred" : y_pred
        }
    return res_dict

In [35]:
train_clinical_data, train_proteins = read_data()
results = pipeline(train_clinical_data, train_proteins)

Parameters: { "loss", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "loss", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


updrs_3 : 10
updrs_3 : 0
Parameters: { "loss", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you fi

In [39]:
mean_squared_error(results['updrs_1']['y_pred'], results['updrs_1']['y_test'])

14.835974199298553

## Testing


In [16]:
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')
sample_submission = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv')
test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')

In [17]:
test.head()

Unnamed: 0,visit_id,visit_month,patient_id,updrs_test,row_id,group_key
0,3342_0,0,3342,updrs_1,3342_0_updrs_1,0
1,3342_0,0,3342,updrs_2,3342_0_updrs_2,0
2,3342_0,0,3342,updrs_3,3342_0_updrs_3,0
3,3342_0,0,3342,updrs_4,3342_0_updrs_4,0
4,50423_0,0,50423,updrs_1,50423_0_updrs_1,0


In [18]:
sample_submission.head()

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,0,0
1,3342_0_updrs_1_plus_6_months,0,0
2,3342_0_updrs_1_plus_12_months,0,0
3,3342_0_updrs_1_plus_24_months,0,0
4,3342_0_updrs_2_plus_0_months,0,0


In [19]:
test_proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,group_key
0,50423_0,0,50423,O00391,33127.9,0
1,50423_0,0,50423,O00533,490742.0,0
2,50423_0,0,50423,O00584,43615.3,0
3,50423_0,0,50423,O14773,16486.6,0
4,50423_0,0,50423,O14791,2882.42,0


In [25]:
train_proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [None]:
def preds_for_inference()

In [10]:
import amp_pd_peptide
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


NameError: name 'sample_prediction' is not defined

In [16]:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    ressss = pipeline(test, test_proteins)

You must call `predict()` successfully before you can continue with `iter_test()`


TypeError: cannot unpack non-iterable NoneType object

In [22]:
tup= next(iter_test)

You must call `predict()` successfully before you can continue with `iter_test()`


In [24]:
print(tup)

None
