In [58]:
import numpy as np
import pandas as pd

In [59]:
# Load the three raw training tables (clinical scores, peptide abundances,
# protein expression) from CSV files in the working directory.
train_clinical_data, train_peptides, train_proteins = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_clinical_data.csv',
        'train_peptides.csv',
        'train_proteins.csv',
    )
)

In [60]:
# Mean-impute every numeric column of the clinical table.
# NOTE(review): the numeric selection is dtype-based, so it also covers the
# updrs_* score columns that the modelling cell later treats as targets;
# confirm that imputing labels is intended.
numeric_columns = train_clinical_data.select_dtypes(include=np.number).columns
numeric_part = train_clinical_data[numeric_columns]
column_means = numeric_part.mean()
# Passing a Series to fillna fills each column's NaNs with that column's own mean.
train_clinical_data[numeric_columns] = numeric_part.fillna(column_means)

In [61]:
# Mode-impute every non-numeric column of the clinical table.
non_numeric_columns = train_clinical_data.select_dtypes(exclude=np.number).columns
non_numeric_part = train_clinical_data[non_numeric_columns]
# mode() can return several rows when values tie; row 0 holds the first mode per column.
most_frequent = non_numeric_part.mode().iloc[0]
train_clinical_data[non_numeric_columns] = non_numeric_part.fillna(most_frequent)


In [62]:
train_clinical_data.shape

(2615, 8)

In [63]:
train_clinical_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,1.861763,On
1,55_3,55,3,10.0,7.0,25.0,1.861763,On
2,55_6,55,6,8.0,10.0,34.0,1.861763,On
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [64]:
train_peptides.shape


(981834, 6)

In [65]:
# Drop visit_id: the protein/peptide merge in this notebook joins on
# UniProt / visit_month / patient_id, not on visit_id.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
train_peptides = train_peptides.drop(columns='visit_id', errors='ignore')

In [66]:
train_peptides.head()

Unnamed: 0,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,0,55,O00533,GNPEPTFSWTK,102060.0
2,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,0,55,O00533,SMEQNGPGLEYR,30838.7


In [67]:
train_proteins.shape

(232741, 5)

In [68]:
# Drop visit_id: the protein/peptide merge in this notebook joins on
# UniProt / visit_month / patient_id, not on visit_id.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
train_proteins = train_proteins.drop(columns='visit_id', errors='ignore')

In [69]:
train_proteins.head()

Unnamed: 0,visit_month,patient_id,UniProt,NPX
0,0,55,O00391,11254.3
1,0,55,O00533,732430.0
2,0,55,O00584,39585.8
3,0,55,O14498,41526.9
4,0,55,O14773,31238.0


In [76]:

# Attach each peptide row to its protein's NPX value. The inner join keeps
# only (UniProt, visit_month, patient_id) keys present in both tables.
protein_peptide_keys = ['UniProt', 'visit_month', 'patient_id']
train = train_proteins.merge(train_peptides, how='inner', on=protein_peptide_keys)

In [77]:
train.head()

Unnamed: 0,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,0,55,O00533,732430.0,GNPEPTFSWTK,102060.0
2,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.7


In [78]:
train.shape

(981834, 6)

In [79]:
train_clinical_data.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,1.861763,On
1,55_3,55,3,10.0,7.0,25.0,1.861763,On
2,55_6,55,6,8.0,10.0,34.0,1.861763,On
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [80]:
# Drop visit_id: the merge with the protein/peptide table joins on
# visit_month / patient_id, which the clinical table carries as separate columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
train_clinical_data = train_clinical_data.drop(columns='visit_id', errors='ignore')

In [82]:
train_clinical_data.head()

Unnamed: 0,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55,0,10.0,6.0,15.0,1.861763,On
1,55,3,10.0,7.0,25.0,1.861763,On
2,55,6,8.0,10.0,34.0,1.861763,On
3,55,9,8.0,9.0,30.0,0.0,On
4,55,12,10.0,10.0,41.0,0.0,On


In [83]:
# Remove the medication-state column so that only the identifiers and the
# updrs_* scores are carried into the merged training table.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
train_clinical_data = train_clinical_data.drop(
    columns=['upd23b_clinical_state_on_medication'],
    errors='ignore',
)

In [84]:
train_clinical_data.head()

Unnamed: 0,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55,0,10.0,6.0,15.0,1.861763
1,55,3,10.0,7.0,25.0,1.861763
2,55,6,8.0,10.0,34.0,1.861763
3,55,9,8.0,9.0,30.0,0.0
4,55,12,10.0,10.0,41.0,0.0


In [85]:
train_clinical_data.shape

(2615, 6)

In [91]:
# Add the clinical updrs_* scores to every protein/peptide row of the same
# patient visit. The inner join drops visits that have no clinical record.
visit_keys = ['visit_month', 'patient_id']
train1 = train.merge(train_clinical_data, how='inner', on=visit_keys)

In [92]:
train1.shape

(941744, 10)

In [93]:
train1.head()

Unnamed: 0,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance,updrs_1,updrs_2,updrs_3,updrs_4
0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.3,10.0,6.0,15.0,1.861763
1,0,55,O00533,732430.0,GNPEPTFSWTK,102060.0,10.0,6.0,15.0,1.861763
2,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0,10.0,6.0,15.0,1.861763
3,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9,10.0,6.0,15.0,1.861763
4,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.7,10.0,6.0,15.0,1.861763


In [94]:
# Drop the string identifier columns so the merged training table is fully numeric.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
train1 = train1.drop(columns=['UniProt', 'Peptide'], errors='ignore')

In [95]:
# Persist the merged training table; index=False keeps the row index out of the file.
train1.to_csv(path_or_buf='merged_train.csv', index=False)

## Merging the test files

In [138]:
# Load the raw test tables (peptide abundances and protein expression).
test_peptides, test_proteins = (
    pd.read_csv(csv_name)
    for csv_name in ('test_peptides.csv', 'test_proteins.csv')
)

In [139]:
# Drop visit_id: the test merges join on UniProt / visit_month / patient_id.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
test_peptides = test_peptides.drop(columns='visit_id', errors='ignore')

In [140]:
test_peptides.head()

Unnamed: 0,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,group_key
0,0,50423,O00391,AHFSPSNIILDFPAAGSAAR,22226.3,0
1,0,50423,O00391,NEQEQPLGQWHLS,10901.6,0
2,0,50423,O00533,GNPEPTFSWTK,51499.4,0
3,0,50423,O00533,IEIPSSVQQVPTIIK,125492.0,0
4,0,50423,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,23174.2,0


In [141]:
test_peptides.shape

(2057, 6)

In [142]:
# Drop visit_id: the test merges join on UniProt / visit_month / patient_id.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
test_proteins = test_proteins.drop(columns='visit_id', errors='ignore')

In [143]:
test_proteins.head()

Unnamed: 0,visit_month,patient_id,UniProt,NPX,group_key
0,0,50423,O00391,33127.9,0
1,0,50423,O00533,490742.0,0
2,0,50423,O00584,43615.3,0
3,0,50423,O14773,16486.6,0
4,0,50423,O14791,2882.42,0


In [144]:
test_proteins.shape

(453, 5)

In [145]:
test_proteins['group_key'].nunique()

2

In [146]:
test_peptides['group_key'].unique()

array([0, 6], dtype=int64)

In [147]:
# Join test proteins to test peptides on the same keys used for training.
# group_key is deliberately not a join key here, so it appears twice in the
# result, as group_key_x and group_key_y.
test_join_keys = ['UniProt', 'visit_month', 'patient_id']
test = test_proteins.merge(test_peptides, how='inner', on=test_join_keys)

In [148]:
test.shape

(2057, 8)

In [149]:
# Same join as `test`, but with group_key as an extra join key, so the result
# has a single group_key column instead of the _x/_y pair.
test2_join_keys = ['UniProt', 'visit_month', 'patient_id', 'group_key']
test2 = test_proteins.merge(test_peptides, how='inner', on=test2_join_keys)

In [150]:
test2.shape

(2057, 7)

In [151]:
# Reduce the merged test table to the numeric feature columns by dropping the
# string identifiers and the duplicated group_key columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
test = test.drop(
    columns=['UniProt', 'group_key_x', 'group_key_y', 'Peptide'],
    errors='ignore',
)
test.head()

Unnamed: 0,visit_month,patient_id,NPX,PeptideAbundance
0,0,50423,33127.9,22226.3
1,0,50423,33127.9,10901.6
2,0,50423,490742.0,51499.4
3,0,50423,490742.0,125492.0
4,0,50423,490742.0,23174.2


In [152]:
test.shape

(2057, 4)

In [153]:
# Persist the merged test table; index=False keeps the row index out of the file.
test.to_csv(path_or_buf='merged_test.csv', index=False)

In [154]:
test.head()

Unnamed: 0,visit_month,patient_id,NPX,PeptideAbundance
0,0,50423,33127.9,22226.3
1,0,50423,33127.9,10901.6
2,0,50423,490742.0,51499.4
3,0,50423,490742.0,125492.0
4,0,50423,490742.0,23174.2


In [159]:
train1.head()

Unnamed: 0,visit_month,patient_id,NPX,PeptideAbundance,updrs_1,updrs_2,updrs_3,updrs_4
0,0,55,11254.3,11254.3,10.0,6.0,15.0,1.861763
1,0,55,732430.0,102060.0,10.0,6.0,15.0,1.861763
2,0,55,732430.0,174185.0,10.0,6.0,15.0,1.861763
3,0,55,732430.0,27278.9,10.0,6.0,15.0,1.861763
4,0,55,732430.0,30838.7,10.0,6.0,15.0,1.861763


In [160]:
test.shape

(2057, 4)

In [161]:
train1.shape

(941744, 8)

For modelling

In [None]:
# Train an LSTM regressor that predicts `updrs_1` from a sliding window of
# `sequence_length` consecutive rows of the merged training table.
#
# The cell reads `train1`, not `train`: `train` is the protein/peptide merge,
# which has no updrs_* columns and still contains the string columns
# UniProt / Peptide, so it can be neither split on the target nor scaled.

# The names below are used by this cell but imported nowhere else in the notebook.
# NOTE(review): assumes Keras is available via TensorFlow; switch to
# `keras.models` / `keras.layers` if the standalone package is installed instead.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential

# Define the target variable
target = 'updrs_1'

# Scale the input features using MinMaxScaler
# NOTE(review): the features still include updrs_2..updrs_4 and patient_id,
# while merged_test.csv only has visit_month, patient_id, NPX and
# PeptideAbundance — confirm the feature set before predicting on test.
# NOTE(review): the scaler is fitted on all rows before the split, so
# validation rows influence the scaling.
feature_frame = train1.drop(columns=target)
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(feature_frame)

# Create sequences for the LSTM model
sequence_length = 10
n_sequences = len(train_scaled) - sequence_length
if n_sequences <= 0:
    raise ValueError(
        f"Need more than {sequence_length} rows to build sequences, got {len(train_scaled)}"
    )

# Window k covers rows [k, k + sequence_length); its label is the target of
# the row immediately after the window.
# NOTE(review): windows are taken over raw row order, so they can span
# different visits and patients — confirm this is intended.
X = np.stack(
    [train_scaled[start:start + sequence_length] for start in range(n_sequences)]
)
# One array slice replaces a per-row .iloc lookup and yields the same labels.
y = train1[target].to_numpy()[sequence_length:]

# Split the dataset into training and testing sets
# NOTE(review): a shuffled split of overlapping windows puts near-duplicate
# windows on both sides; consider a chronological or per-patient split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the LSTM model: one 50-unit LSTM layer feeding a single linear output.
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mae')

# Train the LSTM model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1, shuffle=False)
