# Process Exported Collection Data from Proteinbase

Source: https://proteinbase.com/api/proteins/download?collectionId=019adf02-13db-93aa-9f8f-2321ce34c976&slug=nipah-binder-competition-all-submissions

In [None]:
import pandas as pd
import json

In [None]:
## Read the raw Proteinbase collection export (one row per submission) into a dataframe
collection_df = pd.read_csv(
    filepath_or_buffer='../data/proteinbase_collection_nipah-binder-competition-all-submissions.csv',
)

collection_df.head()

In [None]:
## Define function to parse JSON evaluations and add ID
def parse_json_evaluations(row):
    """Expand one row's JSON ``evaluations`` payload into a long-format frame.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['evaluations']`` (a JSON document; expected to be
        a list of evaluation objects) and ``row['id']``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluation record. Only the top level of each record is
        turned into columns (``max_level=0``), so nested objects are kept
        as-is in their cell. An ``id`` column holding ``row['id']`` is
        appended. A row with no evaluations payload (missing value or blank
        string) yields an empty frame whose only column is ``id``, which is
        the same result as an empty JSON list.

    Raises
    ------
    json.JSONDecodeError
        If the payload is present but is not valid JSON.
    """
    raw = row['evaluations']
    # A missing payload is not a string (a blank CSV cell is read back as
    # float NaN) and json.loads would raise TypeError on it; a blank string
    # would raise JSONDecodeError. Treat both as "no evaluations".
    has_payload = isinstance(raw, (str, bytes, bytearray)) and bool(raw.strip())
    records = json.loads(raw) if has_payload else []

    evals_df = pd.json_normalize(records, max_level=0)
    # Tag every record with the id of the row it came from so the long table
    # can be related back to that row.
    evals_df['id'] = row['id']
    return evals_df

In [None]:
## Parse every row's evaluations JSON into its own long-format frame, then
## stack the per-row frames into one table with a fresh 0..n-1 index
evals_expanded = pd.concat(
    list(collection_df.apply(parse_json_evaluations, axis=1)),
    ignore_index=True,
)

evals_expanded.head()

In [None]:
## Reshape long -> wide: one row per id, one column per metric.
## If an id has several values for the same metric, aggfunc='first' keeps the first one.
pivoted_evals = pd.pivot_table(
    evals_expanded,
    values='value',
    index='id',
    columns='metric',
    aggfunc='first',
)
## Turn the id index back into an ordinary column so it can be used as a join key
pivoted_evals = pivoted_evals.reset_index()

pivoted_evals.head()

In [None]:
## Attach the wide metric columns to the original rows (left join keeps every
## original row, even those without any pivoted metrics), then discard the
## raw JSON column now that it has been expanded
final_df = (
    collection_df
    .merge(pivoted_evals, how='left', on='id')
    .drop(columns='evaluations')
)

final_df.head()

In [None]:
## Write the processed table to disk as CSV, leaving out the dataframe index
final_df.to_csv(
    path_or_buf='../data/proteinbase_collection_nipah-binder-competition-all-submissions_processed.csv',
    index=False,
)