In [54]:
# import necessary libraries
import os
import pandas as pd
import numpy as np

# ignore DtypeWarning warnings
import warnings
warnings.filterwarnings("ignore")


In [50]:
def filter_dataframe(df, kinase_name):
    """Return the rows of ``df`` for one kinase whose two phosphosite labels agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'KINASE_psp', 'SUB_MOD_RSD_psp' and
        'Phosphosite_johnson'.
    kinase_name : object
        Value compared (with ``==``) against the 'KINASE_psp' column.

    Returns
    -------
    pandas.DataFrame
        Subset of ``df`` (original index labels kept) where 'KINASE_psp' equals
        ``kinase_name`` and 'SUB_MOD_RSD_psp' equals 'Phosphosite_johnson'.
    """
    kinase_rows = df.loc[df['KINASE_psp'] == kinase_name]

    # Compare the raw arrays position by position, so index labels play no role.
    psp_sites = kinase_rows['SUB_MOD_RSD_psp'].values
    johnson_sites = kinase_rows['Phosphosite_johnson'].values
    return kinase_rows[psp_sites == johnson_sites]


In [51]:
def clean_dataframe(df, data):
    """Keep rows whose two phosphosite labels agree and report sequence agreement.

    For every distinct value in 'KINASE_psp', the rows selected by
    ``filter_dataframe`` are collected and concatenated (rows stay grouped by
    kinase in order of first appearance, original index labels kept). The
    'SITE_+/-7_AA_psp' column of the result is upper-cased, then compared with
    'SITE_+/-7_AA_johnson', and a one-line summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'KINASE_psp', 'SUB_MOD_RSD_psp', 'Phosphosite_johnson',
        'SITE_+/-7_AA_psp' and 'SITE_+/-7_AA_johnson'.
    data : str
        Label used only in the printed summary (the caller passes the file name).

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with 'SITE_+/-7_AA_psp' upper-cased. An input with
        no rows yields an empty frame with the same columns.

    Raises
    ------
    AssertionError
        If any returned row has differing phosphosite labels.
    """
    # Collect the per-kinase pieces and concatenate once. Growing a frame with
    # pd.concat inside the loop is quadratic, and seeding it with an empty
    # DataFrame() relies on deprecated concat behaviour.
    per_kinase = [filter_dataframe(df, kinase) for kinase in df['KINASE_psp'].unique()]
    if per_kinase:
        df_clean = pd.concat(per_kinase)
    else:
        # No kinase values at all (empty input): keep the schema so the column
        # lookups below still work.
        df_clean = df.iloc[0:0].copy()

    # Sanity check: every kept row must have identical phosphosite labels.
    sites_match = df_clean['SUB_MOD_RSD_psp'].values == df_clean['Phosphosite_johnson'].values
    assert sites_match.all(), "phosphosite labels differ after filtering"

    # Upper-case the psp flanking sequence before comparing it with the
    # johnson one, so the comparison ignores letter case on the psp side.
    df_clean['SITE_+/-7_AA_psp'] = df_clean['SITE_+/-7_AA_psp'].str.upper()
    seq_match = df_clean['SITE_+/-7_AA_psp'].values == df_clean['SITE_+/-7_AA_johnson'].values
    n_same = int(seq_match.sum())

    if n_same != len(df_clean):
        print(f" For {data} dataset , out of {len(df_clean)} rows, {n_same} are the same")
    else:
        print(f"For {data} dataset, all SITE_+/-7_AA_psp and SITE_+/-7_AA_johnson are the same")

    return df_clean


In [52]:
# File-name suffixes of the ranked datasets to process.
data_list = ['ranked_' + str(rank) + '.csv' for rank in (1, 2, 3)]

In [55]:
# For each entry of data_list: read the merged table, clean it, and write the
# result to '../data/cleaned_<entry>'. Paths are relative to the kernel's
# working directory.
for data in data_list:
    # Input file is '../data/merged_kinase_<entry>'.
    df = pd.read_csv('../data/merged_kinase_' + data)
    # clean_dataframe also prints a one-line agreement summary for this file.
    df_clean = clean_dataframe(df, data)
    # index=False: do not write the DataFrame index as an extra CSV column.
    df_clean.to_csv('../data/cleaned_' + data, index=False)

For ranked_1.csv dataset, all SITE_+/-7_AA_psp and SITE_+/-7_AA_johnson are the same
 For ranked_2.csv dataset , out of 322 rows, 321 are the same
 For ranked_3.csv dataset , out of 271 rows, 270 are the same
