# Import libraries 

In [None]:
# import necessary libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load data

In [None]:
# The input file is parsed with ';' as the field separator.
df = pd.read_csv(
    '../data/41586_2022_5575_MOESM5_ESM.csv',
    sep=';',
)


In [None]:
# Bare expression: in a notebook cell this renders the loaded table for inspection.
df

# Data preprocessing

In [None]:
# Non-rank columns that are kept alongside the rank columns.
columns_needed = [
    'Uniprot Primary Accession',
    'Protein',
    'Phosphosite',
    'SITE_+/-7_AA',
]

Select the rank columns (every column whose name contains "rank")

In [None]:
# Collect, in their original order, the column names containing the substring 'rank'.
rank_columns = [column for column in df.columns if 'rank' in column]
rank_columns

Cleaned-up data: keep only the needed identifier columns plus the rank columns

In [None]:
# Restrict the table to the needed columns followed by the rank columns,
# then preview the first rows.
data = df[[*columns_needed, *rank_columns]]
data.head()

In [None]:
# Bare expression: in a notebook cell this renders the reduced table for inspection.
data

# Creating Ranked Kinase DataFrame 

In [None]:
def create_kinase_ranked(number: int, data: pd.DataFrame = data,
                         columns_needed: list = columns_needed,
                         rank_columns: list = rank_columns) -> pd.DataFrame:
    """
    Collect, for every rank column, the rows whose value equals ``number``.

    For each column name in ``rank_columns`` the rows of ``data`` where that
    column equals ``number`` are selected, reduced to ``columns_needed`` and
    tagged with an extra column ``ranked_<number>`` holding the name of the
    rank column that matched. The per-column pieces are stacked vertically.

    Parameters:
    number (int): The rank value to look for in each rank column.
    data (DataFrame): Table containing both ``columns_needed`` and
        ``rank_columns``. It is not modified.
    columns_needed (list): Column names to carry over into the result.
    rank_columns (list): Column names in ``data`` that hold rank values.

    Note: the defaults for ``data``, ``columns_needed`` and ``rank_columns``
    are the module-level objects as they existed when this function was
    defined; re-run the definition (or pass them explicitly) after changing
    those variables.

    Returns:
    DataFrame: One row per (row of ``data``, rank column) match, with columns
    ``columns_needed + ['ranked_<number>']``. The index restarts at 0 for each
    rank column's piece, so index labels may repeat. An empty DataFrame is
    returned when ``rank_columns`` is empty.
    """
    label = f'ranked_{number}'

    # Build every piece first and concatenate once: concatenating inside the
    # loop re-copies the accumulated rows on each iteration.
    pieces = [
        # .loc + .assign yields a fresh frame, so the new column is never
        # written onto a filtered slice of ``data`` (SettingWithCopyWarning).
        data.loc[data[rank] == number, columns_needed]
            .assign(**{label: rank})
            .reset_index(drop=True)
        for rank in rank_columns
    ]

    # pd.concat raises on an empty list; keep the original "empty frame" result.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, axis=0)




## Generating and Saving Ranked Kinase Data to CSV Files

In [None]:
# Write one CSV per rank value (1, 2 and 3) into the data folder,
# omitting the DataFrame index from the output.
for rank in range(1, 4):
    kinase_ranked = create_kinase_ranked(rank)
    kinase_ranked.to_csv(f'../data/kinase_ranked_{rank}.csv', index=False)