In [6]:
import pandas as pd
import numpy as np

In [144]:
def prep_data(path, mode = "train"):
    """Load a raw protein CSV and split it into a feature frame and labels.

    The file is read without a header row and with '?' treated as missing.
    The block of columns at positions 459..2944 is removed (per the original
    note, that information is encoded in protein_interactions.csv) and the
    remaining columns are renumbered 0..n-1 before being named.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    mode : str, default "train"
        "train": the last remaining column is named "label" and the one
        before it "localization". Any other value: the last remaining column
        is named "localization" and no label is extracted.

    Returns
    -------
    (X, Y) : tuple
        X is a DataFrame holding "essential", the positional feature columns
        2..443, "chromosome", the float columns 445..457 and, as the last
        column, "protein". "localization", "label" and every positional
        column from 458 upward are not included.
        Y is the "label" column in train mode, otherwise None.

    Raises
    ------
    IndexError
        If the file has fewer than 2945 columns.
    KeyError
        If one of the expected positional columns is missing.
    """
    raw_data = pd.read_csv(path, header=None,
                           keep_default_na=True, na_values='?',
                           na_filter=True, dtype={444: str})
    # 460th to 2945th columns (1-based) are encoded in protein_interactions.csv
    protein_interactions_cols = list(range(459, 2945))
    reduced_data = raw_data.drop(raw_data.columns[protein_interactions_cols], axis=1)

    # Renumber the surviving columns 0..n-1. Relabelling in place of a double
    # transpose keeps each column's inferred dtype (a transpose round-trip of
    # a mixed-type frame turns every column into `object`) and avoids two
    # full copies of the data.
    n_columns = reduced_data.shape[1]
    reduced_data.columns = pd.RangeIndex(n_columns)

    is_train = (mode == 'train')
    rename_dict = {0: "protein", 1: "essential", 444: "chromosome"}
    if is_train:
        rename_dict[n_columns - 1] = "label"
        rename_dict[n_columns - 2] = "localization"
    else:
        rename_dict[n_columns - 1] = "localization"
    cleaned_data = reduced_data.rename(rename_dict, axis=1)

    # Columns 445..458 are cast to float (the original note describes this
    # range as the continuous variables).
    # NOTE(review): column 458 is cast here but dropped again below - confirm
    # whether it was meant to be kept as a feature.
    continuous_cols = range(445, 459)
    cleaned_data = cleaned_data.astype({col: float for col in continuous_cols})

    if is_train:
        X = cleaned_data.drop(["label", "protein"], axis=1)
        Y = cleaned_data["label"]
    else:
        X = cleaned_data.drop(["protein"], axis=1)
        Y = None

    # Re-append "protein" so that it ends up as the last column of X.
    X["protein"] = cleaned_data["protein"]
    # Drop every positional column from 458 up to (not including) the trailing
    # "localization"/"protein" pair, then "localization" itself.
    X = X.drop(list(range(458, X.shape[1] - 1)), axis=1)
    X = X.drop('localization', axis=1)
    return (X, Y)

In [145]:
# Training file: keep both the feature frame and the label series.
train_X, train_Y = prep_data(path="./train.csv", mode="train")
# Test file: prep_data returns (features, None) here, so keep only the features.
test_X = prep_data(path="./test.csv", mode="test")[0]

In [146]:
# Replace missing 'essential' / 'chromosome' entries with fixed defaults in
# both frames.
# NOTE(review): 'Non-Essential' and '4' are presumably the most frequent
# values in the training data - confirm.
train_X, test_X = (
    frame.fillna({'essential': 'Non-Essential', 'chromosome': '4'})
    for frame in (train_X, test_X)
)

In [147]:
# Study potential relationship in protein_interactions.
# The file has no header row; '?.' marks a missing value, and 'correlation' is
# kept as text so its trailing dot can be removed before conversion.
protein_interactions = pd.read_csv(
    './protein_interactions.csv',
    header=None,
    names=('protein1', 'protein2', 'type', 'correlation'),
    na_values='?.',
    dtype={'correlation': str},
)
# Strip one trailing '.' (when present) and convert to float; missing values
# pass through float() unchanged as NaN.
protein_interactions['correlation'] = protein_interactions['correlation'].map(
    lambda raw: float(raw[:-1]) if str(raw)[-1] == '.' else float(raw)
)

# Read with pandas' default header handling, i.e. the first row of the file
# supplies the column names.
# NOTE(review): the later join expects 'protein1'/'protein2' columns here -
# confirm the file carries that header row.
test_protein_interactions = pd.read_csv('./test_protein_interactions.csv', na_values='?')

In [148]:
# Next: incorporate the protein_interaction relation back to the dataframe.
# The training labels are first attached to train_X as a 'label' column (this
# mutates train_X in place), so the label travels with train_X through the
# merge that builds train_X_final.
train_X['label'] = train_Y
def join_protein_interaction(df, protein_interaction, partner_df=None):
    """Attach interaction rows and the partner protein's attributes to ``df``.

    Each row of ``df`` is left-joined to the interactions in which it appears
    as ``protein1``; the resulting ``protein2`` is then left-joined to a
    lookup table to pull in the partner's 'essential' and 'chromosome'
    values. A protein with several interactions yields several rows; a
    protein with none yields one row with missing interaction columns.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'protein' column.
    protein_interaction : DataFrame
        Must contain 'protein1' and 'protein2' columns.
    partner_df : DataFrame, optional
        Lookup table with 'protein', 'essential' and 'chromosome' columns used
        to describe the partner protein. When omitted, the notebook-level
        ``train_X`` is used, which is what this function always did before
        the parameter existed.

    Returns
    -------
    DataFrame
        The joined frame. Partner columns that clash with existing names get
        the suffix '_interact' ('protein_interact', 'essential_interact',
        'chromosome_interact').
    """
    if partner_df is None:
        # Backward-compatible default: fall back to the notebook global.
        partner_df = train_X
    with_interactions = pd.merge(df, protein_interaction,
                                 left_on='protein', right_on='protein1',
                                 how='left')
    partner_attributes = partner_df[['protein', 'essential', 'chromosome']]
    return pd.merge(with_interactions, partner_attributes,
                    left_on='protein2', right_on='protein', how='left',
                    suffixes=('', '_interact'))

In [149]:
# Join each feature frame with its interaction table, then discard the join
# helper columns that are no longer needed.
train_X_inter = (
    join_protein_interaction(train_X, protein_interactions)
    .drop(columns=['protein1', 'protein2', 'protein_interact'])
)
test_X_inter = (
    join_protein_interaction(test_X, test_protein_interactions)
    .drop(columns=['protein1', 'protein2', 'protein_interact'])
)

In [150]:
# Keep only the protein key plus the interaction-derived columns, and give
# proteins without a matching interaction/partner explicit placeholder values.
train_X_inter, test_X_inter = (
    joined[['protein', 'type', 'correlation', 'essential_interact', 'chromosome_interact']].fillna(
        {'type' : 'No', 'correlation' : 0, 'essential_interact' : 'No', 'chromosome_interact' : 'No'}
    )
    for joined in (train_X_inter, test_X_inter)
)

In [151]:
def agg_mode(x):
    """Return the first value reported by ``Series.mode`` for the group ``x``."""
    return pd.Series.mode(x)[0]


def _aggregate_interactions(inter):
    """Collapse the per-interaction rows of ``inter`` to one row per protein.

    'type', 'essential_interact' and 'chromosome_interact' are reduced with
    ``agg_mode``; 'correlation' is averaged. The result is indexed by
    'protein'.
    """
    return inter.groupby('protein').agg(
        type=('type', agg_mode),
        # The string "mean" selects the same groupby mean that np.mean was
        # mapped to, without the FutureWarning newer pandas emits for numpy
        # callables passed to agg.
        correlation=('correlation', 'mean'),
        essential_interact=('essential_interact', agg_mode),
        chromosome_interact=('chromosome_interact', agg_mode),
    )


train_X_agg = _aggregate_interactions(train_X_inter)
test_X_agg = _aggregate_interactions(test_X_inter)

In [152]:
# Attach the per-protein interaction aggregates to the feature frames.
# The aggregated frames come from groupby('protein'), so 'protein' is matched
# against their index level; the join type is pandas' default (inner).
train_X_final = train_X.merge(train_X_agg, left_on='protein', right_on='protein')
test_X_final = test_X.merge(test_X_agg, left_on='protein', right_on='protein')

In [155]:
# Persist the cleaned frames next to the notebook, without the row index.
train_X_final.to_csv(path_or_buf='./train_cleaned.csv', index=False)
test_X_final.to_csv(path_or_buf='./test_cleaned.csv', index=False)