In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# # Code to generate test_protein_interactions.csv
# test_raw = pd.read_csv('./test.csv', header=None, 
#                         keep_default_na=True, na_values='?',
#                         na_filter=True)
# field_desc = pd.read_csv('./field_descriptions.txt', header=None, delimiter='\n')
# test_interactions = test_raw[list(range(459, 2945))]
# interaction_desc = []
# for i in field_desc.iloc[list(range(459, 2945))][0]:
#     interaction_desc.append(i[0 : 27].split(' ')[-1])
# test_protein_interactions = []
# for row in test_interactions.T:
#     for col in test_interactions:
#         p_type = test_interactions.loc[row, col]
#         if (col % 2 == 1) and p_type != 'No':
#             protein1 = test_raw.iloc[row, 0]
#             protein2 = interaction_desc[col - 460]
#             correlation = test_interactions.loc[row, col + 1]
#             record = [protein1, protein2, p_type, correlation]
#             test_protein_interactions.append(record)       
# test_protein_interactions_df = pd.DataFrame(test_protein_interactions, columns=['protein1', 'protein2', 'type', 'correlation'])
# test_protein_interactions_df.to_csv('./test_protein_interactions.csv', na_rep = '?', index=False)

In [3]:
def prep_data(path, mode = "train"):
    """Load a raw, header-less CSV and return encoded features plus labels.

    Parameters
    ----------
    path : str
        Location of the CSV file; '?' entries are read as missing values.
    mode : str, default "train"
        With "train" the last remaining column is named "label" and the one
        before it "localization". With any other value the last column is
        named "localization" and no labels are returned.

    Returns
    -------
    tuple
        ``(X, Y)``. ``X`` holds the feature columns followed by a trailing
        "protein" identifier column. ``Y`` is the "label" column in train
        mode and ``None`` otherwise.
    """
    raw = pd.read_csv(path, header=None, na_values='?',
                      keep_default_na=True, na_filter=True)

    # Positions 459..2944 (the 460th to 2945th columns) are the protein
    # interaction block, which is encoded in protein_interactions.csv instead.
    interaction_positions = list(range(459, 2945))
    without_interactions = raw.drop(raw.columns[interaction_positions], axis=1)
    # Renumber the surviving columns 0..n-1.
    # NOTE(review): the double transpose also appears to cast the columns to
    # object dtype, which the dtype check in the encoding step below depends
    # on -- confirm before replacing it with a plain column relabel.
    without_interactions = without_interactions.T.reset_index(drop=True).T

    n_columns = without_interactions.shape[1]
    is_train = (mode == 'train')
    column_names = {0 : "protein", 1 : "essential", 444 : "chromosome"}
    if is_train:
        column_names[n_columns - 1] = "label"
        column_names[n_columns - 2] = "localization"
    else:
        column_names[n_columns - 1] = "localization"
    cleaned_data = without_interactions.rename(column_names, axis = 1)

    # Columns 445..458 are the continuous variables: force them to float so
    # the encoding step below leaves their values untouched.
    for column in range(445, 459):
        cleaned_data[column] = pd.Series(cleaned_data[column], dtype = float)

    if is_train:
        X = cleaned_data.drop(["label", "protein"], axis=1)
        Y = cleaned_data["label"]
    else:
        X = cleaned_data.drop(["protein"], axis=1)
        Y = None

    def _encode_observed(column):
        # Label-encode the non-missing entries of non-float columns; float
        # columns pass through. Missing rows are absent from the returned
        # Series, so they come back as NaN once `apply` realigns the result.
        observed = column[column.notnull()]
        if column.dtype != float:
            values = LabelEncoder().fit_transform(observed)
        else:
            values = column
        return pd.Series(values, index=observed.index)

    X = X.apply(_encode_observed)
    X["protein"] = cleaned_data["protein"]
    return (X, Y)

In [4]:
# Run both raw files through the shared preprocessing routine.
train_X, train_Y = prep_data(path="./train.csv", mode="train")
# In test mode the second tuple element is None, so only the features are kept.
test_X = prep_data(path="./test.csv", mode="test")[0]

In [5]:
# Every positional index of train_X except the last column and the
# continuous block at positions 444..457 is treated as categorical.
categorical_indices = [
    position
    for position in range(train_X.shape[1] - 1)
    if not 444 <= position <= 457
]

In [6]:
# Fill missing values
import sys
import sklearn.neighbors._base
# Alias the old module path `sklearn.neighbors.base` to its current private
# location before importing missingpy (presumably missingpy still imports the
# old path -- verify against the installed versions).
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from missingpy import MissForest
imputer = MissForest()
train_features = train_X.drop(["protein"], axis=1)
X_imputed = pd.DataFrame(imputer.fit_transform(train_features, cat_vars=categorical_indices))
# X_imputed has positional column labels (0..n-1), so each named column is
# looked up by its position in the frame that was actually imputed.
# "chromosome" used to be read from the hard-coded position 444, but that
# position is excluded from categorical_indices as a continuous column; once
# "protein" is dropped, "chromosome" sits one position earlier.
essential_position = train_features.columns.get_loc("essential")
chromosome_position = train_features.columns.get_loc("chromosome")
train_X["essential"] = X_imputed[essential_position].astype(int)
train_X["chromosome"] = X_imputed[chromosome_position].astype(int)

Iteration: 0
Iteration: 1
Iteration: 2


In [7]:
# Study potential relationship in protein_interactions
def _parse_correlation(raw_value):
    """Convert a raw correlation field to float, dropping one trailing '.'."""
    if str(raw_value)[-1] == '.':
        return float(raw_value[:-1])
    return float(raw_value)


# The file has no header row; '?.' entries are read as missing, and the
# correlation column is kept as text so its trailing dot can be trimmed.
protein_interactions = pd.read_csv(
    './protein_interactions.csv',
    header=None,
    na_values = '?.',
    names = ('protein1', 'protein2', 'type', 'correlation'),
    dtype = {'correlation' : str},
)
# Trim the dot at the end of correlation and convert it to float
protein_interactions['correlation'] = protein_interactions['correlation'].apply(_parse_correlation)

test_protein_interactions = pd.read_csv('./test_protein_interactions.csv', na_values = '?')

In [8]:
# protein1, protein2 interaction is not commutative
# display(protein_interactions.loc[protein_interactions.protein1 == "P239467"])
# display(protein_interactions.loc[protein_interactions.protein2 == "P239467"])

In [9]:
# Next: incorporate the protein_interaction relation back to the dataframe
# Attach the labels to the training features (modifies train_X in place) so
# the "label" column travels with each row through the interaction join.
train_X['label'] = train_Y
def join_protein_interaction(df, protein_interaction, partner_source=None):
    """Attach interaction records and partner-protein attributes to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "protein" column; every row is kept (left join).
    protein_interaction : pandas.DataFrame
        Interaction records with "protein1" and "protein2" columns. Rows are
        matched where ``protein1`` equals ``df["protein"]``.
    partner_source : pandas.DataFrame, optional
        Frame supplying the "protein", "essential", "chromosome" and
        "localization" values of the partner (``protein2``). When omitted,
        the notebook-level ``train_X`` is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined with its interaction records, plus the partner's
        attributes in columns carrying the "_interact" suffix. Proteins with
        no interaction, or whose partner is absent from ``partner_source``,
        get missing values in those columns.
    """
    if partner_source is None:
        # Backward-compatible default: look partners up in the global
        # training frame.
        partner_source = train_X
    with_interactions = pd.merge(df, protein_interaction,
                                 left_on='protein', right_on='protein1',
                                 how='left')
    partner_attributes = partner_source[['protein', 'essential', 'chromosome', 'localization']]
    return pd.merge(with_interactions, partner_attributes,
                    left_on='protein2', right_on='protein', how='left',
                    suffixes=('', '_interact'))

In [10]:
# join protein_interactions
train_X_inter = join_protein_interaction(
    df=train_X,
    protein_interaction=protein_interactions,
)
test_X_inter = join_protein_interaction(
    df=test_X,
    protein_interaction=test_protein_interactions,
)

In [11]:
# NOTE: protein2 may belong to either the train or the test data set.

In [181]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Drop identifier columns (and, for train, the target) to get pure feature frames.
X_train = train_X_inter.drop(["label", "protein", "protein1", "protein2", "protein_interact"], axis=1)
X_test = test_X_inter.drop(["protein", "protein1", "protein2", "protein_interact"], axis=1)
# Fit ONE encoder on the interaction types of both frames so a given type maps
# to the same integer code in train and test. Two independently fitted encoders
# assign codes by each frame's own sorted set of values, which can disagree
# whenever the two frames do not contain exactly the same types.
type_encoder = LabelEncoder().fit(pd.concat([X_train["type"], X_test["type"]]))
X_train["type"] = type_encoder.transform(X_train["type"]).astype(int)
X_test["type"] = type_encoder.transform(X_test["type"]).astype(int)
# Rows without an interaction record have no correlation; treat that as 0.
X_train["correlation"] = X_train["correlation"].fillna(0)
X_test["correlation"] = X_test["correlation"].fillna(0)
Y_train = LabelEncoder().fit_transform(train_X_inter["label"]).astype(int)
# Positions 444..457 and 474 are kept out of the categorical set.
# NOTE(review): 474 is presumably the position of a continuous column such as
# "correlation" -- TODO confirm against X_train.columns.
categorical_indices = [item for item in list(range(0, X_train.shape[1])) if item not in range(444, 458) and item != 474]


In [80]:
# Tag each row with the frame it came from, then stack the two frames; the
# tag is what allows the stacked frame to be separated again later.
for frame, frame_tag in ((X_train, 'train'), (X_test, 'test')):
    frame['data_t'] = frame_tag
train_test_X = pd.concat([X_train, X_test])

In [81]:
# Impute train and test rows together. The 'data_t' marker is not a feature,
# so it is left out of the matrix handed to the imputer.
combined_features = train_test_X.drop('data_t', axis=1)
train_test_X_imputed = pd.DataFrame(
    imputer.fit_transform(combined_features, cat_vars=categorical_indices),
    columns=combined_features.columns,
)


Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [82]:
# Re-attach the origin marker by position (the imputed frame has a fresh
# index), then separate the rows back into their train and test parts.
train_test_X_imputed['data_t'] = train_test_X['data_t'].values
row_origin = train_test_X_imputed['data_t']
train_imputed = train_test_X_imputed[row_origin == 'train'].drop('data_t', axis=1)
test_imputed = train_test_X_imputed[row_origin == 'test'].drop('data_t', axis=1)

In [83]:
# Feature selection using ExtraTreesClassifier

In [84]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# Fit a 50-tree extra-trees ensemble on the imputed training rows, then wrap
# the already-fitted estimator (prefit=True) in a feature selector.
clf = ExtraTreesClassifier(n_estimators=50).fit(train_imputed, Y_train)
model = SelectFromModel(clf, prefit=True)

In [177]:
# Positions of the columns the selector keeps, and their names.
feature_selected = np.where(model.get_support())
selected_columns = train_imputed.columns[feature_selected]
# Reduce both frames to the same selected columns.
train_selected = pd.DataFrame(model.transform(train_imputed), columns=selected_columns)
test_selected = test_imputed.loc[:, selected_columns]
train_selected.shape

(1312, 53)

In [182]:
from sklearn.model_selection import train_test_split
# Put the target and the protein identifier back next to the selected
# features so both travel with each row through the split.
train_selected['label'] = Y_train
train_selected['protein'] = train_X_inter['protein'].to_numpy()
# Hold out 20% of the rows for validation.
# NOTE(review): the split is unseeded, so results differ between runs; rows
# also appear to be per protein-interaction pair, so the same protein can
# presumably land in both parts -- verify before trusting validation scores.
train, valid = train_test_split(train_selected, test_size=0.2)

In [183]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# 'label' is the target and 'protein' an identifier; neither is a model input.
non_feature_columns = ['label', 'protein']
X_train, Y_train = train.drop(non_feature_columns, axis=1), train['label']
X_valid, Y_valid = valid.drop(non_feature_columns, axis=1), valid['label']
xg_reg = xgb.XGBClassifier(
    objective = 'multi:softprob',
    colsample_bytree = 0.3,
    learning_rate = 0.1,
    max_depth = 5,
    alpha = 10,
    n_estimators = 10,
)
xg_reg.fit(X_train, Y_train)
valid_preds = xg_reg.predict(X_valid)



In [185]:
# Validation accuracy: share of validation rows whose predicted class equals
# the true label. The denominator is the validation set itself; the earlier
# `len(preds)` referred to a name that no cell in this notebook defines.
(valid_preds == Y_valid).sum() / len(Y_valid)

0.8973384030418251

In [188]:
# Turn the predictions into a frame and attach each row's protein identifier
# by position.
valid_preds = pd.DataFrame(valid_preds, columns = ['label']).assign(
    protein=valid['protein'].values
)

In [132]:
# Predict classes for the test rows, using the same selected feature columns
# the classifier was trained on.
test_preds = xg_reg.predict(test_selected)