In [1]:
import pandas as pd

In [2]:
# Load the training table. The file is read without a header row, and '?'
# cells (on top of pandas' default NA markers) are parsed as NaN.
raw_train = pd.read_csv(
    "./train.csv",
    header=None,
    na_values="?",
    keep_default_na=True,
    na_filter=True,
)

In [3]:
# Record the raw frame's dimensions (noted as 862 rows x 2961 columns).
raw_rows = raw_train.shape[0]
raw_columns = raw_train.shape[1]

In [4]:
# The 460th through 2954th columns (0-based positions 459..2953) are encoded
# in protein_interactions.csv, so they are removed from the training frame.
protein_interactions_cols = list(range(459, 2954))
# reduced down to 466 columns
reduced_train = raw_train.drop(columns=raw_train.columns[protein_interactions_cols])
# Renumber the surviving columns 0..n-1 so the positional renames that follow
# line up. Assigning a fresh RangeIndex does this directly; a
# `.T.reset_index(drop=True).T` round trip would copy the frame twice and, on
# a mixed-dtype frame, leave every column with dtype `object`.
reduced_train.columns = pd.RangeIndex(reduced_train.shape[1])

# `cleaned_train` starts out as the same object as `reduced_train`.
cleaned_train = reduced_train
cleaned_rows, cleaned_columns = cleaned_train.shape

In [5]:
# Give readable names to the columns that are later referred to by name.
# The last two columns are addressed relative to the frame width.
cleaned_train = cleaned_train.rename(
    columns={
        0: "protein",
        1: "essential",
        444: "chromosome",
        cleaned_columns - 2: "localization",
        cleaned_columns - 1: "label",
    }
)

In [66]:
# Study potential relationships in protein_interactions.
# The file is read without a header row; `correlation` is kept as text for
# now, and '?.' cells are parsed as NaN.
protein_interactions = pd.read_csv(
    "./protein_interactions.csv",
    header=None,
    names=["protein1", "protein2", "type", "correlation"],
    na_values="?.",
    dtype={"correlation": str},
)

In [67]:
# Show the correlation column as loaded.
protein_interactions.loc[:, 'correlation']

0       0.252653076.
1       0.709247942.
2      -0.001238791.
3       0.482255315.
4      -0.460856268.
           ...      
905     0.519144658.
906      0.21693664.
907    -0.128716897.
908     0.743719262.
909     -0.21443642.
Name: correlation, Length: 910, dtype: object

In [79]:
# Drop the trailing '.' from a correlation entry (when present), then parse
# the remainder as a float.
protein_interactions['correlation'] = protein_interactions['correlation'].apply(
    lambda raw: float(raw[:-1] if str(raw)[-1] == '.' else raw)
)

In [80]:
# The (protein1, protein2) interaction is not commutative: show the rows
# where the same protein appears on each side of the relation.
for side in ("protein1", "protein2"):
    display(protein_interactions[protein_interactions[side] == "P239467"])

Unnamed: 0,protein1,protein2,type,correlation
110,P239467,P238704,Physical,0.723051
739,P239467,P235639,Physical,-0.483302
780,P239467,P235701,Genetic,-0.430352


Unnamed: 0,protein1,protein2,type,correlation
0,P238510,P239467,Genetic,0.252653
1,P235550,P239467,Physical,0.709248
2,P235621,P239467,Physical,-0.001239
3,P235265,P239467,Physical,0.482255


In [81]:
# Next: incorporate the protein_interaction relation back to the dataframe

In [82]:
# Left-join the interaction table onto the training rows, matching each
# training `protein` against the `protein1` side of an interaction.
pro1_train = cleaned_train.merge(
    protein_interactions,
    how="left",
    left_on="protein",
    right_on="protein1",
)

In [83]:
# Narrow view of the training frame: only the named columns, used to look up
# the attributes of an interaction partner.
pro2_cleaned_train = cleaned_train.loc[
    :, ["protein", "essential", "chromosome", "localization", "label"]
]

In [84]:
# Attach the partner's attributes by matching `protein2` against `protein`
# in the narrow frame; clashing right-hand columns get the `_interact` suffix.
protein_train = pro1_train.merge(
    pro2_cleaned_train,
    how="left",
    left_on="protein2",
    right_on="protein",
    suffixes=("", "_interact"),
)

In [87]:
# change correlation to categorical variable
def correlation_classifer(cor, strong_threshold=0.5):
    """Bucket a correlation value into a categorical label.

    Parameters
    ----------
    cor : float, str, or missing value
        The correlation to classify. Anything `float()` accepts is allowed;
        missing values (None, NaN, pd.NA, or text that parses to NaN) are
        labelled "Missing".
    strong_threshold : float, default 0.5
        Magnitude at or above which a correlation counts as "Strong".

    Returns
    -------
    str
        One of "Missing", "Strong Positive", "Weak Positive",
        "Strong Negative", "Weak Negative". Zero is treated as positive.

    Raises
    ------
    ValueError
        If `cor` is a non-missing value that cannot be parsed as a float.
    """
    # Check for missing values before converting: float(None) and
    # float(pd.NA) raise TypeError instead of yielding NaN.
    if pd.isna(cor):
        return "Missing"
    value = float(cor)
    # Text such as "nan" only becomes NaN after conversion.
    if pd.isna(value):
        return "Missing"
    if value >= 0:
        return "Strong Positive" if value >= strong_threshold else "Weak Positive"
    return "Strong Negative" if value <= -strong_threshold else "Weak Negative"
# Replace the correlation column with its categorical label, row by row.
protein_train["correlation"] = protein_train["correlation"].map(correlation_classifer)

In [99]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

# Drop the protein-name columns before scoring.
XY_train = protein_train.drop(columns=["protein", "protein1", "protein2", "protein_interact"])
# Integer-encode every remaining column independently (fit_transform refits
# the encoder on each column it is applied to).
# NOTE(review): scikit-learn documents LabelEncoder for target labels and
# OrdinalEncoder for feature columns -- confirm this use is intended.
XY_train = XY_train.apply(LabelEncoder().fit_transform)
Y_train = XY_train["label"]
X_train = XY_train.drop(columns=["label"])
# k='all' keeps every feature, so fitting only produces a chi2 score per column.
fs = SelectKBest(score_func=chi2, k="all")
X_selected = fs.fit_transform(X_train, Y_train)
print(X_selected.shape)

(1312, 470)


In [100]:
# Print the chi2 score of every feature, in column order.
for position, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (position, score))

Feature 0: 12.540962
Feature 1: 53.053097
Feature 2: nan
Feature 3: 68.052632
Feature 4: 39.511286
Feature 5: 39.111111
Feature 6: nan
Feature 7: 68.981064
Feature 8: 7.111111
Feature 9: 105.512645
Feature 10: 18.896000
Feature 11: 13.627986
Feature 12: nan
Feature 13: 45.924766
Feature 14: 61.926690
Feature 15: 1.238908
Feature 16: 18.896000
Feature 17: nan
Feature 18: 34.580312
Feature 19: 14.189606
Feature 20: 15.630146
Feature 21: 30.617678
Feature 22: 77.393049
Feature 23: 42.442478
Feature 24: 10.570682
Feature 25: 16.972603
Feature 26: 16.972603
Feature 27: 21.333333
Feature 28: 8.672355
Feature 29: 7.111111
Feature 30: 3.555556
Feature 31: 96.819109
Feature 32: 14.222222
Feature 33: 17.777778
Feature 34: 17.615300
Feature 35: nan
Feature 36: 17.777778
Feature 37: 24.136317
Feature 38: 519.029592
Feature 39: 86.466667
Feature 40: 16.972603
Feature 41: 35.555556
Feature 42: 3.555556
Feature 43: 42.666667
Feature 44: 47.870381
Feature 45: nan
Feature 46: 280.217391
Feature 47: nan