# Statistics for Missing Data

## Train Data (no record has NA in both columns):

### Essential Column - 44 NA
### Chromosome - 1 NA

## Total Incomplete records -> 45 -> 45/863 -> 5.21%



## Test Data (2 records with NA in both Columns):

### Essential Column - 17 NA
### Chromosome - 59 NA

## Total Incomplete records -> 74 -> 74/382 -> 19.37%

# Methods for dealing with NAs

## MissForest: uses a random forest (out-of-bag) to predict the missing (NA) values, going column by column

# NOTE: NEED TO DISCUSS THAT THE TRAIN AND TEST DATA SHOULD BE HANDLED TOGETHER WHEN REPLACING NAs, SINCE THE SAME METHOD SHOULD BE USED FOR BOTH

In [118]:
# Install a pip package in the current Jupyter kernel
#import sys
#!{sys.executable} -m pip install missingpy

import os

import re

import pandas as pd

import numpy as np

from IPython.display import display

from missingpy import MissForest

from sklearn.model_selection import train_test_split


import matplotlib.pyplot as plt

# Run the rest of the notebook from the project data directory.
os.chdir(r'D:\Documents\WATERLOO\3B\STAT 441\protein-localization')

# Column labels come from the field-description file: tab-separated, no
# header row, with the raw label text taken from the first column.
description = pd.read_csv(
    r'D:\Documents\WATERLOO\3B\STAT 441\protein-localization\field_descriptions.txt',
    sep='\t',
    lineterminator='\n',
    header=None,
)
col_labels = description[0]

def clean_labels(label):
    """Normalise a raw column label.

    Runs of spaces are collapsed to a single space, every colon is removed,
    and leading/trailing whitespace is stripped, in that order.

    Parameters
    ----------
    label : str
        Raw label text.

    Returns
    -------
    str
        The cleaned label.
    """
    collapsed = re.sub(' +', ' ', label)
    without_colons = collapsed.replace(':', '')
    return without_colons.strip()

# Normalise every raw label before using it as a column name.
col_labels = col_labels.apply(clean_labels)


# Import Data
# Both CSVs are header-less, take the cleaned labels as column names, and
# encode missing values as '?'.
csv_options = dict(header=None, names=col_labels, na_values='?')

train = pd.read_csv(
    r'D:\Documents\WATERLOO\3B\STAT 441\protein-localization\train.csv',
    **csv_options,
)

test = pd.read_csv(
    r'D:\Documents\WATERLOO\3B\STAT 441\protein-localization\test.csv',
    **csv_options,
)


# Positions of the first and last protein-interaction columns.
begin_interact = train.columns.get_loc("INTERACTING PROTEIN P239476 Type")
end_interact = train.columns.get_loc("INTERACTING PROTEIN P235069 Corr")


def _drop_column_span(frame, start, stop):
    """Return a copy of ``frame`` without the columns at positions ``start:stop``.

    Columns are picked by position (``stop`` exclusive) and then dropped by
    label through ``columns=``, rather than handing a whole DataFrame to
    ``drop`` and relying on it being iterated as a list of column names.
    The input frame is not modified.
    """
    return frame.drop(columns=frame.columns[start:stop])


# Generate separate data sets for different situations

# 1: without the interaction columns (both end points inclusive).
TEST_Data_1 = _drop_column_span(test, begin_interact, end_interact + 1)

TRAIN_Data_1 = _drop_column_span(train, begin_interact, end_interact + 1)


# 2: as 1, and without the trailing 16 columns.
# NOTE(review): the original comment called these the "last 15 solution
# columns", but the slice removes 16 columns - confirm which is intended.
TEST_Data_2 = TEST_Data_1.iloc[:, :-16]

TRAIN_Data_2 = TRAIN_Data_1.iloc[:, :-16]


# 3: as 2, and without the 442 columns at positions 2..443 (described as
# binary columns in the original comment).
TEST_Data_3 = _drop_column_span(TEST_Data_2, 2, 444)

TRAIN_Data_3 = _drop_column_span(TRAIN_Data_2, 2, 444)


In [2]:
# Integer codes for the ESSENTIAL column, keyed by column name so the
# mapping can be passed directly to DataFrame.replace.
essential_encode = {
    "ESSENTIAL": {
        "Essential": 1,
        "Non-Essential": 2,
        "Ambiguous-Essential": 3,
    }
}

# Inverse mapping (code -> label), derived from the forward mapping.
reverse_essential_encode = {
    column: {code: label for label, code in codes.items()}
    for column, codes in essential_encode.items()
}


# Feature frame for imputation: drop the identifier column, integer-encode
# ESSENTIAL, then mark both categorical columns with the category dtype.
X = TRAIN_Data_3.drop('Protein', axis=1).replace(essential_encode)
X["ESSENTIAL"] = X["ESSENTIAL"].astype('category')
X["Chromosome"] = X["Chromosome"].astype('category')

# Column positions of the categorical variables; passed to MissForest below
# as cat_vars.
category_names = X.select_dtypes(['category']).columns
cat_cols = [X.columns.get_loc(name) for name in category_names]

# Fixed random_state so the imputation is repeatable.
imputer = MissForest(random_state=123)

# Fill in the NA values, treating the columns in cat_cols as categorical.
X_imputed = imputer.fit_transform(X, cat_vars=cat_cols)



# Start from the training frame that still has every non-interaction column
# and write the imputed values back into it.
Complete_train = TRAIN_Data_1.copy()

# Look the imputed columns up by name rather than hard-coding positions 0
# and 1 of the imputed array.
# NOTE(review): assumes X_imputed keeps the column order of X - confirm
# against the MissForest documentation.
essential_pos = X.columns.get_loc("ESSENTIAL")
chromosome_pos = X.columns.get_loc("Chromosome")

Complete_train["ESSENTIAL"] = X_imputed[:, essential_pos].astype(int)
# Turn the integer codes back into the original ESSENTIAL labels.
Complete_train = Complete_train.replace(reverse_essential_encode)
Complete_train["Chromosome"] = X_imputed[:, chromosome_pos].astype(int)

# Drop columns with at most one distinct value.
Complete_train = Complete_train.drop(columns=Complete_train.columns[Complete_train.nunique() <= 1])

# Remove the 14 columns at positions 332..345.
# NOTE(review): these positions depend on which constant columns were
# dropped just above, so they shift if the data changes - confirm that the
# slice still targets the intended columns.
Complete_wo_int_and_just_label = Complete_train.drop(columns=Complete_train.columns[332:346])

######## PROTEIN INTERACTION ################

def clean_protien_interactions_vals(vals):
    """Clean one raw interaction-strength string.

    Every '?' is removed first, then any trailing '.' characters are
    stripped from the result.

    Parameters
    ----------
    vals : str
        Raw strength value.

    Returns
    -------
    str
        The cleaned value (possibly the empty string).
    """
    without_placeholders = vals.replace("?", "")
    return without_placeholders.rstrip('.')

# Pairwise interaction table. The file has no header row, so the four
# columns are named here.
protein_interactions = pd.read_csv (r'D:\Documents\WATERLOO\3B\STAT 441\protein-localization\protein_interactions.csv', 
                                    header=None,
                                    names = ["Protein1","Protein2","int_type","strength"])

# Remove '?' markers and trailing dots from the raw strength strings.
protein_interactions["strength"] = protein_interactions["strength"].apply(clean_protien_interactions_vals)

# Empty / whitespace-only cells become missing values. np.nan is used
# because the np.NaN alias was removed in NumPy 2.0.
protein_interactions = protein_interactions.replace(r'^\s*$', np.nan, regex=True)

# Proteins in order of first appearance in the Protein1 column.
unique_proteins = protein_interactions.Protein1.unique()


int_type1 = [None] * unique_proteins.size
int_type2 = [None] * unique_proteins.size
int_strength = [None] * unique_proteins.size

# Summarise each protein's interactions: mean strength, whether it has one
# or several interaction types, and its dominant type.
for x in range(unique_proteins.size):
    x_ints = protein_interactions[protein_interactions.Protein1==unique_proteins[x]]

    # Mean strength, computed once; proteins with no usable strength get 0.
    mean_strength = x_ints.strength.astype(float).mean()
    int_strength[x] = 0 if pd.isnull(mean_strength) else mean_strength

    # Counts per interaction type, most frequent first (missing values are
    # not counted).
    type_counts = x_ints.int_type.value_counts()

    if x_ints.int_type.unique().size > 1:
        int_type1[x] = 'Mixed'

        # Compare the two most frequent types by position. .iloc is used
        # because type_counts is indexed by type label, and integer keys on
        # such a Series are no longer reliably treated as positions. The
        # length guard covers the case where unique() counted a missing
        # value that value_counts() left out.
        if len(type_counts) > 1 and type_counts.iloc[0] == type_counts.iloc[1]:
            int_type2[x] = 'Both'
        else:
            int_type2[x] = type_counts.idxmax()

    else:
        int_type1[x] = 'Single'
        int_type2[x] = type_counts.idxmax()

interaction_summary = pd.DataFrame(data={'Protein': unique_proteins, 
                                         'int_type1': int_type1,
                                         'int_type2':int_type2,
                                         'int_strength':int_strength})

interaction_summary

Iteration: 0
Iteration: 1
Iteration: 2


Unnamed: 0,Protein,int_type1,int_type2,int_strength
0,P238510,Mixed,Both,0.351358
1,P235550,Single,Physical,-0.023274
2,P235621,Single,Physical,0.143343
3,P235265,Single,Physical,0.212340
4,P234935,Single,Physical,0.015295
...,...,...,...,...
455,P235756,Single,Physical,0.938279
456,P234715,Single,Genetic,0.373344
457,P238567,Single,Physical,0.642485
458,P235620,Single,Physical,0.030146


In [57]:
# Attach the per-protein interaction summary. This is an inner join (the
# pd.merge default), so only proteins present in both frames are kept.
Complete_train = pd.merge(
    Complete_wo_int_and_just_label,
    interaction_summary,
    how='inner',
    on='Protein',
)

In [5]:
# Show the merged training frame as the cell output.
Complete_train

Unnamed: 0,Protein,ESSENTIAL,CLASS Actin related proteins,CLASS Adaptins,CLASS ATPases,CLASS Cyclins,CLASS GTP-binding proteins,CLASS GTP/GDP dissociation inhibitors (GDIs),CLASS GTP/GDP-exchange factors (GEFs),CLASS GTPase activating proteins (GAPs),...,NUM INTERACTING WITH FUNCTION IONIC HOMEOSTASIS,NUM INTERACTING WITH FUNCTION METABOLISM,NUM INTERACTING WITH FUNCTION PROTEIN DESTINATION,NUM INTERACTING WITH FUNCTION PROTEIN SYNTHESIS,NUM INTERACTING WITH FUNCTION TRANSCRIPTION,NUM INTERACTING WITH FUNCTION TRANSPORT FACILITATION,Label,int_type1,int_type2,int_strength
0,P234430,Non-Essential,No,No,No,No,No,No,No,No,...,0,0,1,0,0,1,0,Single,Physical,0.291315
1,P239467,Essential,No,No,No,No,No,No,No,No,...,0,0,0,0,7,0,0,Mixed,Physical,-0.063534
2,P234444,Non-Essential,No,No,No,No,No,No,No,No,...,0,0,0,0,3,0,0,Single,Physical,0.430887
3,P234445,Non-Essential,No,No,No,No,No,No,No,No,...,0,1,0,0,2,0,0,Single,Physical,0.748891
4,P234469,Non-Essential,No,No,No,No,No,No,No,No,...,0,0,0,0,0,0,0,Single,Genetic,-0.060037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,P235872,Essential,No,No,No,No,No,No,No,No,...,0,0,0,0,0,0,8,Single,Genetic,0.616340
456,P235924,Non-Essential,No,No,No,No,No,No,No,No,...,0,0,0,0,0,0,6,Single,Genetic,-0.065316
457,P235922,Non-Essential,No,No,No,No,No,No,No,No,...,0,0,1,0,0,1,2,Single,Genetic,0.000000
458,P235885,Non-Essential,No,No,No,No,No,No,No,No,...,0,1,1,0,0,0,1,Single,Physical,0.416236


In [80]:
# Write the merged training frame to disk without the row index.
Complete_train.to_csv(
    r'D:\Documents\WATERLOO\3B\STAT 441\protein-localization\TESTING_REPLACE.csv',
    index=False,
)