## Overview of the final features

In [1]:
import pandas as pd
import numpy as np
import time

import os

import matplotlib.pyplot as plt
%matplotlib inline

# Folder, relative to the notebook's working directory, that every cell below
# reads its inputs from and writes its outputs to.
_FOLDER_2 = "results/"
# Commented-out absolute-path alternative; note that it binds `_FOLDER`,
# not `_FOLDER_2`, so enabling it alone would not redirect the cells below.
# _FOLDER = "/home/acq18mk/master/results/"

In [2]:
# Display the entries of the results folder (order is whatever the OS returns).
os.listdir(_FOLDER_2)

['test02_min10_restr_coef.csv',
 'filt2_1.png',
 'filt2_2.png',
 'merged_drug_profiles_cells_sigmoid4_123.csv',
 'drugs_with_pubchem_id.txt',
 'train08_min10_restr_coef.csv',
 'IC50_methods.png',
 'filt_fit_1_bounds.png',
 'test02_min10_restr.csv',
 'drug_features_with_pubchem_properties.csv',
 'drugs_with_no_pubchem_id.txt',
 'filt_Fig1b.png',
 'filt_Fig1a.png',
 'filt_fit_0.png',
 'drug_features_with_pubchem_properties_0.csv',
 'test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10',
 'target_target_pathway_df.csv',
 'filt_fit_0_bounds.png',
 'fit_filtered_drug_profiles_123.csv',
 'X_features_Targets.txt',
 'outlier_coef3.png',
 '.ipynb_checkpoints',
 'filt0.png',
 'drug_ids_50.txt',
 'outlier_coef1_1.png',
 'best_model_coef_4.csv',
 'filt3.png',
 'train08_min10_restr.csv',
 'filt_fit_2_bounds.png',
 'train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv',
 'outlier_coef1_2.png',
 'best_model_coef_1.csv',
 'filtered_drug_profiles_123.csv',
 'X_PubChem_properties.

In [3]:
# Potential feature groups for ML (counts as reported when this cell was run):
# - X_cancer_cell_lines - 1073
# - X_PubChem_properties - 26
# - X_targets - 229
# - X_target_pathway - 23


def _read_feature_names(file_name):
    """Read one feature-list file from the results folder.

    Parameters
    ----------
    file_name : str
        File name appended to `_FOLDER_2`; the file holds one name per line.

    Returns
    -------
    list of str
        The lines of the file in order, with trailing newlines removed.
    """
    with open(_FOLDER_2 + file_name, 'r') as f:
        return [line.rstrip('\n') for line in f]


X_cancer_cell_lines = _read_feature_names("X_features_cancer_cell_lines.txt")
print("Number of cancer cell lines features:", len(X_cancer_cell_lines))

X_PubChem_properties = _read_feature_names("X_PubChem_properties.txt")
print("Number of PubChem drug properties:", len(X_PubChem_properties))

X_targets = _read_feature_names("X_features_Targets.txt")
print("Number of possible targets:", len(X_targets))

X_target_pathway = _read_feature_names("X_features_Target_Pathway.txt")
print("Number of possible target pathways:", len(X_target_pathway))

# Upper bound on the feature count: the four lists simply added together.
print("\n Maximum number of features:",
      len(X_cancer_cell_lines) + len(X_PubChem_properties) + len(X_targets) + len(X_target_pathway))

Number of cancer cell lines features: 1073
Number of PubChem drug properties: 26
Number of possible targets: 229
Number of possible target pathways: 23

 Maximum number of features: 1351


### Columns for scaling

In [4]:
# Load the merged table and discard the "Unnamed: 0" column
# (presumably a row index saved by an earlier to_csv call - TODO confirm).
merged_csv_path = _FOLDER_2 + "merged_fitted_sigmoid4_123_with_drugs_properties.csv"
df = pd.read_csv(merged_csv_path).drop(columns=["Unnamed: 0"])

In [5]:
# Candidate columns: all four feature lists plus the MAX_CONC column.
all_columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway + ["MAX_CONC"]

# Keep the columns that take more than two distinct values; columns with at
# most two distinct values are excluded from the list.
columns_to_normalise = [name for name in all_columns if df[name].nunique() > 2]

# Show: how many were selected, how many PubChem + MAX_CONC candidates exist,
# and the selected names themselves.
len(columns_to_normalise), len(X_PubChem_properties + ["MAX_CONC"]), columns_to_normalise

(15,
 27,
 ['molecular_weight',
  '2bonds',
  'xlogp',
  'surface_area',
  'complexity',
  'h_bond_donor_count',
  'h_bond_acceptor_count',
  'rotatable_bond_count',
  'heavy_atom_count',
  'atom_stereo_count',
  'defined_atom_stereo_count',
  'undefined_atom_stereo_count',
  'bond_stereo_count',
  'covalent_unit_count',
  'MAX_CONC'])

In [6]:
# Persist the selected column names, one per line, in the results folder.
with open(_FOLDER_2 + "columns_to_normalise.txt", 'w') as out_file:
    out_file.writelines(f"{name}\n" for name in columns_to_normalise)

### Data Sparsity

In [7]:
# (number of rows, number of columns) of the merged table.
df.shape

(2612, 1384)

In [8]:
# Columns of df that are not in any of the four feature lists.
known_features = set(X_cancer_cell_lines) | set(X_PubChem_properties) | set(X_targets) | set(X_target_pathway)
difference = set(df.columns).difference(known_features)
len(difference), difference

(33,
 {'COSMIC_ID',
  'DRUG_ID',
  'Drug_Name',
  'H',
  'MAX_CONC',
  'Target',
  'Target_Pathway',
  'elements',
  'fd_num_0',
  'fd_num_1',
  'fd_num_2',
  'fd_num_3',
  'fd_num_4',
  'fd_num_5',
  'fd_num_6',
  'fd_num_7',
  'fd_num_8',
  'fd_num_9',
  'molecular_formula',
  'norm_cells_0',
  'norm_cells_1',
  'norm_cells_2',
  'norm_cells_3',
  'norm_cells_4',
  'norm_cells_5',
  'norm_cells_6',
  'norm_cells_7',
  'norm_cells_8',
  'norm_cells_9',
  'param_1',
  'param_2',
  'param_3',
  'param_4'})

In [9]:
# Dataset 1: cancer cell line features only.
columns = X_cancer_cell_lines
dataset_1 = df[columns]
print("Dataset 1 - only cancer cell lines features:", dataset_1.shape)

# Sparsity is the percentage of entries that are exactly zero.
zero_elements = (dataset_1 == 0).values.sum()
all_elements = dataset_1.size
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 1 - only cancer cell lines features: (2612, 1073)

Number of zero elements: 2,704,613.00
Number of all elements: 2,802,676.00
Sparsity of data in %: 96.501


In [10]:
# Dataset 2: cancer cell line features plus the drug description
# (targets, target pathways and MAX_CONC).
columns = X_cancer_cell_lines + X_targets + X_target_pathway + ["MAX_CONC"]
dataset_2 = df[columns]
print("Dataset 2 - cancer cell lines features + drug description:", dataset_2.shape)

# Sparsity is the percentage of entries that are exactly zero.
zero_elements = (dataset_2 == 0).values.sum()
all_elements = dataset_2.size
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 2 - cancer cell lines features + drug description: (2612, 1326)

Number of zero elements: 3,355,710.00
Number of all elements: 3,463,512.00
Sparsity of data in %: 96.887


In [11]:
# Dataset 3: cancer cell line features plus PubChem drug properties and MAX_CONC.
columns = X_cancer_cell_lines + X_PubChem_properties + ["MAX_CONC"]
dataset_3 = df[columns]
print("Dataset 3 - cancer cell lines features + PubChem drug properties:", dataset_3.shape)

# Sparsity is the percentage of entries that are exactly zero.
zero_elements = (dataset_3 == 0).values.sum()
all_elements = dataset_3.size
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 3 - cancer cell lines features + PubChem drug properties: (2612, 1100)

Number of zero elements: 2,736,419.00
Number of all elements: 2,873,200.00
Sparsity of data in %: 95.239


In [12]:
# Dataset 4: cancer cell line features + drug description (targets, target
# pathways, MAX_CONC) + PubChem drug properties, i.e. every feature list.
columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway + ["MAX_CONC"]
dataset_4 = df[columns]
# The label previously repeated the "Dataset 3" text; it now names the
# dataset this cell actually builds.
print("Dataset 4 - cancer cell lines features + drug description + PubChem drug properties:", dataset_4.shape)

# Sparsity is the percentage of entries that are exactly zero; the count is
# taken with one vectorised sum rather than a Python-level sum over rows.
zero_elements = (dataset_4 == 0).values.sum()
all_elements = dataset_4.size
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 3 - cancer cell lines features + PubChem drug properties: (2612, 1352)

Number of zero elements: 3,387,516.00
Number of all elements: 3,531,424.00
Sparsity of data in %: 95.925
