# Preprocessing Example 
##### Author: Ryan Schildcrout

## Summary
##### This notebook shows an example of how to load, clean, impute, scale, and save omics data (here, histone PTM profiles) from the Cancer Cell Line Encyclopedia (CCLE).


In [None]:
"""### Import libraries"""

import pandas as pd
import sklearn
import numpy as np
import scipy
import missingno
from adjustText import adjust_text
np.random.seed(123)
import pkg_resources
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# CCLE histone PTM example: import and basic tidy-up
#
# The CCLE omics tables are available from the DepMap portal:
# https://depmap.org/portal/data_page/?tab=currentRelease
#
# Steps, in order:
#   1. read the global chromatin profiling table from the working directory;
#   2. drop 'BroadID' (keep it instead if you plan to merge with other datasets
#      keyed on similar cell line IDs);
#   3. upper-case the cell line names so they are standardized;
#   4. use the cell line names as the row index.
hist_ccle = (
    pd.read_csv("./CCLE_GlobalChromatinProfiling_20181130.csv")
    .drop(columns=['BroadID'])
    .assign(CellLineName=lambda table: table['CellLineName'].str.upper())
    .set_index('CellLineName')
)

# Quick look at the first three rows to check the layout
print(hist_ccle.head(3))

In [None]:
# Pre-scaling 0 imputation example
#
# Infinite values are first mapped to NaN so that they are zero-filled together
# with the genuinely missing entries. DataFrame.replace returns a NEW frame
# (it does not modify hist_ccle in place), so its result has to be assigned;
# otherwise any +/-inf values would pass straight through fillna untouched.
#
# Note: this overwrites hist_ccle, so every later cell that reads hist_ccle
# sees the zero-filled table.
hist_ccle = hist_ccle.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Z-score scaling
# Fit the scaler on the table; fit() returns the scaler itself, which is
# printed so the fitted estimator shows up in the cell output.
scaler = StandardScaler().fit(hist_ccle)
print(scaler, flush=True)

# Apply the fitted scaling and wrap the resulting array back into a DataFrame
# that carries the original row and column labels.
hist_ccle_scaled = pd.DataFrame(
    scaler.transform(hist_ccle),
    index=hist_ccle.index,
    columns=hist_ccle.columns,
)

In [None]:
# Example of KNN imputation
#
# NOTE: the zero-imputation cell earlier in this notebook overwrites hist_ccle
# with a zero-filled table. If that cell has been run, there are no NaNs left
# and this cell is a no-op; re-load the raw data (and skip the zero-fill cell)
# to actually exercise KNN imputation.

# Count missing values before imputation (kept under its own name so it is not
# overwritten by the post-imputation count below)
total_nan_before = hist_ccle.isnull().sum().sum()

# Impute missing values with a KNNImputer using its default settings
imputer = KNNImputer()
hist_imputed = imputer.fit_transform(hist_ccle)

# fit_transform returns a bare array; restore the row and column labels
hist_ccle_imputed = pd.DataFrame(hist_imputed,index=hist_ccle.index,columns=hist_ccle.columns)

# Ensure missing values have been replaced: report both counts and fail loudly
# if any NaN survived, instead of computing the count and discarding it
total_nan = hist_ccle_imputed.isnull().sum().sum()
print(f"Missing values before KNN imputation: {total_nan_before}; after: {total_nan}")
assert total_nan == 0, f"{total_nan} missing values remain after KNN imputation"

In [None]:
# IMPORTANT: when using cross-validation (CV), scaling and KNN imputation
# should be performed within each CV fold rather than once on the full table.

# Write the preprocessed table to disk.
# This saves hist_ccle itself, not hist_ccle_scaled or hist_ccle_imputed.
hist_ccle.to_csv(path_or_buf='./preprocessed_histone_PTMs_ccle.csv')