# Survival Analysis of PBTA miRNA data
### Author: Shehbeel Arif
### Affiliation: Center for Data-driven Discovery in Biomedicine (D3b), The Children's Hospital of Philadelphia
### Contact: arifs2@chop.edu
### Goal: To find prognostic miRNA that correlate with overall survival (OS) and progression-free survival (PFS)

In [1]:
# Data Handling Libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Survival Libraries
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

In [53]:
# Load dataset
df = pd.read_csv("/Users/arifs2/OneDrive - Children's Hospital of Philadelphia/OpenPBTA miRNA Projects/datasets/pbta_mirna_clinical_data.csv")
df

Unnamed: 0,sample_id,Kids_First_Biospecimen_ID,aliquot_id,Kids_First_Participant_ID,experimental_strategy,sample_type,composition,tumor_descriptor,primary_site,reported_gender,...,miR.944,miR.95.3p,miR.95.5p,miR.9.5p,miR.96.3p,miR.96.5p,miR.98.3p,miR.99a.5p,miR.99b.3p,miR.99b.5p
0,7316-100,BS_BHR08WGW,601598,PT_6TZR2DH1,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Suprasellar/Hypothalamic/Pituitary,Female,...,79,687,7,30505,16,10291,12,95557,211,6040
1,7316-101,BS_QV51J756,588338,PT_CWD717Q0,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Temporal Lobe,Male,...,2,376,1,8763,1,9,0,91472,552,16924
2,7316-111,BS_QXKRN6CR,470423,PT_RM5S859Q,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Occipital Lobe,Female,...,1,309,3,209761,3,55,9,140897,606,15929
3,7316-114,BS_23QW0BBA,577714,PT_3X3MF8ZD,RNA-Seq,Tumor,Solid Tissue,Second Malignancy,Brain Stem-Medulla;Brain Stem- Pons;Spinal Cor...,Female,...,1,1307,1,60231,0,93,0,22507,56,1979
4,7316-117,BS_DPF1CX0G,570116,PT_ZVV78QP5,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,Male,...,0,126,0,101800,0,51,4,20144,74,3000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,7316-944,BS_9DFBHHB5,549611,PT_KMHGNCNR,RNA-Seq,Tumor,Solid Tissue,Recurrence,Cerebellum/Posterior Fossa;Optic Pathway;Other...,Male,...,1,148,1,83036,0,12,2,19046,105,3040
251,7316-946,BS_AP0SZ364,588444,PT_PGVQ4XRZ,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,Male,...,1,474,0,153845,1,43,6,50332,313,8653
252,7316-949,BS_86Q14TZG,734543,PT_2WA5PM32,RNA-Seq,Tumor,Solid Tissue,Recurrence,Suprasellar/Hypothalamic/Pituitary,Male,...,84,253,3,74435,17,19924,3,109317,142,5116
253,7316-954,BS_5YJEPPTP,549590,PT_RFF7MKTC,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,Female,...,1,136,0,80083,0,47,2,33554,116,3764


In [55]:
# Drop NAs present in OS_survival column
df = df.dropna(subset=['OS_days', 'OS_status_boolean'])

### Split into Training and Testing Set

In [56]:
X = df.iloc[:,38:]
y = df.iloc[:,35:37]
X.head(5)

Unnamed: 0,let.7a.2.3p,let.7a.3p,let.7a.5p,let.7b.5p,let.7c.3p,let.7c.5p,let.7d.3p,let.7d.5p,let.7e.3p,let.7e.5p,...,miR.944,miR.95.3p,miR.95.5p,miR.9.5p,miR.96.3p,miR.96.5p,miR.98.3p,miR.99a.5p,miR.99b.3p,miR.99b.5p
0,21,194,85224,54773,142,77582,1511,21696,165,10168,...,79,687,7,30505,16,10291,12,95557,211,6040
1,90,4,87665,92903,25,58888,3376,23006,390,17655,...,2,376,1,8763,1,9,0,91472,552,16924
2,76,141,184202,125974,364,157243,1609,26639,271,14739,...,1,309,3,209761,3,55,9,140897,606,15929
3,10,3,70739,29303,23,34017,644,6348,50,3781,...,1,1307,1,60231,0,93,0,22507,56,1979
4,27,13,51619,26170,27,34020,894,8469,83,4982,...,0,126,0,101800,0,51,4,20144,74,3000


### Normalize Data

In [59]:
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
Xs = pd.DataFrame(Xs)
Xs


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2073,2074,2075,2076,2077,2078,2079,2080,2081,2082
0,-0.625522,3.086510,-0.213545,0.092322,0.241995,0.426750,-0.114647,0.637049,-0.188672,0.065994,...,4.125782,0.250736,0.827944,-0.789401,0.231592,0.232582,1.418094,0.740787,-0.179988,-0.309450
1,0.634875,-1.010894,-0.175167,1.025633,-0.702163,-0.048959,1.557518,0.776080,0.732658,1.380742,...,-0.234432,-0.242103,-0.202692,-1.025285,-0.177678,-0.165090,-0.872849,0.657648,0.218350,1.628858
2,0.379142,1.943550,1.342623,1.835115,2.033475,2.453894,-0.026780,1.161650,0.245377,0.868681,...,-0.291058,-0.348277,0.140854,1.155385,-0.123109,-0.163310,0.845358,1.663553,0.281430,1.451660
3,-0.826455,-1.032460,-0.441284,-0.531110,-0.718303,-0.681854,-0.892002,-0.991836,-0.659574,-1.055590,...,-0.291058,1.233244,-0.202692,-0.466898,-0.204963,-0.161841,-0.872849,-0.745938,-0.361051,-1.032665
4,-0.515923,-0.816807,-0.741895,-0.607796,-0.686024,-0.681778,-0.667851,-0.766734,-0.524446,-0.844689,...,-0.347685,-0.638275,-0.374464,-0.015907,-0.204963,-0.163465,-0.109202,-0.794030,-0.340024,-0.850838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,-0.497656,-0.148283,-0.674824,-0.504013,-0.686024,-0.581211,-0.802342,-0.662302,-0.684143,-0.666978,...,-0.291058,-0.603412,-0.202692,-0.219481,-0.204963,-0.164974,-0.491026,-0.816377,-0.303812,-0.843714
246,0.233009,0.304588,-0.403424,-0.364763,-0.274468,-0.330938,-0.304727,0.145667,-0.000311,-0.318930,...,-0.291058,-0.086803,-0.374464,0.548741,-0.177678,-0.163775,0.272622,-0.179639,-0.060837,0.155893
247,-0.552456,1.124069,0.257245,0.202199,0.330762,0.720843,-0.103888,0.549173,-0.311516,-0.287146,...,4.408913,-0.437020,0.140854,-0.312795,0.258877,0.605153,-0.300114,1.020832,-0.260590,-0.474004
248,-0.515923,-0.601154,-0.479363,-0.373232,-0.653745,-0.371934,-0.984352,-0.715473,-0.557204,-0.612716,...,-0.291058,-0.622428,-0.374464,-0.251519,-0.204963,-0.163620,-0.491026,-0.521108,-0.290962,-0.714778


In [60]:
# Convert y (OS_surival and boolean censor) to structured array, which is required by scikit-learn survival function
y = y.to_records(index=False, column_dtypes={'OS_days':'f8','OS_status_boolean': 'bool'})
y

rec.array([(False, 3.3610e+03), (False, 5.3600e+02), (False, 1.7010e+03),
           ( True, 8.2810e+03), (False, 1.1450e+03), (False, 1.1560e+03),
           (False, 2.3160e+03), (False, 2.8660e+03), (False, 2.3010e+03),
           (False, 1.8330e+03), (False, 3.4830e+03), (False, 2.5960e+03),
           ( True, 2.5960e+03), ( True, 2.5960e+03), (False, 4.8300e+02),
           (False, 7.3990e+03), ( True, 8.9000e+01), ( True, 2.7980e+03),
           (False, 4.7710e+03), (False, 2.9190e+03), (False, 3.7900e+02),
           (False, 6.9760e+03), (False, 3.3730e+03), (False, 1.3960e+03),
           (False, 1.1000e+03), (False, 7.1880e+03), ( True, 1.3200e+02),
           ( True, 1.3200e+02), ( True, 1.3200e+02), (False, 1.8350e+03),
           (False, 1.4820e+03), ( True, 1.7610e+03), (False, 1.6860e+03),
           (False, 2.2620e+03), ( True, 2.2500e+02), ( True, 2.2500e+02),
           ( True, 2.2500e+02), (False, 1.7560e+03), ( True, 6.3800e+02),
           ( True, 6.3800e+02), ( True

### Multivariate Survival Model

In [61]:
#from sksurv.linear_model import CoxPHSurvivalAnalysis
estimator = CoxPHSurvivalAnalysis()
estimator.fit(X, y)

  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, optimizer.gradient,
  risk_set += np.exp(xw[k])
  delta = solve(optimizer.hessian, optimizer.gradient,
  delta = solve(optimizer.hessian, op

ValueError: search direction contains NaN or infinite values