In [1]:

import logging
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import seaborn
import sklearn.preprocessing

import scgenome
import scgenome.cnclones
import scgenome.cnfilter
import scgenome.cnplot
import scgenome.db.qc
import scgenome.utils

# Route timestamped log records at INFO level and above to stderr.
LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stderr,
    format=LOGGING_FORMAT,
)

# Sample identifiers; the same IDs appear in the `sample_id` column of the
# data previewed further down.
sample_ids = ['SA1090', 'SA921', 'SA922']



Download the Zenodo data (the OV2295 copy-number and cell-metrics tables) using wget:


In [None]:

!mkdir zenodo_data/
!wget https://zenodo.org/record/3445364/files/ov2295_cell_cn.csv.gz?download=1 -O zenodo_data/ov2295_cell_cn.csv.gz
!wget https://zenodo.org/record/3445364/files/ov2295_cell_metrics.csv.gz?download=1 -O zenodo_data/ov2295_cell_metrics.csv.gz



# Load CN data


In [4]:

# Identifier columns present in both tables. They are read as pandas
# categoricals rather than plain strings.
ID_DTYPES = {
    'cell_id': 'category',
    'sample_id': 'category',
    'library_id': 'category',
}

# Copy-number table (one row per cell and genomic bin); `chr` is additionally
# read as a categorical.
cn_data = pd.read_csv(
    'zenodo_data/ov2295_cell_cn.csv.gz',
    dtype=dict(ID_DTYPES, chr='category'),
)

# Cell-level metrics table.
metrics_data = pd.read_csv(
    'zenodo_data/ov2295_cell_metrics.csv.gz',
    dtype=ID_DTYPES,
)

# `scgenome.utils` is imported explicitly in the import cell instead of relying
# on another scgenome submodule pulling it in as a side effect.
# NOTE(review): the return value is discarded, so this presumably updates both
# frames in place so that their categorical columns share one set of
# categories -- confirm against the scgenome.utils source.
scgenome.utils.union_categories([cn_data, metrics_data])


In [10]:

# Preview the long-format copy-number table; the notebook display elides the
# middle rows of a frame this large, showing only the first and last few.
cn_data


Unnamed: 0,cell_id,sample_id,library_id,chr,start,end,reads,copy,state,gc
0,SA922-A90554B-R34-C70,SA922,A90554B,1,1,500000,13,,6,0.5
1,SA922-A90554B-R34-C70,SA922,A90554B,1,500001,1000000,442,,6,0.5
2,SA922-A90554B-R34-C70,SA922,A90554B,1,1000001,1500000,461,6.672340,6,0.5
3,SA922-A90554B-R34-C70,SA922,A90554B,1,1500001,2000000,478,5.211916,6,0.5
4,SA922-A90554B-R34-C70,SA922,A90554B,1,2000001,2500000,594,8.384862,6,0.5
...,...,...,...,...,...,...,...,...,...,...
12200991,SA1090-A96213A-R29-C58,SA1090,A96213A,Y,57000001,57500000,0,,0,0.5
12200992,SA1090-A96213A-R29-C58,SA1090,A96213A,Y,57500001,58000000,0,,0,0.5
12200993,SA1090-A96213A-R29-C58,SA1090,A96213A,Y,58000001,58500000,0,,0,0.5
12200994,SA1090-A96213A-R29-C58,SA1090,A96213A,Y,58500001,59000000,81,,0,0.5


In [11]:

# Show the first five rows of the per-cell metrics table.
metrics_data.head(n=5)


Unnamed: 0,cell_id,unpaired_mapped_reads,paired_mapped_reads,unpaired_duplicate_reads,paired_duplicate_reads,unmapped_reads,percent_duplicate_reads,estimated_library_size,total_reads,total_mapped_reads,...,mean_state_mads,mean_state_vars,mad_neutral_state,breakpoints,mean_copy,state_mode,log_likelihood,true_multiplier,quality,order
0,SA922-A90554B-R34-C70,70068,2462814,19418,711792,352397,0.290496,3404765.0,5348094,4995697,...,0.076997,0.623423,0.04115853,150,5.469529,7,-6762.713273,5.941909,0.482,139.0
1,SA922-A90554B-R28-C09,49767,2723310,19848,701345,588432,0.260503,4353052.0,6084820,5496388,...,0.070188,0.361665,0.04132074,93,4.132577,5,-4189.124414,4.940759,0.88,572.0
2,SA922-A90554B-R28-C03,1113,5605,198,70,1261949,0.028078,235789.0,1274272,12323,...,0.083333,0.832407,3.672209e-08,2136,3.294061,2,1333.176822,2.0,0.0,64.0
3,SA922-A90554B-R28-C07,113421,5554326,44742,1636879,905778,0.297332,7488663.0,12127852,11222074,...,0.133991,0.224699,0.08303813,93,4.111607,5,-2830.590149,2.013123,0.984,536.0
4,SA922-A90554B-R28-C05,1054,20043,25,608,567186,0.031915,343193.0,608326,41140,...,0.166667,2.073303,4.146547e-08,3099,6.902378,11,-10197.616506,1.972223,0.0,185.0


In [5]:

# GC content was not included in the downloaded copy-number table, so a
# constant placeholder of 0.5 is written to every row. This mutates `cn_data`
# in place.
# NOTE(review): presumably later scgenome steps expect a `gc` column -- confirm.
# NOTE(review): the `cn_data` preview earlier in this notebook already shows
# this column, so that preview was executed after this cell; on a fresh
# top-to-bottom run the preview will not contain `gc`.
cn_data['gc'] = 0.5



Create a matrix of `copy` values with cells as rows and genomic regions (bins) as columns


In [12]:

# Reshape the long table into a wide matrix: index the `copy` values by
# genomic bin and cell, spread the cells across columns, replace missing
# values with 0, then flip so each row is a cell and each column is a bin.
bin_and_cell_keys = ['chr', 'start', 'end', 'cell_id']
cn = (
    cn_data
    .set_index(bin_and_cell_keys)
    .loc[:, 'copy']
    .unstack('cell_id')
    .fillna(0)
    .T
)

cn


chr,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,...,11,11,11,11,11,11,11,11,11,11
start,1,500001,1000001,1500001,2000001,2500001,3000001,3500001,4000001,4500001,...,130500001,131000001,131500001,132000001,132500001,133000001,133500001,134000001,134500001,135000001
end,500000,1000000,1500000,2000000,2500000,3000000,3500000,4000000,4500000,5000000,...,131000000,131500000,132000000,132500000,133000000,133500000,134000000,134500000,135000000,135500000
cell_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
SA1090-A96213A-R34-C39,0.0,0.0,0.0,0.0,0.0,0.0,0.039154,0.030606,0.006001,0.000000,...,2.895056,3.679613,3.259979,3.637160,3.608536,3.100619,3.330293,2.827470,0.0,0.0
SA922-A90554B-R27-C51,0.0,0.0,0.0,0.0,0.0,0.0,0.095686,0.055453,0.000000,0.058369,...,2.871006,1.507647,1.852678,1.661813,1.638240,1.772827,1.633959,1.608174,0.0,0.0
SA922-A90554B-R31-C24,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,3.008929,6.017857,3.008929,15.044643,9.026786,3.008929,6.017857,3.008929,0.0,0.0
SA921-A90554A-R14-C14,0.0,0.0,0.0,0.0,0.0,0.0,0.081385,0.060823,0.009538,0.011985,...,4.218101,4.075016,4.058048,3.913305,4.097246,4.313535,3.940552,4.302854,0.0,0.0
SA921-A90554A-R10-C28,0.0,0.0,0.0,0.0,0.0,0.0,0.035752,0.074982,0.016626,0.019092,...,4.329614,3.980703,3.939489,4.515300,4.141756,4.212274,3.583691,4.263613,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SA1090-A96213A-R27-C15,0.0,0.0,0.0,0.0,0.0,0.0,0.034645,0.031128,0.009789,0.009939,...,1.940525,1.955604,2.238605,2.249047,1.972062,1.706605,2.164498,1.792833,0.0,0.0
SA921-A90554A-R05-C17,0.0,0.0,0.0,0.0,0.0,0.0,0.285950,0.000000,0.000000,0.000000,...,2.187677,4.806511,2.897251,2.434886,4.147951,6.534556,3.392933,2.960908,0.0,0.0
SA922-A90554B-R26-C70,0.0,0.0,0.0,0.0,0.0,0.0,0.060531,0.036403,0.012100,0.012109,...,2.066546,2.252682,2.302790,1.998031,2.643217,2.419191,1.888181,2.299953,0.0,0.0
SA922-A90554B-R23-C57,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,1.000000,4.000000,2.000000,4.000000,2.000000,4.000000,2.000000,2.000000,0.0,0.0
