In [81]:
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix
import anndata as ad

## Both files supplied in GEO were preprocessed in bash before being loaded here

## read in sample/cell info

In [4]:
# Load the processed per-cell metadata file (tab-delimited) into a DataFrame.
pts = pd.read_csv(
    "/data/rna_rep_learning/sadefeldman/GSE120575_processed_metadata.txt",
    sep="\t",
)

  """Entry point for launching an IPython kernel.


In [7]:
# Tally how many rows of the metadata table carry each value of the therapy column.
pts['characteristics: therapy'].value_counts()

anti-PD1          11653
anti-CTLA4+PD1     4121
anti-CTLA4          517
Name: characteristics: therapy, dtype: int64

#### drop empty columns

In [23]:
# Check that all are empty: per-column count of non-missing entries across the
# 'molecule' .. 'Unnamed: 34' label range (a count of 0 means the column holds no values).
pts.loc[:, 'molecule':'Unnamed: 34'].notna().sum()

In [23]:
# Drop every column from position 7 onward, keeping only the first seven.
# NOTE(review): presumably position 7 is 'molecule', i.e. the same range that was
# checked for emptiness — confirm against pts.columns before relying on it.
pts = pts.drop(columns=pts.columns[7:])

In [34]:
# Number of distinct values in the 'title' column (a missing value would count as one).
pts['title'].nunique(dropna=False)

16291

## clean up column names, drop meaningless columns

In [92]:
# Discard the three columns this section treats as meaningless.
unneeded_cols = ["Sample name", "organism", "source name"]
pts = pts.drop(columns=unneeded_cols)

In [94]:
# Display the remaining column labels so the rename that follows can use the exact spellings.
pts.columns

Index(['title',
       'characteristics: patinet ID (Pre=baseline; Post= on treatment)',
       'characteristics: response', 'characteristics: therapy'],
      dtype='object')

In [95]:
# Map the verbose headers onto short names. The keys must match the file's headers
# exactly, including the "patinet" spelling (sic).
short_names = {
    'title': 'cellid',
    'characteristics: patinet ID (Pre=baseline; Post= on treatment)': "pid",
    'characteristics: response': "response",
    'characteristics: therapy': "therapy",
}
pts = pts.rename(columns=short_names)

In [96]:
# Preview the first rows of the metadata table after the column clean-up.
pts.head()

Unnamed: 0,cellid,pid,response,therapy
0,A10_P3_M11,Pre_P1,Responder,anti-CTLA4
1,A11_P1_M11,Pre_P1,Responder,anti-CTLA4
2,A11_P3_M11,Pre_P1,Responder,anti-CTLA4
3,A11_P4_M11,Pre_P1,Responder,anti-CTLA4
4,A12_P3_M11,Pre_P1,Responder,anti-CTLA4


## read in gene expression data

In [60]:
# Read the processed TPM file as tab-separated text; the first column supplies the row names.
adata_orig = sc.read_csv(
    "/data/rna_rep_learning/sadefeldman/GSE120575_processed_TPM.txt",
    first_column_names=True,
    delimiter="\t",
)

In [86]:
# Need to transpose adata: flip the axes of X and exchange the two annotation tables,
# so that what was `var` in the file as read becomes `obs` (and vice versa).
adata = ad.AnnData(
    X=np.transpose(adata_orig.X),
    obs=adata_orig.var.copy(),
    var=adata_orig.obs.copy(),
)

# The un-transposed object is no longer needed; drop the name.
del adata_orig

In [100]:
# Dry-run the join and compare its row count with both inputs before modifying adata.obs.
n_joined = len(pd.merge(pts, adata.obs, right_index=True, left_on="cellid"))
n_meta = len(pts)
n_cells = len(adata.obs)
for row_count in (n_joined, n_meta, n_cells):
    print(row_count)

## all same length, so ready to merge

16291
16291
16291


In [103]:
# Join the per-cell metadata onto the existing obs table, keyed by cell id.
obs_merged = pd.merge(pts, adata.obs, right_index=True, left_on="cellid").set_index("cellid")

# An inner merge returns rows in the order of the left frame (`pts`), and assigning a
# DataFrame to `adata.obs` swaps in that table and its index as-is; it does not align
# rows on the index. Matching row counts alone therefore do not guarantee that each
# metadata row sits next to its own row of X. Reorder to the current obs_names first,
# and fail loudly if any cell has no metadata instead of mislabeling cells silently.
unmatched = adata.obs_names.difference(obs_merged.index)
if len(unmatched) > 0:
    raise ValueError(
        f"{len(unmatched)} cells in adata have no metadata row, e.g. {list(unmatched[:5])}"
    )

# rename_axis keeps the index labeled "cellid", as set_index("cellid") did originally.
adata.obs = obs_merged.loc[adata.obs_names].rename_axis("cellid")

#### The authors use about 20,000 protein-coding genes in their analysis (see Table S6). What are the other ~30K genes present in this file?

## convert adata to sparse format & save adata object

In [108]:
# Store X as a SciPy CSR matrix, then write the whole object to an .h5ad file.
adata.X = csr_matrix(adata.X)
adata.write_h5ad("/data/rna_rep_learning/sadefeldman/processed_adata_sparse.h5ad")