# ScanPy intro

following video by Sanbomics: https://www.youtube.com/watch?v=uvyG9yLuNSE&list=PPSV  
Data from GSE171524

In [1]:
# Setup cell: imports plus working-directory change.
# If scanpy is not already installed in the kernel's environment, uncomment and run:
# %pip install scanpy

import scanpy as sc
import os, sys
from IPython.display import display
# expanduser() only expands a leading '~', so this is effectively chdir('..'):
# the working directory moves up one level (presumably so the relative 'data/...'
# path used when loading the counts resolves — confirm against the repo layout).
# NOTE(review): not idempotent — re-running this cell moves up one more directory.
os.chdir(os.path.expanduser('..'))

In [2]:
# The CSV is read and then transposed so that cells end up as rows (obs)
# and genes as columns (var), which is the orientation AnnData expects.
data = sc.read_csv(
    'data/GSM5226574_C51ctr_raw_counts.csv'
).transpose()
data

AnnData object with n_obs × n_vars = 6099 × 34546

In [3]:
# obs holds the observations (cells); var holds the variables (genes).
for label, frame in (('cells', data.obs), ('genes', data.var)):
    print(label)
    display(frame.head())
print('The counts are represented in data.X as a numpy array; dimensions: ', data.X.shape)


cells


TAGGTACCATGGCCAC-1_1
ATTCACTGTAACAGGC-1_1
TAACTTCCAACCACGC-1_1
TTGGGTACACGACAAG-1_1
AGGCCACAGAGTCACG-1_1


genes


AL627309.1
AL627309.5
AL627309.4
AL669831.2
LINC01409


The counts are represented in data.X as a numpy array; dimensions:  (6099, 34546)


## Preprocessing

### Doublet removal

Not strictly necessary, but recommended in single-cell analysis, because sometimes two or more cells end up in the same droplet.

### Filtering genes and cells

starting by labeling

**Mitochondrial genes**: in humans their names usually start with `MT-` (not always, but in general). There are only 13 of them, so we could also keep an explicit list and remove them that way.

In [4]:
# Boolean mask over genes: True where the gene name carries the 'MT-' prefix.
is_mito = data.var.index.str.startswith('MT-')
display(data.var[is_mito].head())

# Store the mask as a boolean column named 'MT' on the gene table.
data.var['MT'] = is_mito
print('mitochodrial boolean added as a column')
display(data.var.head())

MT-ND1
MT-ND2
MT-CO1
MT-CO2
MT-ATP8


mitochodrial boolean added as a column


Unnamed: 0,MT
AL627309.1,False
AL627309.5,False
AL627309.4,False
AL669831.2,False
LINC01409,False


I won't add ribosomal genes here, because that requires fetching the gene list from a URL and I don't have internet access :p

In [5]:
# Compute QC metrics in place; the 'MT' boolean column in data.var is passed
# as a qc variable so per-cell mitochondrial totals/percentages are added too.
sc.pp.calculate_qc_metrics(
    data,
    qc_vars=['MT'],
    percent_top=None,
    log1p=False,
    inplace=True,
)
# The gene table now carries the new per-gene QC columns.
display(data.var.head())
print('notice how high are % drop out')

Unnamed: 0,MT,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts
AL627309.1,False,8,0.001312,99.868831,8.0
AL627309.5,False,33,0.005411,99.458928,33.0
AL627309.4,False,4,0.000656,99.934415,4.0
AL669831.2,False,2,0.000328,99.967208,2.0
LINC01409,False,274,0.048697,95.50746,297.0


notice how high are % drop out


now the cells

In [6]:
# The per-cell table also gained QC columns, including the mitochondrial ones.
print('even obs have new cols now for mitochondrial counts')
display(data.obs.head(n=5))

even obs have new cols now for mitochondrial counts


Unnamed: 0,n_genes_by_counts,total_counts,total_counts_MT,pct_counts_MT
TAGGTACCATGGCCAC-1_1,6764,24256.0,153.0,0.630772
ATTCACTGTAACAGGC-1_1,6686,19020.0,404.0,2.12408
TAACTTCCAACCACGC-1_1,5158,15645.0,221.0,1.412592
TTGGGTACACGACAAG-1_1,4900,15220.0,3.0,0.019711
AGGCCACAGAGTCACG-1_1,5544,13217.0,417.0,3.155028


: 

In [7]:
# Show the genes detected in the fewest cells. This must go through display():
# a bare expression in the middle of a cell is evaluated but never rendered.
display(data.var.sort_values('n_cells_by_counts').head())
print('many genes arent present, 0 or ~0 cells, we will remove those found in less than 3 cells')

# sc.pp is scanpy's preprocessing module.
# filter_genes thresholds on cells (min_cells); filter_cells thresholds on genes.
# Called without inplace=False, this drops the genes from `data` itself.
sc.pp.filter_genes(data, min_cells=3)
display(data.var.head())

many genes arent present, 0 or ~0 cells, we will remove those found in less than 3 cells


Typically we would filter by counts (total counts for each cell), but we can clearly see - when sorted - that the data was already filtered by the paper's authors (those with fewer than 400 counts were removed...)

In [None]:
# Cells with the lowest total counts first.
data.obs.sort_values(by='total_counts', ascending=True).head()

We will look for outliers

In [None]:
# Violin plots of the per-cell QC metrics, used to eyeball outliers.
# Plotting lives in sc.pl (not sc.pp), and the keys must match the data.obs
# column names exactly: 'n_genes_by_counts' and 'pct_counts_MT' (the suffix
# follows the 'MT' name passed as qc_vars, so it is upper-case).
sc.pl.violin(
    data,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_MT'],
    jitter=0.4,
    multi_panel=True,
)
print('now we can get rid of outliers; notice how genes by count and total counts are very highly correlated => can filter on one')
print('usually mitochondrion filter is set anywhere from 5 til 20%')


In [None]:
import numpy as np  # used to compute the percentile threshold

# Upper threshold on genes-per-cell: the 98th percentile, so the cut-off is
# derived from the data rather than picked by eye.
upper_lim = np.quantile(data.obs.n_genes_by_counts.values, .98)
print('and this is teh 98th percentile value that we will take as thresh for filtration:', upper_lim)

print('20% for mit')
# The mitochondrial percentage column is 'pct_counts_MT' (upper-case, matching
# the 'MT' qc variable); 'pct_counts_mt' does not exist and raises AttributeError.
keep_cells = (data.obs['n_genes_by_counts'] < upper_lim) & (data.obs['pct_counts_MT'] < 20)

# Boolean subsetting returns a view of the original AnnData; copy it so that
# later in-place steps operate on a real, independent object.
# NOTE: re-running this cell filters the already-filtered data again.
data = data[keep_cells].copy()

now data is filtered of outliers :)

### Normalization

same sum

In [None]:
# Per-cell count totals before normalisation (sum across genes for each cell).
totals_before = data.X.sum(axis=1)
print('look at the soum of ounts for each cell:', totals_before)

print('first thing: normalize counts in each cell so that total ccounts adds up to the same value')
# Rescale every cell so its counts add up to 10,000 (modifies data in place).
sc.pp.normalize_total(data, target_sum=1e4)
totals_after = data.X.sum(axis=1)
print('check this out now :)', totals_after)

log transformation

In [None]:
# Apply the log(1 + x) transform to the normalised counts.
print(
    'making them on log counts (non linear transformation, they will be more comparable than before)'
)
sc.pp.log1p(data)

## Analysis