In [None]:
!pip install mygene

In [None]:
import mygene
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import statsmodels.api as sm
from sklearn.decomposition import PCA
%matplotlib inline

# Let's focus on transcriptomics data first. We can compare gene expression across patients with and without ALS; bulbar-onset vs limb-onset patients; rapid vs slow progression.

We can generate a huge count matrix with all participants

In [None]:
# Folder holding the count data; every entry in it is treated as one sample.
path_to_transcriptomics_data = '../input/end-als/end-als/transcriptomics-data/L3_counts/'
# os.listdir returns entries in arbitrary order; sort them so the column order
# of the count matrix built from this list is the same on every run.
all_samples = sorted(os.listdir(path_to_transcriptomics_data))
print(f'Number of samples: {len(all_samples)}')

In [None]:
# load individual example
def get_single_example_path(samplename):
    """Return the path of one file stored under a sample's folder.

    The folder ``path_to_transcriptomics_data + samplename`` is walked
    recursively and the first file encountered is returned. Directories and
    file names are visited in sorted order so the choice is deterministic.

    Parameters
    ----------
    samplename : str
        Name of the sample folder inside ``path_to_transcriptomics_data``.

    Returns
    -------
    str
        Path to the selected file.

    Raises
    ------
    FileNotFoundError
        If the sample folder contains no file at all (previously the function
        silently returned ``None``, which only failed later inside the reader).
    """
    sample_dir = os.path.join(path_to_transcriptomics_data, samplename)
    for root, dirs, files in os.walk(sample_dir):
        # sorting `dirs` in place fixes the order in which os.walk descends
        dirs.sort()
        if files:
            # whatever, pick first (alphabetically, to be reproducible)
            return os.path.join(root, min(files))
    raise FileNotFoundError(f'No file found under {sample_dir}')

def load_single_example(samplename):
    """Read one sample's count file into a single-column DataFrame.

    The file is located with ``get_single_example_path`` and parsed as a
    tab-separated table whose first line is skipped. Only the last column is
    kept; it is renamed to the second dash-separated token of ``samplename``
    (i.e. the name without its CASE-/CONTROL- prefix), and the frame is
    indexed by the ``Geneid`` column.

    Parameters
    ----------
    samplename : str
        Sample folder name, e.g. ``CASE-<id>-...``.

    Returns
    -------
    pandas.DataFrame
        One column (named after the sample id), indexed by ``Geneid``.
    """
    counts_file = get_single_example_path(samplename)
    # strip the CASE- / CONTROL- prefix: keep the token right after it
    column_label = samplename.split('-')[1]
    raw_table = pd.read_csv(counts_file, sep='\t', skiprows=1)
    # keep life simple: gene id as index, last column as the counts
    last_column = raw_table.columns.values[-1]
    counts = (
        raw_table
        .rename(columns={last_column: column_label})
        .set_index('Geneid')
    )
    # NOTE: counts keep the dtype inferred by read_csv (no down-casting)
    return counts[[column_label]]

In [None]:
# read every sample, then glue the single-column tables side by side
# (one column per sample, rows aligned on the gene id index)
per_sample_counts = list(map(load_single_example, all_samples))
count_table = pd.concat(per_sample_counts, axis=1)

In [None]:
# drop lowly expressed genes: keep a gene only if its counts summed over
# all samples reach the threshold
min_count_thrs = 100
total_count_per_gene = count_table.sum(axis=1)
count_table = count_table[total_count_per_gene >= min_count_thrs]

In [None]:
# Participant metadata joined with the data-portal table (pd.merge with no
# `on=` joins on all columns the two tables share).
dataportal_datatable = pd.read_csv('../input/end-als/end-als/clinical-data/filtered-metadata/metadata/aals_dataportal_datatable.csv')
samples_classes = pd.merge(
    dataportal_datatable,
    pd.read_csv('../input/end-als/end-als/clinical-data/filtered-metadata/metadata/aals_participants.csv'),
)
# keep only participants that have a column in the count matrix ...
has_counts = samples_classes['GUID'].isin(count_table.columns.values)
samples_classes = samples_classes[has_counts].set_index('GUID', drop=False)
# ... and put the rows in the same order as the count matrix columns
samples_classes = samples_classes.loc[count_table.columns, :]
samples_classes.head()

Find genes with overdispersion - genes which have high variance for their mean expression. That helps in scRNA data to find potential genes of interest, maybe it can help here

In [None]:
# simple normalization: rescale every sample (column) so its values sum to 10000.
# Dividing the frame by the Series of column totals is vectorized (pandas aligns
# the totals on the column labels); the earlier per-column apply with the Python
# built-in sum() looped over every value in pure Python.
count_table = 10000 * (count_table / count_table.sum(axis=0))
# inspired by http://pklab.med.harvard.edu/scw2014/subpop_tutorial.html
# per-gene statistics across samples
mean_expression = count_table.mean(axis=1)
var_expression = count_table.var(axis=1)
sd_expression = count_table.std(axis=1)
# coefficient of variation = sd / mean
coeffient_variation = sd_expression / mean_expression
plt.scatter(np.log(mean_expression), np.log(coeffient_variation), s=0.05)
plt.xlabel('log_mean_expression')
plt.ylabel('log_coeffient_variation');

We fit a piece-wise linear function log(coefficient of variation) ~ log(mean expression) and look at the residuals for each gene - genes with large residuals will be our highly variable genes. They can be variable purely due to technical reasons (batch effect), confounding factors (sex? race? age?) and so on, but we can deal with that
(inspired by https://rawgit.com/ChristophH/sctransform/supp_html/supplement/variance_stabilizing_transformation.html and http://pklab.med.harvard.edu/scw2014/subpop_tutorial.html)

In [None]:
# simple split into bins along the log-mean-expression axis.
# floor/ceil (rather than round) make the outer edges enclose every gene, so no
# gene can fall below the first edge or above the last one.
log_mean_expression = np.log(mean_expression)
upper_bound = np.ceil(np.max(log_mean_expression))
lower_bound = np.floor(np.min(log_mean_expression))
n_bins = 5
# note: n_bins edges delimit n_bins - 1 intervals
split_bins = np.linspace(lower_bound, upper_bound, n_bins)
# make DF with cv - expression
gene_variance_df = pd.DataFrame({
    'log_mean_expression': log_mean_expression,
    'log_coeffient_variation': np.log(coeffient_variation),
    # float placeholder: the column is later filled with float predictions
    'predicted_log_coeffient_variation': 0.0})
gene_variance_df = gene_variance_df.sort_values(['log_mean_expression'])
# assign bin to each gene: bin id = number of edges strictly below the value
edges_below = (gene_variance_df['log_mean_expression'].values[:, None] > split_bins).sum(axis=1)
# a value sitting exactly on the lowest edge would get id 0; clip so that ids
# always lie in 1 .. n_bins-1 and split_bins[id-1], split_bins[id] are its edges
gene_variance_df['bin'] = np.clip(edges_below, 1, n_bins - 1)
bins_total = gene_variance_df['bin'].unique()

Now fit the piece-wise regressions and check the slope in each bin. We will also keep all predicted values so we can compare them with the observed ones

In [None]:
# Fit one linear model log_cv ~ log_mean (+ intercept) per bin, draw the fitted
# line over the observed points and store the per-gene predictions.
bins_to_keep = []
plt.scatter(np.log(mean_expression), np.log(coeffient_variation), s=0.05)
for bin_idx in bins_total:
    in_bin = gene_variance_df['bin'] == bin_idx
    bin_log_mean = gene_variance_df.loc[in_bin, 'log_mean_expression']
    bin_log_cv = gene_variance_df.loc[in_bin, 'log_coeffient_variation']
    # fit individual model for each bin
    exog = sm.add_constant(bin_log_mean, prepend=False)
    mod = sm.GLM(bin_log_cv, exog)
    res = mod.fit()
    # get predictions over the range covered by this bin's genes, just to plot
    # the line. The observed range is used instead of split_bins[bin_idx-1] /
    # split_bins[bin_idx]: a bin id of 0 would wrap around to the last edge and
    # a bin id equal to len(split_bins) would raise IndexError.
    expression_space = np.linspace(bin_log_mean.min(), bin_log_mean.max())
    # has_constant='add' forces the intercept column even if the grid is flat
    predicted_line = mod.predict(
        res.params,
        sm.add_constant(expression_space, prepend=False, has_constant='add'))
    # and plot them
    plt.plot(expression_space, predicted_line, '--', c='k')
    # in case we are not happy with the slope (no different from zero), we ignore that bin
    pvalue_slope = res.pvalues['log_mean_expression']
    if pvalue_slope <= 0.001:
        print(f'bin {bin_idx}, pvalue {pvalue_slope} - is good')
        bins_to_keep.append(bin_idx)
    else:
        print(f'bin {bin_idx}, pvalue {pvalue_slope} - drop')
    # get predictions for each gene of the bin (same row order as the fit data)
    log_cv_predicted_by_model = mod.predict(res.params)
    gene_variance_df.loc[in_bin, 'predicted_log_coeffient_variation'] = log_cv_predicted_by_model
plt.xlabel('log_mean_expression');
plt.ylabel('log_coeffient_variation');
plt.title('Predicted values and observed');

In [None]:
# residual = observed log CV minus the value predicted by the bin's model
gene_variance_df['residual'] = gene_variance_df['log_coeffient_variation'].sub(
    gene_variance_df['predicted_log_coeffient_variation'])
# keep genes from accepted bins only, most over-dispersed first
gene_variance_df = (
    gene_variance_df[gene_variance_df['bin'].isin(bins_to_keep)]
    .sort_values(['residual'], ascending=False)
)
gene_variance_df.head()

Ideally, now residuals shouldn't depend on the mean expression

In [None]:
# sanity check: regress residual on log mean expression (with intercept) and
# print the fit summary
exog = sm.add_constant(gene_variance_df[['log_mean_expression']], prepend=False)
mod = sm.GLM(endog=gene_variance_df['residual'], exog=exog)
res = mod.fit()
print(res.summary())

In [None]:
# residuals against log mean expression, one point per gene
ax = plt.gca()
ax.scatter(gene_variance_df['log_mean_expression'], gene_variance_df['residual'], s=0.05)
ax.set_xlabel('log_mean_expression')
ax.set_ylabel('coeffient_variation_residuals');

We are interested in genes with overdispersion => high residuals. We could model the distribution of residuals and come up with a statistical test for residuals > 0, but for now we can simply look at their distribution and pick a threshold

In [None]:
# distribution of residuals; labelled so the figure can be read on its own
ax = gene_variance_df['residual'].hist()
ax.set_xlabel('coeffient_variation_residuals')
ax.set_ylabel('number of genes')
ax.set_title('Distribution of residuals');

In [None]:
# residual threshold above which a gene is called over-dispersed
cut_off = 1.0
passes_cut_off = gene_variance_df['residual'] >= cut_off
n_selected_genes = int(passes_cut_off.sum())
print(f'pick {n_selected_genes} genes with cut off of {cut_off}')

In [None]:
def _symbol_from_annotation(annotation):
    """Return the 'symbol' entry of a mygene `getgene` reply, or None.

    Handles a missing reply (None) and a reply without a 'symbol' key.
    NOTE(review): a list reply is presumably possible when one id matches
    several records; the first record is used in that case - confirm against
    the mygene documentation.
    """
    if isinstance(annotation, list):
        annotation = annotation[0] if annotation else None
    if isinstance(annotation, dict):
        return annotation.get('symbol')
    return None


# .copy() gives an independent frame, so adding the 'symbol' column below does
# not write into a slice of gene_variance_df (SettingWithCopyWarning)
gene_variance_df_selected = gene_variance_df.loc[gene_variance_df['residual']>=cut_off, :].copy()
# annotate genes! (one web query per selected gene id)
mg = mygene.MyGeneInfo()
gene_annotations = [mg.getgene(x, fields='symbol') for x in gene_variance_df_selected.index.values]
gene_variance_df_selected['symbol'] = [_symbol_from_annotation(x) for x in gene_annotations]
gene_variance_df_selected.head()

Do we have any known markers?

In [None]:
# list of known ALS genes, see
# https://www.kaggle.com/alsgroup/end-als/discussion/231731
url_known_markers = 'https://raw.githubusercontent.com/chervov/genes/main/genes_ALS_from_alsod_ac_uk.csv'
known_markers = pd.read_csv(url_known_markers)
# how many of the selected genes are among the known ones?
is_known_marker = gene_variance_df_selected['symbol'].isin(known_markers['Gene symbol'])
print(int(is_known_marker.sum()))

Ok, make some PCA projections

In [None]:
# restrict the normalized matrix to the selected genes and project the
# samples (rows after transposing) onto the first two principal components
selected_gene_ids = gene_variance_df_selected.index
count_table_tr = count_table.loc[selected_gene_ids, :]
pca = PCA(n_components=2)
projections = pca.fit_transform(count_table_tr.values.T)
first_pc, second_pc = projections[:, 0], projections[:, 1]
sns.scatterplot(x=first_pc, y=second_pc, hue=samples_classes['Subject Group']);

What we see is quite poor separation of ALS vs non-ALS. Well, it was expected after all

Can we fit a linear model to account for effects of age, sex, race, case-control and so on?

First, make dummy variables

In [None]:
# 0/1 indicators: 1 for 'Case' / 'Male', 0 for anything else (incl. missing)
samples_classes['is_case'] = (samples_classes['Case_Control'] == 'Case').astype(int)
samples_classes['is_male'] = (samples_classes['Sex'] == 'Male').astype(int)

# missing ages are replaced by the rounded mean of the known ages
age_column = 'Age at Sample Collection'
age_fill_value = np.round(np.nanmean(samples_classes[age_column]))
age_fill_mask = samples_classes[age_column].isna()
samples_classes[age_column] = samples_classes[age_column].mask(age_fill_mask, age_fill_value)

Now iterate over all genes and see if any coefficient for `is_case` is different from zero

In [None]:
# Covariates for the per-gene model: case/control, sex and age.
exog_s = samples_classes[['is_case', 'is_male', 'Age at Sample Collection']]
# statsmodels does not add an intercept on its own, so append a constant column
exog = sm.add_constant(exog_s, prepend=False)

for gene in count_table_tr.index.values:
    # Fit on `exog` (covariates + intercept). Fitting on `exog_s` would force
    # the regression through the origin and distort the is_case coefficient.
    mod = sm.GLM(count_table_tr.loc[gene, :], exog)
    res = mod.fit()
    pvalue_case = res.pvalues['is_case']
    # NOTE: raw p-values, no multiple-testing correction across genes
    if pvalue_case <= 0.05:
        print(gene, pvalue_case, res.params['is_case'])

We really see only a few genes with non-zero coefficient for `is_case`.
Can we think about a different approach? We can take the initial count data and run DESeq/limma to find differentially expressed (DE) genes

Good thing is that we are already provided with it!

In [None]:
# to do