In [75]:
import tarfile

In [76]:
file_path1 = "/Users/gajaj/Downloads/GRCh38.d1.vd1.fa.tar.gz"
output_dir1 = "/Users/gajaj/OneDrive/Documents/TUM/computational_single_cell/Gene-expression-changes-from-CNV/preprocessing/Multiome"

In [77]:
with tarfile.open(file_path1, "r") as tar:
    tar.extractall(path=output_dir1)
    print("Files extracted successfully")

Files extracted successfully


# Linear regression

In [2]:
import numpy as np
from collections import Counter
import pandas as pd
import sys
sys.path.append('/Users/gajaj/OneDrive/Documents/TUM/computational_single_cell/Gene-expression-changes-from-CNV/src/dataloader')
from embedding import embed

We get the embedding matrix as an input with following rows:
- 4 rows for one-hot-encoding
- CNV loss
- CNV gain
-open chromatin

## Functions for matrix aggregation

In [3]:
# Example setup: Generate a mock input matrix
# Rows: [one-hot encoding (4 rows for A, T, G, C), CNV loss, CNV gain, open chromatin]
# Columns: Nucleotide positions (e.g., 6000 columns for 6k bp)
np.random.seed(42)  # For reproducibility
n_positions = 6000
mock_matrix = np.random.randint(0, 2, size=(7, n_positions))

In [31]:
print(mock_matrix.shape) # a random matrix of 0 and 1
print(mock_matrix)

(7, 6000)
[[0 1 0 ... 1 1 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 1 1]
 ...
 [1 1 0 ... 1 1 0]
 [1 1 0 ... 1 1 1]
 [1 1 0 ... 0 1 0]]


In [63]:
mock_matrix1 = np.zeros((7, 6000))
mock_matrix1[1, :] = np.ones(6000)
mock_matrix1

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
def create_embedding_dataset(matrix, tss_position=1):
    """
    Create a dataset that combines codon frequencies and aggregated CNV/open chromatin metrics.

    Args:
        matrix (numpy.ndarray): The embedding matrix.
        tss_position (int): The starting position for codon counting (default is 1).

    Returns:
        numpy.ndarray: A 1-row dataset with 64 columns for codon counts and 3 columns for CNV/open chromatin metrics.
    """
    # Extract one-hot DNA rows (first 4 rows: A, T, G, C)
    nucleotides = ['A', 'T', 'G', 'C']
    one_hot_dna = matrix[:4, :]

    # Decode one-hot to nucleotide sequence
    sequence = ''.join(
        nucleotides[np.argmax(one_hot_dna[:, i])]
        for i in range(one_hot_dna.shape[1])
    )

    # Start codon counting from the TSS
    start = tss_position - 1
    codons = [
        sequence[i:i + 3]
        for i in range(start, len(sequence), 3)
        if i + 3 <= len(sequence)
    ]

    # Generate all 64 possible codons
    possible_codons = [a + b + c for a in nucleotides for b in nucleotides for c in nucleotides]

    # Count codon frequencies
    codon_counts = Counter(codons)

    # Ensure all 64 codons are represented in the output (with count 0 if absent)
    codon_counts_array = np.array([codon_counts.get(codon, 0) for codon in possible_codons])

    # Extract CNV and open chromatin rows (last 3 rows)
    cnv_loss = matrix[4, :] #TODO do we take the average across all embedding or also from TSS??
    cnv_gain = matrix[5, :]
    open_chromatin = matrix[6, :]

    # Compute averages
    cnv_loss_avg = np.mean(cnv_loss)
    cnv_gain_avg = np.mean(cnv_gain)
    open_chromatin_avg = np.mean(open_chromatin)

    # Combine codon frequencies and aggregated metrics into one dataset
    aggregated_values = np.array([cnv_loss_avg, cnv_gain_avg, open_chromatin_avg])
    final_dataset = np.concatenate([codon_counts_array, aggregated_values])

    # Create column headings
    codon_headings = possible_codons
    aggregated_headings = ['cnv_loss_avg', 'cnv_gain_avg', 'open_chromatin_avg']
    column_headings = codon_headings + aggregated_headings

    return final_dataset, column_headings


In [None]:
final_dataset, column_headings = create_embedding_dataset(mock_matrix)
print(final_dataset)
print(column_headings)

[361.         186.          65.          24.         134.
  64.          46.          23.          80.          29.
  17.           7.          35.          25.          15.
   3.         158.          66.          42.          14.
  67.          27.          15.          10.          39.
   9.           8.           2.          15.          10.
   7.           2.          73.          31.          20.
  11.          42.          22.          12.           4.
  12.           7.           1.           2.          13.
   3.           4.           0.          45.          22.
  12.           7.          17.           8.           1.
   0.           5.           3.           4.           2.
   4.           3.           5.           0.           0.5015
   0.50916667   0.49933333]
['AAA', 'AAT', 'AAG', 'AAC', 'ATA', 'ATT', 'ATG', 'ATC', 'AGA', 'AGT', 'AGG', 'AGC', 'ACA', 'ACT', 'ACG', 'ACC', 'TAA', 'TAT', 'TAG', 'TAC', 'TTA', 'TTT', 'TTG', 'TTC', 'TGA', 'TGT', 'TGG', 'TGC', 'TCA', 'TCT', 'TC

In [65]:
final_dataset, column_headings = create_embedding_dataset(mock_matrix1)
print(final_dataset)

[   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0. 2000.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.]


## Load data

Import dataset of gene positions

In [8]:
gene_df = pd.read_csv('/Users/gajaj/OneDrive/Documents/TUM/computational_single_cell/Gene-expression-changes-from-CNV/preprocessing/Multiome/gene_positions.csv')
gene_df = gene_df[['start', 'end', 'gene_id']] #We leave out the chromosome
gene_df

Unnamed: 0,start,end,gene_id
0,3069168,3438621,ENSG00000142611
1,5301928,5307394,ENSG00000284616
2,2403964,2413797,ENSG00000157911
3,5492978,5494674,ENSG00000260972
4,10054445,10054781,ENSG00000224340
...,...,...,...
78927,15119,17798,ENSG00000307722
78928,24947,37269,ENSG00000310401
78929,3633,31375,ENSG00000302039
78930,6913,19848,ENSG00000309831


Import classification dataset

In [9]:
classification_df = pd.read_csv('/Users/gajaj/OneDrive/Documents/TUM/computational_single_cell/Gene-expression-changes-from-CNV/preprocessing/Multiome/classification_median.tsv', sep="\t")
#classification_df = classification_df.set_index('gene_id')
classification_df

Unnamed: 0,barcode,gene_id,expression_count,classification
0,AAACAGCCAAACGCGA-2,ENSG00000235146,0.000000,zero
1,AAACAGCCAAACGCGA-2,ENSG00000187634,0.240150,low
2,AAACAGCCAAACGCGA-2,ENSG00000187583,0.000000,zero
3,AAACAGCCAAACGCGA-2,ENSG00000205231,0.000000,zero
4,AAACAGCCAAACGCGA-2,ENSG00000228750,0.000000,zero
...,...,...,...,...
27353995,TTTGTTGGTTGAGGTC-3,ENSG00000198886,1.907476,high
27353996,TTTGTTGGTTGAGGTC-3,ENSG00000198786,1.573612,high
27353997,TTTGTTGGTTGAGGTC-3,ENSG00000198727,3.001725,high
27353998,TTTGTTGGTTGAGGTC-3,ENSG00000277666,0.000000,zero


Match the datasets by the gene_ids

In [10]:
# Merge the datasets based on the gene_id column
merged_df = pd.merge(
    classification_df, gene_df,
    on="gene_id",  # Match rows based on this column
    how="inner"    # Keep only rows with matches in both datasets
)

# Display the result
print(merged_df)

                     barcode          gene_id  expression_count  \
0         AAACAGCCAAACGCGA-2  ENSG00000235146          0.000000   
1         AAACAGCCAAACGCGA-2  ENSG00000187634          0.240150   
2         AAACAGCCAAACGCGA-2  ENSG00000187583          0.000000   
3         AAACAGCCAAACGCGA-2  ENSG00000205231          0.000000   
4         AAACAGCCAAACGCGA-2  ENSG00000228750          0.000000   
...                      ...              ...               ...   
26697499  TTTGTTGGTTGAGGTC-3  ENSG00000212907          1.068847   
26697500  TTTGTTGGTTGAGGTC-3  ENSG00000198886          1.907476   
26697501  TTTGTTGGTTGAGGTC-3  ENSG00000198786          1.573612   
26697502  TTTGTTGGTTGAGGTC-3  ENSG00000198727          3.001725   
26697503  TTTGTTGGTTGAGGTC-3  ENSG00000273554          0.000000   

         classification    start      end  
0                  zero   587577   595116  
1                   low   923923   944575  
2                  zero   966482   975865  
3                  

## Create the covariate matrix and the response vector

We will have a **matrix X** that has per pair (cell_barcode, gene_id) a row of 67 inputs: 64 are the counts of the triplets and 3 are the averages of open cromatine, cnv gain, cnv loss over the nucleotides

There will be 2 **response vectors**:
- y1: has expression counts per pair (cell_barcode, gene_id)
- y2: classification of expression per pair (cell_barcode, gene_id)

In [11]:
gtf_path = '/Users/gajaj/OneDrive/Documents/TUM/computational_single_cell/Gene-expression-changes-from-CNV/preprocessing/Multiome/Homo_sapiens.GRCh38.113.gtf'
fasta_path = '/Users/gajaj/OneDrive/Documents/TUM/computational_single_cell/Gene-expression-changes-from-CNV/preprocessing/Multiome/GRCh38.d1.vd1.fa'
atac_path = '/Users/gajaj/OneDrive/Documents/TUM/computational_single_cell/Gene-expression-changes-from-CNV/preprocessing/Multiome/overlap_genes_peaks.tsv' 
cnv_path = ''
#list_of_embeded_matrices = embed()

In [12]:
embedding_matrix = mock_matrix

Per specific gene, we take its location start and end, then take the section of the embedding matrix for the sepcific gene

Aggregate the matrix

Stack this agergated matrix to the matrix X

In [81]:
X = np.zeros((len(merged_df), 67))
X.shape

(26697504, 67)

In [19]:
y1 = merged_df['expression_count'].to_list()
y2 = merged_df['classification'].to_list()

In [21]:
len(merged_df)

26697504

In [56]:
merged_df_head = merged_df.head(10)
merged_df_head

Unnamed: 0,barcode,gene_id,expression_count,classification,start,end
0,AAACAGCCAAACGCGA-2,ENSG00000235146,0.0,zero,587577,595116
1,AAACAGCCAAACGCGA-2,ENSG00000187634,0.24015,low,923923,944575
2,AAACAGCCAAACGCGA-2,ENSG00000187583,0.0,zero,966482,975865
3,AAACAGCCAAACGCGA-2,ENSG00000205231,0.0,zero,1173056,1179555
4,AAACAGCCAAACGCGA-2,ENSG00000228750,0.0,zero,6724626,6730012
5,AAACAGCCAAACGCGA-2,ENSG00000074800,0.433651,low,8861000,8879190
6,AAACAGCCAAACGCGA-2,ENSG00000171621,0.0,zero,9292894,9369532
7,AAACAGCCAAACGCGA-2,ENSG00000235263,0.0,zero,9500754,9503536
8,AAACAGCCAAACGCGA-2,ENSG00000162444,0.0,zero,9997206,10016021
9,AAACAGCCAAACGCGA-2,ENSG00000028137,0.0,zero,12166991,12209228


In [None]:
for index, row in merged_df_head.iterrows():
    start = row['start']

    embed_i = mock_matrix #TODO actual embedded matrix for specific cell, gene

    # Aggregate the matrix, and create a row that will be added to the matrix X
    X_row, _ = create_embedding_dataset(mock_matrix, start)
    X[index, :] = X_row



In [71]:
def create_matrix_X(merged_df):
    X_final = np.zeros((len(merged_df), 67))
    print(X)

    for index, row in merged_df.iterrows():
        start = row['start']

        embed_i = mock_matrix #TODO actual embedded matrix for specific cell, gene

        # Aggregate the matrix, and create a row that will be added to the matrix X
        X_row, _ = create_embedding_dataset(embed_i, start)
        X_final[index, :] = X_row

    return X_final

In [82]:
X_final = create_matrix_X(merged_df_head)
print(X_final.shape)

(10, 67)


In [84]:
y1_final = y1[:10]
y1_final

[0.0, 0.24015017, 0.0, 0.0, 0.0, 0.43365088, 0.0, 0.0, 0.0, 0.0]

## Linear regression 1

Fit a linear regression model with covariate matrix X and response vector y1

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [85]:
# Step 1: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y1_final, test_size=0.2, random_state=42)

In [86]:
# Step 2: Initialize and fit a Ridge regression model
ridge_model = Ridge(alpha=1.0)  # Regularization strength (alpha=0 means no regularization)
ridge_model.fit(X_train, y_train)

In [87]:
# Step 3: Make predictions
y_train_pred = ridge_model.predict(X_train)
y_test_pred = ridge_model.predict(X_test)

In [88]:
# Step 4: Evaluate the model
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

In [89]:
print(train_mse)
print(test_mse)

0.020568306251147205
0.018756714970882847


: 

**QUESTIONS**:
- The gene expression can be different for same gene in different cells? It does not just refer to a gene_id?
- How do we know what is the end site for the embedding count, is it the gene_end?
- Is in the embedding the chromosome also important?