In [1]:
# Code to enable this notebook to import from libraries.
# Adds the directory two levels above the working directory to sys.path so
# that packages living there can be imported by later cells.
import os
import sys

# Build the path from components rather than the literal '..\..': the
# backslash form is only a path separator on Windows, and '\.' is an invalid
# escape sequence inside a regular (non-raw) string literal.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import seaborn as sns
import csv
from scripts.mockUtilities import *
from scripts.utilities import *

In [3]:
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import pywt
import pywt.data
from sklearn.cluster import SpectralClustering
from matplotlib.colors import ListedColormap
from scipy.sparse import csr_matrix
from scipy.io import mmread
from sklearn.mixture import GaussianMixture
%matplotlib inline

In [4]:
# Fix NumPy's global random state so repeated runs give the same results
RANDOM_SEED = 999
np.random.seed(RANDOM_SEED)

## Loading Data

In [5]:
# Load the sample9 expression matrices (Matrix Market files) and the
# coordinate / metadata CSV tables.
# Paths are assembled with os.path.join so the notebook also runs on
# Linux/macOS, where a backslash-separated literal is read as one file name.
DATA_DIR = os.path.join(os.pardir, os.pardir, 'data', '10x_visium_LIBD', 'sample9')

counts = mmread(os.path.join(DATA_DIR, 'counts_hvg.mtx'))
counts_sct = mmread(os.path.join(DATA_DIR, 'sct_transformed_counts.mtx'))

coords = pd.read_csv(os.path.join(DATA_DIR, 'coords.csv'))
meta_data = pd.read_csv(os.path.join(DATA_DIR, 'meta.csv'))

In [6]:
# Use short 'x' / 'y' names for the coordinate columns
coords = coords.rename(columns={'x_coord': 'x', 'y_coord': 'y'})

# counts data: densify, then attach column i of the matrix to row i of meta_data
cell_feature_ori = counts.toarray()
meta_data["feature_ori"] = [
    cell_feature_ori[:, row_pos] for row_pos in range(meta_data.shape[0])
]

# sctransformed counts data: same layout as above
cell_feature_sct = counts_sct.toarray()
meta_data["feature_sct"] = [
    cell_feature_sct[:, row_pos] for row_pos in range(meta_data.shape[0])
]

# Keep only the metadata columns used downstream
meta_data = meta_data[['barcode', 'feature_sct', 'feature_ori', 'spatialLIBD']]

# Join coordinates and metadata side by side, remember each row's position
# before filtering, drop rows with missing values, encode the spatialLIBD
# label as integer category codes, and renumber the rows from 0.
rna = (
    pd.concat([coords, meta_data], axis=1)
    .assign(**{'original index': lambda frame: frame.index})
    .dropna()
    .assign(
        spatialLIBDCode=lambda frame: frame['spatialLIBD'].astype('category').cat.codes
    )
    .reset_index(drop=True)
)

## Processing data

We will be using the `feature_sct` data (SCTransform-normalised gene expression).

In [7]:
# Bin the spots onto a regular N_Y_INDICES x N_X_INDICES grid and average the
# feature_sct vectors of the spots that land in each grid cell.
import warnings

# Set the settings for the data
N_GENES = len(rna["feature_sct"][0])  # length of one row's feature vector

# Set the settings for the gridding of the data
# (bounding box is expressed in the same units as rna['x'] / rna['y'])
MIN_X = 130
MAX_X = 500
MIN_Y = -510
MAX_Y = -100
N_X_INDICES = 2**6
N_Y_INDICES = N_X_INDICES
N_CELLS = N_X_INDICES * N_Y_INDICES

# Calculate the size of each cell in the grid
cell_size_x = (MAX_X - MIN_X) / N_X_INDICES
cell_size_y = (MAX_Y - MIN_Y) / N_Y_INDICES

# Assign each (x, y) point to a grid cell
rna['grid_x'] = ((rna['x'] - MIN_X) // cell_size_x).astype(int)
rna['grid_y'] = ((rna['y'] - MIN_Y) // cell_size_y).astype(int)

# A point below MIN_* gets a negative index and a point at or above MAX_* gets
# an index >= N_*_INDICES. Neither exists in the full grid index built below,
# so the reindex drops them. Report that instead of losing data silently.
inside_grid = (
    (rna['grid_x'] >= 0) & (rna['grid_x'] < N_X_INDICES)
    & (rna['grid_y'] >= 0) & (rna['grid_y'] < N_Y_INDICES)
)
n_outside = int((~inside_grid).sum())
if n_outside:
    warnings.warn(
        f"{n_outside} of {len(rna)} spots fall outside the "
        f"[{MIN_X}, {MAX_X}) x [{MIN_Y}, {MAX_Y}) grid and are not included in grid_rna"
    )

# Group by grid cells (grid_x, grid_y) and calculate the average feature vector for each cell
grouped = rna.groupby(['grid_y', 'grid_x'])['feature_sct'].apply(lambda x: np.mean(np.vstack(x), axis=0))
# Grid cells without any spot receive an all-zero feature vector
fill_feature_vector = np.zeros(N_GENES)
full_index = pd.MultiIndex.from_product([range(N_Y_INDICES), range(N_X_INDICES)], names=['y_index', 'x_index'])
grouped_reindexed = grouped.reindex(full_index, fill_value=fill_feature_vector)
# Expand each feature vector into one column per element
grid_rna = pd.DataFrame(grouped_reindexed.tolist(), index=grouped_reindexed.index)

# Rename the columns to gene_indicator_n where n is the element index of the feature vector
grid_rna.columns = [f'gene_indicator_{i}' for i in range(grid_rna.shape[1])]

# Name the two index levels; the (y_index, x_index) MultiIndex itself is kept
grid_rna.index.names = ['y_index', 'x_index']

In [8]:
# Collapse the per-gene columns of grid_rna into a single 'feature' column that
# holds each grid cell's values as a plain Python list, and move the
# (y_index, x_index) index levels into ordinary columns.
# One to_numpy().tolist() call converts every row at once; this avoids building
# a Series and invoking a Python lambda per row via DataFrame.apply(axis=1).
result_rna_unnormalised = pd.DataFrame(
    {'feature': grid_rna.to_numpy().tolist()},
    index=grid_rna.index,
).reset_index()

In [9]:
# Standardise the gridded features column-wise with scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler

# Stack the per-row feature lists into a 2-D array (one row per grid cell)
matrix = np.array(result_rna_unnormalised['feature'].tolist())

scaler = StandardScaler()
matrix_standardized = scaler.fit_transform(matrix)

# Store the scaled rows as lists in a copy; the unnormalised frame is left as is
standardized_features = matrix_standardized.tolist()
result_rna = result_rna_unnormalised.copy()
result_rna['feature'] = standardized_features

In [10]:
# Sanity check: every column of the standardised matrix should have
# mean 0 and standard deviation 1 (within np.allclose's default tolerances)
column_means = matrix_standardized.mean(axis=0)
column_stds = matrix_standardized.std(axis=0)
assert np.allclose(column_means, np.zeros(N_GENES))
assert np.allclose(column_stds, np.ones(N_GENES))

In [11]:
# Write the standardised matrix to visium.csv in the working directory
# (header row of integer column labels, no index column)
standardized_frame = pd.DataFrame(matrix_standardized)
standardized_frame.to_csv('visium.csv', index=False)