Imputing missing values in the morph-embedding column


In [None]:
from sklearn.impute import SimpleImputer
import ast  # Module to safely evaluate literal strings as Python expressions

# Assuming 'data' is your DataFrame
column_to_impute = 'pre_morph_embeddings'

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the column selection: the chained in-place form mutates a temporary object
# and leaves `data` unchanged when pandas Copy-on-Write is active.
# The scalar 0 is only a placeholder; the PCA cell further down replaces any
# entry whose length is not 32 with a zero vector.
data[column_to_impute] = data[column_to_impute].fillna(0)

data.info()

Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns
categorical_columns = ['compartment', 'pre_brain_area', 'post_brain_area']

# Fit one LabelEncoder per column and keep each fitted encoder.
# Re-fitting a single shared encoder overwrites its `classes_` on every
# iteration, so only the last column's integer -> label mapping would remain
# available for `inverse_transform`.
label_encoders = {}
for column in categorical_columns:
    # After the loop `label_encoder` is the encoder fitted on the last column,
    # matching what the name held when a single instance was reused.
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# Display the updated DataFrame
data.info()

Euclidean Distances and interaction terms

In [None]:
# Euclidean distances between the axonal and dendritic coordinates, followed by
# receptive-field offsets, binary direction flags and interaction terms.

# Per-axis offsets between the axonal and dendritic coordinates.
delta_x = data['axonal_coor_x'] - data['dendritic_coor_x']
delta_y = data['axonal_coor_y'] - data['dendritic_coor_y']
delta_z = data['axonal_coor_z'] - data['dendritic_coor_z']

# Full three-dimensional distance.
data['euclidean_distance'] = np.sqrt(delta_x**2 + delta_y**2 + delta_z**2)

# Distance restricted to the x/y plane.
# NOTE(review): all four columns below receive the same values. Each one is
# built from the same axonal/dendritic x and y columns, and swapping the
# operands of a squared difference does not change the result. Confirm which
# inputs the pre/post variants were meant to use.
planar_distance = np.sqrt(delta_x**2 + delta_y**2)
for distance_column in (
    'axonal_distance_pre',
    'axonal_distance_post',
    'dendritic_distance_pre',
    'dendritic_distance_post',
):
    data[distance_column] = planar_distance

# Offsets of the post-synaptic readout location relative to the pre-synaptic one.
offset_x = data['post_rf_x'] - data['pre_rf_x']
offset_y = data['post_rf_y'] - data['pre_rf_y']

# Distance between the readout locations.
# NOTE(review): the "_pre" and "_post" columns are identical by construction.
rf_distance = np.sqrt(offset_x**2 + offset_y**2)
data['euclidean_distance_pre'] = rf_distance
data['euclidean_distance_post'] = rf_distance

# Relative position of the post-synaptic neuron with respect to the pre-synaptic one.
data['relative_position_x'] = offset_x
data['relative_position_y'] = offset_y

# Binary direction flags derived from the sign of each offset.
# A zero offset sets neither flag of a pair.
data['is_left'] = (offset_x < 0).astype(int)
data['is_right'] = (offset_x > 0).astype(int)
data['is_above'] = (offset_y < 0).astype(int)
data['is_below'] = (offset_y > 0).astype(int)

# Interaction terms: product of the x and y readout coordinates, per neuron side.
data['pre_rf_x_rf_y_interaction'] = data['pre_rf_x'] * data['pre_rf_y']
data['post_rf_x_rf_y_interaction'] = data['post_rf_x'] * data['post_rf_y']

Processing the morph embeddings

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every embedding vector on its own: each entry is reshaped to a
# single column, so the scaler is re-fitted per row and centres/scales that one
# vector's values. The result stored per row is an (n, 1) array.
scaler = StandardScaler()
for embedding_column in ('pre_morph_embeddings', 'post_morph_embeddings'):
    data[embedding_column] = data[embedding_column].apply(
        lambda values: scaler.fit_transform(np.array(values).reshape(-1, 1))
    )

def flatten_arrays(arrays):
    """Return one flat list holding the items of every element of ``arrays``.

    ``arrays`` is iterated once and each element is iterated in turn, so a
    sequence of sequences (or a 2-D array) collapses by exactly one level.
    """
    flattened = []
    for sublist in arrays:
        flattened.extend(sublist)
    return flattened

# Collapse each per-row array into a flat list of values.
for embedding_column in ('pre_morph_embeddings', 'post_morph_embeddings'):
    data[embedding_column] = data[embedding_column].apply(flatten_arrays)

# Show the flattened pre-synaptic embeddings as the cell output.
data['pre_morph_embeddings']

PCA on morph embeddings

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Every embedding must have the same length before the rows can be stacked.
# Entries of any other length are replaced by an all-zero vector.
expected_length = 32
for embedding_column in ('pre_morph_embeddings', 'post_morph_embeddings'):
    data[embedding_column] = data[embedding_column].apply(
        lambda vector: np.array(vector)
        if len(vector) == expected_length
        else np.array([0.0] * expected_length)
    )

# Reduce each embedding to two principal components.
# NOTE(review): fit_transform re-fits the PCA for each column, so the pre and
# post projections use separately fitted components. Confirm this is intended
# before comparing them with each other.
pca = PCA(n_components=2)
for embedding_column in ('pre_morph_embeddings', 'post_morph_embeddings'):
    stacked = np.vstack(data[embedding_column])
    data[embedding_column] = list(pca.fit_transform(stacked))

# Show the reduced post-synaptic embeddings as the cell output.
data['post_morph_embeddings']

PCA on feature weights

In [None]:
# Standardise every feature-weight vector on its own: each entry is reshaped to
# a single column, so the scaler is re-fitted per row and centres/scales that
# one vector's values. The result stored per row is an (n, 1) array.
scaler = StandardScaler()
for weight_column in ('pre_feature_weights', 'post_feature_weights'):
    data[weight_column] = data[weight_column].apply(
        lambda values: scaler.fit_transform(np.array(values).reshape(-1, 1))
    )


def flatten_arrays(arrays):
    """Return one flat list holding the items of every element of ``arrays``.

    NOTE(review): a helper with this name and the same logic is also defined
    in the morph-embedding section; consider keeping a single definition.
    """
    flattened = []
    for sublist in arrays:
        flattened.extend(sublist)
    return flattened

# Collapse each per-row array into a flat list of values.
for weight_column in ('pre_feature_weights', 'post_feature_weights'):
    data[weight_column] = data[weight_column].apply(flatten_arrays)

# Apply PCA to the pre and post feature weights, keeping two components each.
# NOTE(review): fit_transform re-fits the PCA for each column, so the pre and
# post projections use separately fitted components. Confirm this is intended
# before comparing them with each other.
pca = PCA(n_components=2)
for weight_column in ('pre_feature_weights', 'post_feature_weights'):
    stacked = np.vstack(data[weight_column])
    data[weight_column] = list(pca.fit_transform(stacked))



# # Extract feature weight columns
# feature_weight_columns = ['pre_feature_weights', 'post_feature_weights']

# # Standardize the data
# scaler = StandardScaler()
# data[feature_weight_columns] = scaler.fit_transform(data[feature_weight_columns])

# # Create a PCA object
# pca = PCA(n_components=2)
# data_pca = pca.fit_transform(data[feature_weight_columns])


Skewness, kurtosis, and summary statistics for feature weights

In [None]:
from scipy.stats import skew, kurtosis

# Per-row summary statistics of the feature-weight vectors.
# NOTE(review): if this cell runs after the PCA cell above, every vector has
# only two components. For two values the skewness is 0 and the (Fisher)
# kurtosis is -2, or NaN when both values are equal, so those columns carry
# almost no information.
pre_weights = data['pre_feature_weights']
post_weights = data['post_feature_weights']

# Sum of each vector, and the absolute gap between the pre and post sums.
data['pre_feature_weights_sum'] = pre_weights.apply(lambda weights: sum(weights))
data['post_feature_weights_sum'] = post_weights.apply(lambda weights: sum(weights))
data['feature_weights_difference'] = (
    data['pre_feature_weights_sum'] - data['post_feature_weights_sum']
).abs()

# Variance of each vector.
data['pre_feature_weights_variance'] = pre_weights.apply(lambda weights: np.var(weights))
data['post_feature_weights_variance'] = post_weights.apply(lambda weights: np.var(weights))

# Skewness and kurtosis of each vector.
data['pre_feature_weights_skewness'] = pre_weights.apply(lambda weights: skew(weights))
data['post_feature_weights_skewness'] = post_weights.apply(lambda weights: skew(weights))
data['pre_feature_weights_kurtosis'] = pre_weights.apply(lambda weights: kurtosis(weights))
data['post_feature_weights_kurtosis'] = post_weights.apply(lambda weights: kurtosis(weights))

Skewness, kurtosis, and summary statistics for morph embeddings

In [None]:
from scipy.stats import skew, kurtosis
import numpy as np


# Per-row summary statistics of the morphological embeddings.
# NOTE(review): if this cell runs after the PCA cell above, every embedding has
# only two components. For two values the skewness is 0 and the (Fisher)
# kurtosis is -2, or NaN when both values are equal, so those columns carry
# almost no information.
pre_embeddings = data['pre_morph_embeddings']
post_embeddings = data['post_morph_embeddings']

# Sum of each embedding along its first axis.
data['pre_morph_embeddings_sum'] = pre_embeddings.apply(lambda vector: np.sum(vector, axis=0))
data['post_morph_embeddings_sum'] = post_embeddings.apply(lambda vector: np.sum(vector, axis=0))

# Norm of the difference between the pre and post sums, computed row by row.
data['morph_embeddings_distance'] = data.apply(
    lambda row: np.linalg.norm(
        row['pre_morph_embeddings_sum'] - row['post_morph_embeddings_sum']
    ),
    axis=1,
)

# Variance of each embedding.
data['pre_morph_embeddings_variance'] = pre_embeddings.apply(lambda vector: np.var(vector, axis=0))
data['post_morph_embeddings_variance'] = post_embeddings.apply(lambda vector: np.var(vector, axis=0))

# Skewness and kurtosis of each embedding.
data['pre_morph_embeddings_skewness'] = pre_embeddings.apply(lambda vector: skew(vector, axis=0))
data['post_morph_embeddings_skewness'] = post_embeddings.apply(lambda vector: skew(vector, axis=0))
data['pre_morph_embeddings_kurtosis'] = pre_embeddings.apply(lambda vector: kurtosis(vector, axis=0))
data['post_morph_embeddings_kurtosis'] = post_embeddings.apply(lambda vector: kurtosis(vector, axis=0))


Cosine similarity

In [None]:
#cosine similarity function
def row_feature_similarity(row):
    """Cosine similarity between a row's pre and post feature weights.

    Parameters
    ----------
    row : mapping
        Anything indexable by "pre_feature_weights" and "post_feature_weights"
        (for example a DataFrame row). Both values may be NumPy arrays or
        plain sequences of numbers of the same shape.

    Returns
    -------
    float
        The sum of the element-wise product divided by the product of the two
        norms. Returns 0.0 when either vector has zero norm, because the
        cosine is undefined there and the division would otherwise produce
        NaN/inf together with a runtime warning.
    """
    # Coerce to arrays so list-valued cells work as well as ndarray-valued ones.
    pre = np.asarray(row["pre_feature_weights"], dtype=float)
    post = np.asarray(row["post_feature_weights"], dtype=float)

    denominator = np.linalg.norm(pre) * np.linalg.norm(post)
    if denominator == 0:
        return 0.0
    return (pre * post).sum() / denominator

PCA with explained variance on the whole dataset

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


# Columns left out of the PCA: the identifier, the vector-valued columns,
# the grouping/target columns and the binary direction flags.
exclude_cols = [
    'ID',
    'pre_feature_weights',
    'post_feature_weights',
    'pre_morph_embeddings',
    'post_morph_embeddings',
    'projection_group',
    'connected',
    'is_left',
    'is_right',
    'is_above',
    'is_below',
]

# Every remaining column goes into the PCA. The selection is by name only;
# no dtype check is performed here.
numeric_columns = [name for name in data.columns if name not in exclude_cols]

# Standardise the selected columns in place.
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Fit a PCA that keeps all components and project the data onto it.
pca = PCA()
pca_result = pca.fit_transform(data[numeric_columns])

# Per-component share of the variance and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

# Plot cumulative explained variance against the number of components kept.
component_counts = range(1, len(explained_variance) + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='-', color='b')
ax.set_title('Explained Variance with PCA')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.grid(True)
plt.show()


In [None]:
# Apply PCA to data
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Columns left out of the PCA: the identifier, the vector-valued columns,
# the grouping/target columns and the binary direction flags.
exclude_cols = [
    'ID',
    'pre_feature_weights',
    'post_feature_weights',
    'pre_morph_embeddings',
    'post_morph_embeddings',
    'projection_group',
    'connected',
    'is_left',
    'is_right',
    'is_above',
    'is_below',
]

# Every remaining column goes into the PCA. The selection is by name only;
# no dtype check is performed here.
numeric_columns = [name for name in data.columns if name not in exclude_cols]

# Number of principal components to keep.
pca_num_of_components = 30

# Standardise the selected columns in place.
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Fit the PCA and project the standardised columns onto its components.
pca = PCA(n_components=pca_num_of_components)
pca_result = pca.fit_transform(data[numeric_columns])

# Wrap the projection in a DataFrame whose columns are labelled 0 .. n-1.
pca_df = pd.DataFrame(pca_result, columns=list(range(pca_num_of_components)))







K-means clustering

In [None]:
from sklearn.cluster import KMeans

# Cluster on the principal-component columns only. A 'Cluster' column left over
# from an earlier run of this cell is excluded.
numeric_columns_for_clustering = [name for name in pca_df.columns if name != 'Cluster']

num_clusters = 2

# Fit K-means on the PCA projection (fit returns the fitted estimator itself).
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(
    pca_df[numeric_columns_for_clustering]
)

# Store each row's cluster label next to its components.
pca_df['Cluster'] = kmeans.labels_

# Scatter the first two principal components, one series per cluster.
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in range(num_clusters):
    cluster_data = pca_df[pca_df['Cluster'] == cluster]
    ax.scatter(cluster_data[0], cluster_data[1], label=f'Cluster {cluster}', alpha=0.7)

ax.set_title('K-means Clustering after PCA')
ax.set_xlabel('Principal Component 1 (PC1)')
ax.set_ylabel('Principal Component 2 (PC2)')
ax.legend()
ax.grid(True)
plt.show()
