In [2]:
import os
import pandas as pd
from scipy.io import mmread, mmwrite
from scipy.sparse import coo_matrix

# Split the combined expression matrix into one folder per patient. Each
# folder receives its own barcodes.tsv, genes.tsv and matrix.mtx.

# Define file paths
genes_file = r"E:\iiser_project\broad_filtered\merged_genes.tsv"
barcodes_file =  r"E:\iiser_project\broad_filtered\barcodes.tsv"
mtx_file = r"E:\iiser_project\broad_filtered\matrix.mtx"

# Folder that receives the per-patient sub-folders. "." keeps the previous
# behaviour of writing into the current working directory; point it at the
# directory the downstream cells read from if that is a different place.
output_dir = "."

# Load barcodes and extract patient IDs (the text before the first underscore)
barcodes = pd.read_csv(barcodes_file, header=None)
barcodes.columns = ['barcode']
barcodes['patient_id'] = barcodes['barcode'].str.split('_', n=1).str[0]
unique_patients = barcodes['patient_id'].unique()

# Load the matrix
mtx = mmread(mtx_file).tocsc()  # Convert to CSC format for efficient column slicing

# Columns are selected by barcode row position below, so the two inputs must
# have the same length; otherwise cells would be silently dropped or mislabelled.
if mtx.shape[1] != len(barcodes):
    raise ValueError(
        f"Matrix has {mtx.shape[1]} columns but {len(barcodes)} barcodes were read"
    )

# The gene table is identical for every patient: read it once, not per iteration.
genes_table = pd.read_csv(genes_file, sep="\t")

# Loop through each patient
for patient_id in unique_patients:
    # Create folder for each patient
    patient_folder = os.path.join(output_dir, str(patient_id))
    os.makedirs(patient_folder, exist_ok=True)

    # Filter barcodes for the current patient; the default integer index of
    # `barcodes` doubles as the column position in the matrix.
    patient_barcodes = barcodes[barcodes['patient_id'] == patient_id]
    patient_barcode_indices = patient_barcodes.index.values

    # Extract the patient-specific matrix columns
    patient_mtx = mtx[:, patient_barcode_indices]

    # Save the patient-specific files
    patient_barcodes[['barcode']].to_csv(
        os.path.join(patient_folder, "barcodes.tsv"), header=False, index=False
    )
    genes_table.to_csv(os.path.join(patient_folder, "genes.tsv"), sep="\t", index=False)
    mmwrite(os.path.join(patient_folder, "matrix.mtx"), patient_mtx)

print("Patient-specific folders with matrix files created successfully.")


Patient-specific folders with matrix files created successfully.


In [3]:
# Show the directory the kernel is currently running in.
cwd = os.getcwd()
print("Current working directory:", cwd)


Current working directory: C:\Users\saurabh shekhar


In [1]:
import pandas as pd

# Barcode list written for patient folder CID3586
barcodes_file_path = r"E:\iiser_project\broad_filtered\patient_mtx_features_barcodes\CID3586\barcodes.tsv"

# The file has no header row, so every row read is one barcode (one cell)
barcodes = pd.read_csv(barcodes_file_path, header=None)

# Row count == number of cells
num_cells = len(barcodes)

print(f"Number of cells: {num_cells}")


Number of cells: 6178


In [8]:
import pandas as pd
from scipy.io import mmread

# Count the cells of patient CID3586 that express YAP1, WWTR1 (TAZ), or both.
# A cell "expresses" a gene when its matrix entry for that gene is non-zero.

# File paths
barcodes_file = r"E:\iiser_project\broad_filtered\patient_mtx_features_barcodes\CID3586\barcodes.tsv"
genes_file = r"E:\iiser_project\broad_filtered\patient_mtx_features_barcodes\CID3586\genes.tsv"
matrix_file = r"E:\iiser_project\broad_filtered\patient_mtx_features_barcodes\CID3586\matrix.mtx"

# Load the barcodes file (each row represents a cell)
barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')
barcodes.columns = ['Barcode']

# Load the genes file; the lookups below rely on its 'Gene_ID' column
genes = pd.read_csv(genes_file, sep='\t')

# Load the matrix file (sparse matrix). Only whole rows (genes) are sliced
# below, so CSR is the appropriate storage format.
matrix = mmread(matrix_file).tocsr()

# Filter for YAP1 and WWTR1 (TAZ) based on Gene_ID
yap1_id = 'ENSG00000137693'  # YAP1 Ensembl ID
wwtr1_id = 'ENSG00000018408'  # WWTR1 (TAZ) Ensembl ID

# Get the row position of YAP1 and WWTR1 in the genes DataFrame, failing with
# a readable message instead of a bare ".index[0]" error when one is absent.
yap1_matches = genes.index[genes['Gene_ID'] == yap1_id]
wwtr1_matches = genes.index[genes['Gene_ID'] == wwtr1_id]
if len(yap1_matches) == 0:
    raise IndexError(f"YAP1 ({yap1_id}) not found in {genes_file}")
if len(wwtr1_matches) == 0:
    raise IndexError(f"WWTR1 ({wwtr1_id}) not found in {genes_file}")
yap1_idx = yap1_matches[0]
wwtr1_idx = wwtr1_matches[0]

# Extract the expression data for YAP1 and WWTR1 (one 1 x n_cells row each)
yap1_expression = matrix[yap1_idx, :]
wwtr1_expression = matrix[wwtr1_idx, :]

# Column indices of the non-zero entries, i.e. the cells expressing each gene
yap1_cells = yap1_expression.nonzero()[1]
wwtr1_cells = wwtr1_expression.nonzero()[1]

# Get the barcodes of cells that express YAP1 and WWTR1
yap1_barcodes = barcodes.iloc[yap1_cells]
wwtr1_barcodes = barcodes.iloc[wwtr1_cells]

# Display the results
print(f"Total number of cells expressing YAP1 in patient cid 3586: {yap1_barcodes.shape[0]}")
print(f"Total number of cells expressing WWTR1 in patient cid 3586: {wwtr1_barcodes.shape[0]}")

# Cells expressing both genes; indices are sorted so the selected barcodes
# come out in a deterministic (file) order rather than arbitrary set order.
coexpressed_cells = set(yap1_cells).intersection(wwtr1_cells)
coexpressed_barcodes = barcodes.iloc[sorted(coexpressed_cells)]

# Display the result
print(f"Total number of cells co-expressing YAP1 and WWTR1: {coexpressed_barcodes.shape[0]}")

Total number of cells expressing YAP1 in patient cid 3586: 5
Total number of cells expressing WWTR1 in patient cid 3586: 283
Total number of cells co-expressing YAP1 and WWTR1: 3


In [9]:
# Turn the two index arrays into sets once; the three groups below are then
# plain set algebra on them.
yap1_set = set(yap1_cells)
wwtr1_set = set(wwtr1_cells)

# Both genes / YAP1 only / WWTR1 only
common_cells = yap1_set & wwtr1_set
only_yap1_cells = yap1_set - wwtr1_set
only_wwtr1_cells = wwtr1_set - yap1_set

# Look up the barcode rows belonging to each group
common_barcodes, only_yap1_barcodes, only_wwtr1_barcodes = (
    barcodes.iloc[list(group)]
    for group in (common_cells, only_yap1_cells, only_wwtr1_cells)
)

# Report the size of each group
print(f"Total number of cells expressing both YAP1 and WWTR1 in patient CID3586: {len(common_cells)}")
print(f"Total number of cells expressing only YAP1 in patient CID3586: {len(only_yap1_cells)}")
print(f"Total number of cells expressing only WWTR1 in patient CID3586: {len(only_wwtr1_cells)}")


Total number of cells expressing both YAP1 and WWTR1 in patient CID3586: 3
Total number of cells expressing only YAP1 in patient CID3586: 2
Total number of cells expressing only WWTR1 in patient CID3586: 280


In [3]:
# One row per barcode in the currently loaded `barcodes` frame, so the row
# count is the number of cells.
num_cells = len(barcodes)

print(f"Number of cells: {num_cells}")

Number of cells: 6178


In [4]:
import os
import pandas as pd
from scipy.io import mmread

# For every patient folder, count the cells with non-zero YAP1 expression and
# the cells with non-zero WWTR1 (TAZ) expression, then print one line per patient.

# Directory containing patient folders
base_dir = r"E:\iiser_project\broad_filtered\patient_mtx_features_barcodes"

# Target gene IDs for YAP1 and WWTR1 (TAZ)
yap1_id = 'ENSG00000137693'  # YAP1 Ensembl ID
wwtr1_id = 'ENSG00000018408'  # WWTR1 (TAZ) Ensembl ID

# Initialize dictionaries to store results (patient folder name -> cell count)
yap1_counts = {}
wwtr1_counts = {}

# Iterate through each patient folder in the base directory.
# os.listdir returns entries in arbitrary order and also lists plain files,
# so sort for reproducible output and skip anything that is not a folder.
for patient_folder in sorted(os.listdir(base_dir)):
    patient_path = os.path.join(base_dir, patient_folder)
    if not os.path.isdir(patient_path):
        continue

    # File paths for the current patient
    barcodes_file = os.path.join(patient_path, 'barcodes.tsv')
    genes_file = os.path.join(patient_path, 'genes.tsv')
    matrix_file = os.path.join(patient_path, 'matrix.mtx')

    # A folder without the three input files is not a patient folder
    # (e.g. a checkpoint directory); report it instead of crashing mid-run.
    if not all(os.path.isfile(path) for path in (barcodes_file, genes_file, matrix_file)):
        print(f"Input files missing in {patient_folder}. Skipping.")
        continue

    # Load data files. `barcodes` is not needed for the counts below; it is
    # still loaded so the notebook-level variable is refreshed as before.
    barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')
    barcodes.columns = ['Barcode']
    genes = pd.read_csv(genes_file, sep='\t')
    # Only whole rows (genes) are sliced below, so store the matrix as CSR.
    matrix = mmread(matrix_file).tocsr()

    # Find indices of YAP1 and WWTR1 in genes file
    try:
        yap1_idx = genes[genes['Gene_ID'] == yap1_id].index[0]
        wwtr1_idx = genes[genes['Gene_ID'] == wwtr1_id].index[0]
    except IndexError:
        print(f"YAP1 or WWTR1 not found in genes file for {patient_folder}. Skipping.")
        continue

    # Extract expression data (one 1 x n_cells row per gene)
    yap1_expression = matrix[yap1_idx, :]
    wwtr1_expression = matrix[wwtr1_idx, :]

    # Column indices of non-zero entries = cells expressing the gene
    yap1_cells = yap1_expression.nonzero()[1]
    wwtr1_cells = wwtr1_expression.nonzero()[1]

    # Store the count of cells expressing YAP1 and WWTR1 for each patient
    yap1_counts[patient_folder] = len(yap1_cells)
    wwtr1_counts[patient_folder] = len(wwtr1_cells)

# Display the results
for patient, yap1_count in yap1_counts.items():
    print(f"Patient {patient}: YAP1 cells = {yap1_count}, WWTR1 cells = {wwtr1_counts[patient]}")


Patient CID3586: YAP1 cells = 5, WWTR1 cells = 283
Patient CID3838: YAP1 cells = 47, WWTR1 cells = 225
Patient CID3921: YAP1 cells = 8, WWTR1 cells = 478
Patient CID3941: YAP1 cells = 0, WWTR1 cells = 59
Patient CID3946: YAP1 cells = 0, WWTR1 cells = 11
Patient CID3948: YAP1 cells = 1, WWTR1 cells = 84
Patient CID3963: YAP1 cells = 2, WWTR1 cells = 230
Patient CID4040: YAP1 cells = 20, WWTR1 cells = 197
Patient CID4066: YAP1 cells = 164, WWTR1 cells = 1159
Patient CID4067: YAP1 cells = 4, WWTR1 cells = 408
Patient CID4290A: YAP1 cells = 12, WWTR1 cells = 1462
Patient CID4398: YAP1 cells = 12, WWTR1 cells = 42
Patient CID44041: YAP1 cells = 108, WWTR1 cells = 246
Patient CID4461: YAP1 cells = 0, WWTR1 cells = 66
Patient CID4463: YAP1 cells = 4, WWTR1 cells = 304
Patient CID4465: YAP1 cells = 20, WWTR1 cells = 85
Patient CID4471: YAP1 cells = 154, WWTR1 cells = 2019
Patient CID4495: YAP1 cells = 9, WWTR1 cells = 906
Patient CID44971: YAP1 cells = 99, WWTR1 cells = 1462
Patient CID44991: 

In [12]:
import os
import pandas as pd
from scipy.io import mmread

# For every patient folder, count the cells expressing YAP1, WWTR1 (TAZ),
# both, or only one of them, and save the per-patient table as a CSV file.

# Directory containing patient folders
base_dir = r"E:\iiser_project\broad_filtered\patient_mtx_features_barcodes"

# Target gene IDs for YAP1 and WWTR1 (TAZ)
yap1_id = 'ENSG00000137693'  # YAP1 Ensembl ID
wwtr1_id = 'ENSG00000018408'  # WWTR1 (TAZ) Ensembl ID

# Initialize a list to store results for each patient (one dict per patient)
results = []

# Iterate through each patient folder in the base directory.
# os.listdir returns entries in arbitrary order and also lists plain files,
# so sort for a reproducible CSV row order and skip anything that is not a folder.
for patient_folder in sorted(os.listdir(base_dir)):
    patient_path = os.path.join(base_dir, patient_folder)
    if not os.path.isdir(patient_path):
        continue

    # File paths for the current patient
    barcodes_file = os.path.join(patient_path, 'barcodes.tsv')
    genes_file = os.path.join(patient_path, 'genes.tsv')
    matrix_file = os.path.join(patient_path, 'matrix.mtx')

    # A folder without the three input files is not a patient folder
    # (e.g. a checkpoint directory); report it instead of crashing mid-run.
    if not all(os.path.isfile(path) for path in (barcodes_file, genes_file, matrix_file)):
        print(f"Input files missing in {patient_folder}. Skipping.")
        continue

    # Load data files. `barcodes` is not needed for the counts below; it is
    # still loaded so the notebook-level variable is refreshed as before.
    barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')
    barcodes.columns = ['Barcode']
    genes = pd.read_csv(genes_file, sep='\t')
    # Only whole rows (genes) are sliced below, so store the matrix as CSR.
    matrix = mmread(matrix_file).tocsr()

    # Find indices of YAP1 and WWTR1 in genes file
    try:
        yap1_idx = genes[genes['Gene_ID'] == yap1_id].index[0]
        wwtr1_idx = genes[genes['Gene_ID'] == wwtr1_id].index[0]
    except IndexError:
        print(f"YAP1 or WWTR1 not found in genes file for {patient_folder}. Skipping.")
        continue

    # Extract expression data (one 1 x n_cells row per gene)
    yap1_expression = matrix[yap1_idx, :]
    wwtr1_expression = matrix[wwtr1_idx, :]

    # Column indices of non-zero entries = cells expressing the gene
    yap1_cells = yap1_expression.nonzero()[1]
    wwtr1_cells = wwtr1_expression.nonzero()[1]

    # Build each set once, then derive the three groups by set algebra
    yap1_set = set(yap1_cells)
    wwtr1_set = set(wwtr1_cells)
    common_cells = yap1_set & wwtr1_set
    only_yap1_cells = yap1_set - wwtr1_set
    only_wwtr1_cells = wwtr1_set - yap1_set

    # Append the detailed results to the list
    results.append({
        'Patient': patient_folder,
        'Total_YAP1_Cell_Count': len(yap1_cells),
        'Total_WWTR1_Cell_Count': len(wwtr1_cells),
        'Both_YAP1_WWTR1_Cell_Count': len(common_cells),
        'Only_YAP1_Cell_Count': len(only_yap1_cells),
        'Only_WWTR1_Cell_Count': len(only_wwtr1_cells)
    })

# Convert the list of results to a DataFrame and save as CSV
results_df = pd.DataFrame(results)
results_df.to_csv(r"E:\iiser_project\yap1_wwtr1_counts_details1.csv", index=False)

print("CSV file saved successfully.")


CSV file saved successfully.
