In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import io  # Use io.StringIO instead of pandas.compat.StringIO
import os

In [35]:
import pandas as pd

def extract_target_labels_for_luekemia_dataset(file_path, metadata_key="subtype"):
    """
    Extract target labels from the multiple '!Sample_characteristics_ch1' lines in the Series Matrix file.

    The first '!Sample_characteristics_ch1' line whose text contains
    `metadata_key` is used; every tab-separated field after the row tag
    yields one label.

    - file_path: Path to the Series Matrix file.
    - metadata_key: Key to extract (e.g., 'subtype').
    Returns:
        A list of labels for each sample.
    Raises:
        ValueError: if no '!Sample_characteristics_ch1' line contains the key.
    """
    # Stream the file and stop at the first matching row; there is no need to
    # hold every line of the matrix in memory just to read one metadata row.
    with open(file_path, 'r') as file:
        key_line = next(
            (
                line
                for line in file
                if line.startswith("!Sample_characteristics_ch1") and metadata_key in line
            ),
            None,
        )

    if not key_line:
        raise ValueError(f"The metadata key '{metadata_key}' was not found in any '!Sample_characteristics_ch1' line.")

    # Skip the first column (the row tag itself)
    fields = key_line.strip().split("\t")[1:]

    # Split on the FIRST colon only: everything after "key:" is the value, so
    # a value that itself contains a colon is kept whole instead of being cut
    # down to its last segment. A field without any colon is returned as-is.
    return [field.split(":", 1)[-1].strip().replace('"', '') for field in fields]

In [33]:
def extract_target_labels_for_lung_dataset(file_path, metadata_key="tissue"):
    """
    Extract target labels from the '!Sample_characteristics_ch1' line in the Series Matrix file.

    Only the FIRST '!Sample_characteristics_ch1' line is inspected. Fields of
    that line that do not contain `metadata_key` are skipped, so the result
    can be shorter than the number of samples (or empty).

    - file_path: Path to the Series Matrix file.
    - metadata_key: Key to extract (e.g., 'tissue').
    Returns:
        A list of labels for each sample.
    Raises:
        ValueError: if the file has no '!Sample_characteristics_ch1' line.
    """
    # Stream the file and stop at the first characteristics row.
    with open(file_path, 'r') as file:
        target_line = next(
            (line for line in file if line.startswith("!Sample_characteristics_ch1")),
            None,
        )

    # (The former debug print of the raw row was removed: it dumped one field
    # per sample into the cell output and printed "None" right before raising.)
    if not target_line:
        raise ValueError("The '!Sample_characteristics_ch1' metadata line was not found in the file.")

    # Skip the first column (the row tag itself)
    fields = target_line.strip().split("\t")[1:]

    # Split on the FIRST colon only so that a value containing a colon is not
    # truncated to its last segment.
    return [
        field.split(":", 1)[-1].strip().replace('"', '')
        for field in fields
        if metadata_key in field
    ]

In [9]:
def extract_target_labels_for_tumor_dataset(file_path, metadata_key="Histopathological diagnostic"):
    """
    Extract specific target labels (e.g., astrocytoma, glioblastoma, oligodendroglioma) from
    the '!Sample_characteristics_ch1' line in the Series Matrix file.

    Only the FIRST '!Sample_characteristics_ch1' line is inspected, and every
    field of it must contain `metadata_key`. The label is the part of the
    value before its first comma.

    - file_path: Path to the Series Matrix file.
    - metadata_key: Key to extract (e.g., 'Histopathological diagnostic').
    Returns:
        A list of labels for each sample.
    Raises:
        ValueError: if the file has no '!Sample_characteristics_ch1' line, or
            if any field of that line does not contain the key.
    """
    # Stream the file and stop at the first characteristics row.
    with open(file_path, 'r') as file:
        target_line = next(
            (line for line in file if line.startswith("!Sample_characteristics_ch1")),
            None,
        )

    if not target_line:
        raise ValueError("The '!Sample_characteristics_ch1' metadata line was not found in the file.")

    # Skip the first column (the row tag itself)
    fields = target_line.strip().split("\t")[1:]

    target_labels = []
    for field in fields:
        if metadata_key not in field:
            raise ValueError(f"The metadata key '{metadata_key}' is not found in one or more entries.")
        # Split on the FIRST colon only so a value containing a colon is kept
        # whole rather than cut down to its last segment.
        value = field.split(":", 1)[-1].strip().replace('"', '')
        # Keep the part before the first comma (e.g., astrocytoma) and drop
        # any whitespace left in front of that comma.
        target_labels.append(value.split(",")[0].strip())

    return target_labels

In [31]:
def merge_target_with_preprocessed(preprocessed_file, target_labels, output_file):
    """
    Merge target labels with the preprocessed dataset.

    The CSV is read with its first column as the index and then transposed,
    so the rows of the saved file are the columns of the input file. The
    labels are attached positionally as a new 'Target' column.

    - preprocessed_file: Path to the preprocessed CSV file.
    - target_labels: List of target labels (one per sample).
    - output_file: Path to save the merged dataset. Its parent directory is
      created if it does not exist yet.
    Raises:
        ValueError: if the number of labels differs from the number of rows
            after transposing.
    """
    # Load with the first column as index, then transpose to make samples rows
    preprocessed_data = pd.read_csv(preprocessed_file, index_col=0).T
    print(len(target_labels))
    print(preprocessed_data.shape)

    # Ensure the number of labels matches the number of samples (now rows after transposing)
    if len(target_labels) != preprocessed_data.shape[0]:
        raise ValueError("Number of target labels does not match the number of samples.")

    # Add target labels as a new column
    preprocessed_data["Target"] = target_labels

    # to_csv does not create missing directories; make sure the destination
    # folder exists so the merge does not fail at the very last step.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Save the updated dataset
    preprocessed_data.to_csv(output_file)
    print(f"Dataset with target labels saved to {output_file}")


In [11]:
def load_series_matrix(file_path):
    """
    Read the expression table of a GEO Series Matrix file into a DataFrame.

    Lines beginning with "!" or "^" are treated as metadata and discarded;
    what remains is parsed as a tab-separated table whose first column is
    the index. Only numeric columns are returned.

    Raises:
        ValueError: if no numeric column is left.
    """
    metadata_prefixes = ("!", "^")
    with open(file_path, 'r') as handle:
        table_text = "".join(
            row for row in handle if not row.startswith(metadata_prefixes)
        )

    table = pd.read_csv(io.StringIO(table_text), sep="\t", index_col=0)

    # Keep the numeric columns only
    numeric_only = table.select_dtypes(include=[np.number])
    if numeric_only.empty:
        raise ValueError(f"The dataset {file_path} contains no numeric data after filtering.")

    return numeric_only


In [12]:
def preprocess_data(df):
    """
    Preprocess the gene expression data:
    - Remove rows/columns with too many missing values
    - Impute missing values
    - Normalize the data

    - df: numeric DataFrame. It is not modified; a new frame is returned.
    Returns:
        A DataFrame with the surviving rows and columns of `df`, missing
        values replaced by the mean of their column, and each column
        min-max scaled.
    Raises:
        ValueError: if nothing is left after the missing-value filter.
    """
    # Drop rows and columns with more than 50% missing values.
    # `thresh` is the minimum number of NON-missing entries required to keep
    # a row/column, so "at most 50% missing" means at least ceil(n / 2)
    # present. Rounding down (int(0.5 * n)) would, for an odd n, keep entries
    # that are more than half empty.
    min_present_per_row = (df.shape[1] + 1) // 2
    df = df.dropna(axis=0, thresh=min_present_per_row)
    # The column threshold is based on the rows that survived the step above.
    min_present_per_column = (df.shape[0] + 1) // 2
    df = df.dropna(axis=1, thresh=min_present_per_column)

    if df.empty:
        raise ValueError("All rows or columns were dropped due to missing values.")

    # Impute missing values with the mean of each column
    imputer = SimpleImputer(strategy='mean')
    df_imputed = pd.DataFrame(imputer.fit_transform(df), index=df.index, columns=df.columns)

    # Normalize the data using Min-Max Scaling (also applied column by column)
    scaler = MinMaxScaler()
    df_normalized = pd.DataFrame(scaler.fit_transform(df_imputed), index=df.index, columns=df.columns)

    return df_normalized


In [13]:
def save_preprocessed_data(df, output_path):
    """
    Write `df` to `output_path` as CSV, keeping the index as the first column.
    """
    df.to_csv(path_or_buf=output_path, index=True)


In [41]:
def process_all_files(file_paths, output_dir):
    """
    Preprocess multiple Series Matrix files and save the results.

    - file_paths: Iterable of Series Matrix file paths.
    - output_dir: Directory for the CSV outputs; created if it is missing.

    Each input is loaded, preprocessed and written to
    '<output_dir>/preprocessed_<input file name without extension>.csv'.
    A failure on one file is reported with print() and does not stop the
    remaining files from being processed.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which can fail
    # if the directory appears between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    for file_path in file_paths:
        try:
            print(f"Processing {file_path}...")
            # Load the data
            data = load_series_matrix(file_path)

            # Remove non-numeric columns (e.g., metadata)
            data_numeric = data.select_dtypes(include=[np.number])
            if data_numeric.empty:
                raise ValueError(f"The dataset {file_path} contains no numeric data.")
            print(f"Data loaded successfully with shape: {data_numeric.shape}")

            # Preprocess the data
            preprocessed_data = preprocess_data(data_numeric)

            # Generate output file name: swap only the final extension for
            # '.csv'. A plain str.replace('.txt', '.csv') would also rewrite a
            # '.txt' in the middle of the name and would leave inputs with any
            # other extension without a '.csv' suffix.
            stem, _ = os.path.splitext(os.path.basename(file_path))
            output_file = os.path.join(output_dir, f"preprocessed_{stem}.csv")

            # Save preprocessed data
            save_preprocessed_data(preprocessed_data, output_file)

            print(f"Preprocessed data saved to {output_file}")
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next file instead of aborting the whole batch.
            print(f"Error processing {file_path}: {e}")

# Directory holding the GEO Series Matrix downloads, relative to the notebook's
# working directory. The later cells of this notebook read the same files from
# "datasets/", so the list below no longer depends on one machine's drive layout.
# NOTE(review): assumes the notebook is run from the project folder — confirm.
DATASET_DIR = "datasets"

# List of files to process (one Series Matrix file per GEO accession)
file_paths = [
    f"{DATASET_DIR}/{accession}_series_matrix.txt"
    for accession in ("GSE27562", "GSE19804", "GSE4290", "GSE59856", "GSE33315")
]

# Output directory for preprocessed files
output_dir = "preprocessed_datasets"

if __name__ == "__main__":
    # Process all datasets
    process_all_files(file_paths, output_dir)


Processing E:/MS SUBJECTS/3rdSem/Bioinformatics/bioproject/datasets/GSE27562_series_matrix.txt...
Data loaded successfully with shape: (54675, 162)
Preprocessed data saved to preprocessed_datasets\preprocessed_GSE27562_series_matrix.csv
Processing E:/MS SUBJECTS/3rdSem/Bioinformatics/bioproject/datasets/GSE19804_series_matrix.txt...
Data loaded successfully with shape: (54675, 120)
Preprocessed data saved to preprocessed_datasets\preprocessed_GSE19804_series_matrix.csv
Processing E:/MS SUBJECTS/3rdSem/Bioinformatics/bioproject/datasets/GSE4290_series_matrix.txt...
Data loaded successfully with shape: (54613, 180)
Preprocessed data saved to preprocessed_datasets\preprocessed_GSE4290_series_matrix.csv
Processing E:/MS SUBJECTS/3rdSem/Bioinformatics/bioproject/datasets/GSE59856_series_matrix.txt...
Data loaded successfully with shape: (2555, 571)
Preprocessed data saved to preprocessed_datasets\preprocessed_GSE59856_series_matrix.csv
Processing E:/MS SUBJECTS/3rdSem/Bioinformatics/bioproj

In [19]:
# File paths
series_matrix_path = "datasets/GSE19804_series_matrix.txt"
# NOTE(review): process_all_files names its outputs "preprocessed_<input name>.csv"
# (i.e. ending in "_series_matrix.csv") — confirm this shorter file name exists.
preprocessed_file = "preprocessed_datasets/preprocessed_GSE19804.csv"
output_file = "preprocessed/preprocessed_with_target_GSE19804.csv"

# Process GSE19804
metadata_key = "tissue"  # Key for the target variable
print("Extracting target labels...")
# `extract_target_labels` is not defined anywhere in this notebook (it raises
# NameError on a fresh kernel); the extractor that handles the "tissue" key is
# extract_target_labels_for_lung_dataset.
target_labels = extract_target_labels_for_lung_dataset(series_matrix_path, metadata_key)

print(len(target_labels))

print("Merging target labels with preprocessed data...")
merge_target_with_preprocessed(preprocessed_file, target_labels, output_file)





Extracting target labels...
120
Merging target labels with preprocessed data...
Dataset with target labels saved to preprocessed/preprocessed_with_target_GSE19804.csv


In [14]:
# Attach class labels to the preprocessed lung dataset (GSE19804)
series_matrix_path = "datasets/GSE19804_series_matrix.txt"
# NOTE(review): process_all_files names its outputs "preprocessed_<input name>.csv"
# (i.e. ending in "_series_matrix.csv") — confirm this shorter file name exists.
preprocessed_file = "preprocessed_datasets/preprocessed_GSE19804.csv"
output_file = "preprocessed/preprocessed_with_target_GSE19804.csv"

# Sample characteristic used as the class label
metadata_key = "tissue"

print("Extracting target labels...")
target_labels = extract_target_labels_for_lung_dataset(
    file_path=series_matrix_path,
    metadata_key=metadata_key,
)

print(len(target_labels))

print("Merging target labels with preprocessed data...")
merge_target_with_preprocessed(
    preprocessed_file=preprocessed_file,
    target_labels=target_labels,
    output_file=output_file,
)



Extracting target labels...


NameError: name 'extract_target_labels' is not defined

In [15]:
# Attach class labels to the preprocessed tumor dataset (GSE4290)
series_matrix_path = "datasets/GSE4290_series_matrix.txt"
# NOTE(review): process_all_files names its outputs "preprocessed_<input name>.csv"
# (i.e. ending in "_series_matrix.csv") — confirm this shorter file name exists.
preprocessed_file = "preprocessed_datasets/preprocessed_GSE4290.csv"
output_file = "preprocessed/preprocessed_with_target_GSE4290.csv"

# Sample characteristic used as the class label
metadata_key = "Histopathological diagnostic"

print(f"Extracting target labels from '!Sample_characteristics_ch1' using key '{metadata_key}'...")
target_labels = extract_target_labels_for_tumor_dataset(
    file_path=series_matrix_path,
    metadata_key=metadata_key,
)

print("Merging target labels with preprocessed data...")
merge_target_with_preprocessed(
    preprocessed_file=preprocessed_file,
    target_labels=target_labels,
    output_file=output_file,
)

Extracting target labels from '!Sample_characteristics_ch1' using key 'Histopathological diagnostic'...
Merging target labels with preprocessed data...
Dataset with target labels saved to preprocessed/preprocessed_with_target_GSE4290.csv


In [19]:
# Attach class labels to the preprocessed pancreatic dataset (GSE59856)
series_matrix_path = "datasets/GSE59856_series_matrix.txt"
# NOTE(review): process_all_files names its outputs "preprocessed_<input name>.csv"
# (i.e. ending in "_series_matrix.csv") — confirm this shorter file name exists.
preprocessed_file = "preprocessed_datasets/preprocessed_GSE59856.csv"
output_file = "preprocessed/preprocessed_with_target_GSE59856.csv"

# Sample characteristic used as the class label for GSE59856
metadata_key = "disease state"

print(f"Extracting target labels from '!Sample_characteristics_ch1' using key '{metadata_key}'...")
# The "lung" extractor is reused here: it reads the first characteristics
# line and keeps the fields that contain the key.
target_labels = extract_target_labels_for_lung_dataset(
    file_path=series_matrix_path,
    metadata_key=metadata_key,
)

print("Merging target labels with preprocessed data...")
merge_target_with_preprocessed(
    preprocessed_file=preprocessed_file,
    target_labels=target_labels,
    output_file=output_file,
)

Extracting target labels from '!Sample_characteristics_ch1' using key 'disease state'...
Merging target labels with preprocessed data...
Dataset with target labels saved to preprocessed/preprocessed_with_target_GSE59856.csv


In [21]:
# Attach class labels to the preprocessed breast cancer dataset (GSE27562)
series_matrix_path = "datasets/GSE27562_series_matrix.txt"
# NOTE(review): process_all_files names its outputs "preprocessed_<input name>.csv"
# (i.e. ending in "_series_matrix.csv") — confirm this shorter file name exists.
preprocessed_file = "preprocessed_datasets/preprocessed_GSE27562.csv"
output_file = "preprocessed/preprocessed_with_target_GSE27562.csv"

# Sample characteristic used as the class label for GSE27562
metadata_key = "phenotype"

print(f"Extracting target labels from '!Sample_characteristics_ch1' using key '{metadata_key}'...")
# The "lung" extractor is reused here: it reads the first characteristics
# line and keeps the fields that contain the key.
target_labels = extract_target_labels_for_lung_dataset(
    file_path=series_matrix_path,
    metadata_key=metadata_key,
)

print("Merging target labels with preprocessed data...")
merge_target_with_preprocessed(
    preprocessed_file=preprocessed_file,
    target_labels=target_labels,
    output_file=output_file,
)

Extracting target labels from '!Sample_characteristics_ch1' using key 'phenotype'...
Merging target labels with preprocessed data...
Dataset with target labels saved to preprocessed/preprocessed_with_target_GSE27562.csv


In [36]:
# Attach class labels to the preprocessed leukemia dataset (GSE33315)
series_matrix_path = "datasets/GSE33315_series_matrix.txt"
# NOTE(review): process_all_files names its outputs "preprocessed_<input name>.csv"
# (i.e. ending in "_series_matrix.csv") — confirm this shorter file name exists.
preprocessed_file = "preprocessed_datasets/preprocessed_GSE33315.csv"
output_file = "preprocessed/preprocessed_with_target_GSE33315.csv"

# Sample characteristic used as the class label for GSE33315
metadata_key = "subtype"

print(f"Extracting target labels from '!Sample_characteristics_ch1' using key '{metadata_key}'...")
target_labels = extract_target_labels_for_luekemia_dataset(
    file_path=series_matrix_path,
    metadata_key=metadata_key,
)

print("Merging target labels with preprocessed data...")
merge_target_with_preprocessed(
    preprocessed_file=preprocessed_file,
    target_labels=target_labels,
    output_file=output_file,
)

Extracting target labels from '!Sample_characteristics_ch1' using key 'subtype'...
Merging target labels with preprocessed data...
575
(575, 22283)
Dataset with target labels saved to preprocessed/preprocessed_with_target_GSE33315.csv


In [22]:
import pandas as pd

# Class balance of the labelled breast cancer dataset (GSE27562)
dataset_file = "preprocessed/preprocessed_with_target_GSE27562.csv"

data = pd.read_csv(dataset_file)

# Stop early when the label column was never merged in
if "Target" not in data.columns:
    raise ValueError("The 'Target' column is not present in the dataset.")

# Number of samples per class, most frequent class first
unique_values = data["Target"].value_counts()

print("Unique values in the Target column and their counts:", unique_values, sep="\n")


Unique values in the Target column and their counts:
Target
Malignant                      51
Benign                         37
Normal                         31
Ectopic                        22
Post-Surgery                   15
Pre-Surgery (aka Malignant)     6
Name: count, dtype: int64


In [37]:
import pandas as pd

# Class balance of the labelled leukemia dataset (GSE33315)
dataset_file = "preprocessed/preprocessed_with_target_GSE33315.csv"

data = pd.read_csv(dataset_file)

# Stop early when the label column was never merged in
if "Target" not in data.columns:
    raise ValueError("The 'Target' column is not present in the dataset.")

# Number of samples per class, most frequent class first
unique_values = data["Target"].value_counts()

print("Unique values in the Target column and their counts:", unique_values, sep="\n")


Unique values in the Target column and their counts:
Target
7_Other           153
1_Hyperdiploid    116
3_ETV6_RUNX1       99
8_T-ALL            83
2_TCF3-PBX1        40
4_MLL              30
5_Ph               23
6_Hypo             23
9_CD10CD19          4
10_CD34             4
Name: count, dtype: int64


In [20]:
import pandas as pd

# Class balance of the labelled pancreatic dataset (GSE59856)
dataset_file = "preprocessed/preprocessed_with_target_GSE59856.csv"

data = pd.read_csv(dataset_file)

# Stop early when the label column was never merged in
if "Target" not in data.columns:
    raise ValueError("The 'Target' column is not present in the dataset.")

# Number of samples per class, most frequent class first
unique_values = data["Target"].value_counts()

print("Unique values in the Target column and their counts:", unique_values, sep="\n")


Unique values in the Target column and their counts:
Target
healthy control                                150
pancreatic cancer                              100
biliary tract cancer                            98
liver cancer                                    52
colon cancer                                    50
stomach cancer                                  50
esophagus cancer                                50
benign pancreatic or biliary tract diseases     21
Name: count, dtype: int64


In [23]:
import pandas as pd

# Class balance of the labelled lung dataset (GSE19804)
dataset_file = "preprocessed/preprocessed_with_target_GSE19804.csv"

data = pd.read_csv(dataset_file)

# Stop early when the label column was never merged in
if "Target" not in data.columns:
    raise ValueError("The 'Target' column is not present in the dataset.")

# Number of samples per class, most frequent class first
unique_values = data["Target"].value_counts()

print("Unique values in the Target column and their counts:", unique_values, sep="\n")


Unique values in the Target column and their counts:
Target
lung cancer               60
paired normal adjacent    60
Name: count, dtype: int64


In [16]:
import pandas as pd

# Class balance of the labelled tumor dataset (GSE4290)
dataset_file = "preprocessed/preprocessed_with_target_GSE4290.csv"

data = pd.read_csv(dataset_file)

# Stop early when the label column was never merged in
if "Target" not in data.columns:
    raise ValueError("The 'Target' column is not present in the dataset.")

# Number of samples per class, most frequent class first
unique_values = data["Target"].value_counts()

print("Unique values in the Target column and their counts:", unique_values, sep="\n")


Unique values in the Target column and their counts:
Target
glioblastoma         77
oligodendroglioma    50
astrocytoma          26
non-tumor            23
Name: count, dtype: int64


In [2]:
import pandas as pd

# Labelled input and filtered output for the leukemia dataset (GSE33315)
dataset_file = "preprocessed/preprocessed_with_target_GSE33315.csv"
output_file = "preprocessed/filtered_preprocessed_GSE33315.csv"

data = pd.read_csv(dataset_file)

# Class sizes before filtering
print("Original class distribution:")
print(data["Target"].value_counts())

# A class is kept only if it has at least this many samples
min_count = 20
filtered_data = data.groupby("Target").filter(lambda group: group.shape[0] >= min_count)

# Written without the row index
filtered_data.to_csv(output_file, index=False)

# Class sizes after filtering
print("\nFiltered class distribution:")
print(filtered_data["Target"].value_counts())

print(f"\nFiltered dataset saved to {output_file}")


Original class distribution:
Target
7_Other           153
1_Hyperdiploid    116
3_ETV6_RUNX1       99
8_T-ALL            83
2_TCF3-PBX1        40
4_MLL              30
5_Ph               23
6_Hypo             23
9_CD10CD19          4
10_CD34             4
Name: count, dtype: int64

Filtered class distribution:
Target
7_Other           153
1_Hyperdiploid    116
3_ETV6_RUNX1       99
8_T-ALL            83
2_TCF3-PBX1        40
4_MLL              30
5_Ph               23
6_Hypo             23
Name: count, dtype: int64

Filtered dataset saved to preprocessed/filtered_preprocessed_GSE33315.csv


In [3]:
import pandas as pd

# Labelled input and filtered output for the breast cancer dataset (GSE27562)
dataset_file = "preprocessed/preprocessed_with_target_GSE27562.csv"
output_file = "preprocessed/filtered_preprocessed_GSE27562.csv"

data = pd.read_csv(dataset_file)

# Class sizes before filtering
print("Original class distribution:")
print(data["Target"].value_counts())

# A class is kept only if it has at least this many samples
min_count = 20
filtered_data = data.groupby("Target").filter(lambda group: group.shape[0] >= min_count)

# Written without the row index
filtered_data.to_csv(output_file, index=False)

# Class sizes after filtering
print("\nFiltered class distribution:")
print(filtered_data["Target"].value_counts())

print(f"\nFiltered dataset saved to {output_file}")


Original class distribution:
Target
Malignant                      51
Benign                         37
Normal                         31
Ectopic                        22
Post-Surgery                   15
Pre-Surgery (aka Malignant)     6
Name: count, dtype: int64

Filtered class distribution:
Target
Malignant    51
Benign       37
Normal       31
Ectopic      22
Name: count, dtype: int64

Filtered dataset saved to preprocessed/filtered_preprocessed_GSE27562.csv


In [4]:
import pandas as pd

# Labelled input and filtered output for the tumor dataset (GSE4290)
dataset_file = "preprocessed/preprocessed_with_target_GSE4290.csv"
output_file = "preprocessed/filtered_preprocessed_GSE4290.csv"

data = pd.read_csv(dataset_file)

# Class sizes before filtering
print("Original class distribution:")
print(data["Target"].value_counts())

# A class is kept only if it has at least this many samples
min_count = 20
filtered_data = data.groupby("Target").filter(lambda group: group.shape[0] >= min_count)

# Written without the row index
filtered_data.to_csv(output_file, index=False)

# Class sizes after filtering
print("\nFiltered class distribution:")
print(filtered_data["Target"].value_counts())

print(f"\nFiltered dataset saved to {output_file}")


Original class distribution:
Target
glioblastoma         77
oligodendroglioma    50
astrocytoma          26
non-tumor            23
Name: count, dtype: int64

Filtered class distribution:
Target
glioblastoma         77
oligodendroglioma    50
astrocytoma          26
non-tumor            23
Name: count, dtype: int64

Filtered dataset saved to preprocessed/filtered_preprocessed_GSE4290.csv


In [5]:
import pandas as pd

# Labelled input and filtered output for the lung dataset (GSE19804)
dataset_file = "preprocessed/preprocessed_with_target_GSE19804.csv"
output_file = "preprocessed/filtered_preprocessed_GSE19804.csv"

data = pd.read_csv(dataset_file)

# Class sizes before filtering
print("Original class distribution:")
print(data["Target"].value_counts())

# A class is kept only if it has at least this many samples
min_count = 20
filtered_data = data.groupby("Target").filter(lambda group: group.shape[0] >= min_count)

# Written without the row index
filtered_data.to_csv(output_file, index=False)

# Class sizes after filtering
print("\nFiltered class distribution:")
print(filtered_data["Target"].value_counts())

print(f"\nFiltered dataset saved to {output_file}")


Original class distribution:
Target
lung cancer               60
paired normal adjacent    60
Name: count, dtype: int64

Filtered class distribution:
Target
lung cancer               60
paired normal adjacent    60
Name: count, dtype: int64

Filtered dataset saved to preprocessed/filtered_preprocessed_GSE19804.csv


In [6]:
import pandas as pd

# Labelled input and filtered output for the pancreatic dataset (GSE59856)
dataset_file = "preprocessed/preprocessed_with_target_GSE59856.csv"
output_file = "preprocessed/filtered_preprocessed_GSE59856.csv"

data = pd.read_csv(dataset_file)

# Class sizes before filtering
print("Original class distribution:")
print(data["Target"].value_counts())

# A class is kept only if it has at least 22 samples (the threshold used for
# this dataset is 22, not the 20 used for the other datasets)
min_count = 22
filtered_data = data.groupby("Target").filter(lambda group: group.shape[0] >= min_count)

# Written without the row index
filtered_data.to_csv(output_file, index=False)

# Class sizes after filtering
print("\nFiltered class distribution:")
print(filtered_data["Target"].value_counts())

print(f"\nFiltered dataset saved to {output_file}")


Original class distribution:
Target
healthy control                                150
pancreatic cancer                              100
biliary tract cancer                            98
liver cancer                                    52
colon cancer                                    50
stomach cancer                                  50
esophagus cancer                                50
benign pancreatic or biliary tract diseases     21
Name: count, dtype: int64

Filtered class distribution:
Target
healthy control         150
pancreatic cancer       100
biliary tract cancer     98
liver cancer             52
colon cancer             50
stomach cancer           50
esophagus cancer         50
Name: count, dtype: int64

Filtered dataset saved to preprocessed/filtered_preprocessed_GSE59856.csv
