In [1]:
import pandas as pd

In [2]:
metadata_file = "GSE44076_metadata_all.csv"
metadata = pd.read_csv(metadata_file)

# Columns kept for the analysis. The raw values are quoted "key: value"
# strings (for example "gender: Male"), as the preview printed below shows.
relevant_columns = [
    "geo_accession",  # Unique sample ID
    "sample_type",    # Tissue type (Mucosa, Normal, Tumor)
    "individual_id",  # Individual associated with the sample
    "stage",          # Cancer stage (e.g. IIA; "--" also occurs)
    "location",       # Tumor location (Left / Right)
    "gender",         # Gender of the individual
    "age",            # Age of the individual
    "status"          # Record release status (e.g. "Public on Mar 14 2014"), not a health status
]

# .copy() makes the subset an independent frame, so later edits to
# metadata_filtered can never touch (or warn about) the raw `metadata` table.
metadata_filtered = metadata[relevant_columns].copy()

print("Filtered Metadata:")
print(metadata_filtered.head())

Filtered Metadata:
  geo_accession            sample_type           individual_id        stage  \
0  "GSM1077598"  "sample type: Mucosa"  "individual id: A2119"  "Stage: --"   
1  "GSM1077599"  "sample type: Mucosa"  "individual id: A2142"  "Stage: --"   
2  "GSM1077600"  "sample type: Mucosa"  "individual id: B2104"  "Stage: --"   
3  "GSM1077601"  "sample type: Mucosa"  "individual id: B2127"  "Stage: --"   
4  "GSM1077602"  "sample type: Mucosa"  "individual id: B2150"  "Stage: --"   

            location            gender        age                   status  
0   "location: Left"    "gender: Male"  "age: 62"  "Public on Mar 14 2014"  
1   "location: Left"  "gender: Female"  "age: 77"  "Public on Mar 14 2014"  
2   "location: Left"  "gender: Female"  "age: 78"  "Public on Mar 14 2014"  
3  "location: Right"    "gender: Male"  "age: 65"  "Public on Mar 14 2014"  
4  "location: Right"  "gender: Female"  "age: 52"  "Public on Mar 14 2014"  


In [3]:
# Snapshot the column-filtered table exactly as it currently stands.
# In notebook order this runs before the quote/label cleanup, so the file
# keeps the raw quoted "key: value" strings.
metadata_filtered.to_csv(path_or_buf="GSE44076_processed_metadata.csv", index=False)

In [4]:
# Remove the literal double-quote characters embedded in the string cells;
# non-string cells pass through untouched.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1. Series.map works on old and new pandas.
metadata_filtered = metadata_filtered.apply(
    lambda column: column.map(
        lambda value: value.replace('"', '') if isinstance(value, str) else value
    )
)

# "key:" labels that precede the actual values and should be removed.
# Keep "Stage:" ahead of "age:": "age:" also occurs inside "Stage:", so an
# order-sensitive cleaner could otherwise leave "St" behind.
substrings_to_remove = ["individual id:", "Stage:", "gender:","age:", "location:", "sample type:"]

def clean_cell(cell, substrings=None):
    """Remove ``key:`` labels and surrounding whitespace from a single cell.

    Parameters
    ----------
    cell : object
        One DataFrame value. Anything that is not a ``str`` is returned
        unchanged.
    substrings : iterable of str, optional
        Labels to remove. When omitted, the module-level
        ``substrings_to_remove`` list is used.

    Returns
    -------
    object
        The cleaned string, or ``cell`` itself when it is not a string.

    Notes
    -----
    A label is removed only where it is not directly preceded by a letter,
    digit or underscore. Plain substring removal made the result depend on
    list order: "age:" is part of "Stage:", so ["age:", "Stage:"] used to
    turn "Stage: IIA" into "St IIA".
    """
    import re  # local import keeps this cell self-contained

    if not isinstance(cell, str):  # Only process strings
        return cell

    labels = substrings_to_remove if substrings is None else substrings
    for label in labels:
        # (?<!\w) rejects a match that starts in the middle of a longer word.
        pattern = r"(?<!\w)" + re.escape(label)
        cell = re.sub(pattern, "", cell).strip()  # Remove and strip extra spaces
    return cell

# Apply the cleaning function to every cell, one column at a time.
# Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1;
# the per-cell result is the same.
metadata_filtered = metadata_filtered.apply(lambda column: column.map(clean_cell))

# Save the cleaned metadata to a new CSV file
output_file = "GSE44076_cleaned_metadata.csv"
metadata_filtered.to_csv(output_file, index=False)

print(f"Cleaned metadata saved to {output_file}")


Cleaned metadata saved to GSE44076_cleaned_metadata.csv


  metadata_filtered = metadata_filtered.applymap(lambda x: str(x).replace('"', '') if isinstance(x, str) else x)
  metadata_filtered = metadata_filtered.applymap(clean_cell)


In [5]:
# Spot-check the first rows: quotes and "key:" labels should be gone.
metadata_filtered.head()

Unnamed: 0,geo_accession,sample_type,individual_id,stage,location,gender,age,status
0,GSM1077598,Mucosa,A2119,--,Left,Male,62,Public on Mar 14 2014
1,GSM1077599,Mucosa,A2142,--,Left,Female,77,Public on Mar 14 2014
2,GSM1077600,Mucosa,B2104,--,Left,Female,78,Public on Mar 14 2014
3,GSM1077601,Mucosa,B2127,--,Right,Male,65,Public on Mar 14 2014
4,GSM1077602,Mucosa,B2150,--,Right,Female,52,Public on Mar 14 2014


In [6]:
# Spot-check the last rows as well, since the preview above only covers the first few samples.
metadata_filtered.tail()

Unnamed: 0,geo_accession,sample_type,individual_id,stage,location,gender,age,status
241,GSM1077839,Tumor,Y2099,IIA,Right,Male,72,Public on Mar 14 2014
242,GSM1077840,Tumor,Z2015,IIA,Right,Male,84,Public on Mar 14 2014
243,GSM1077841,Tumor,Z2038,IIA,Right,Female,65,Public on Mar 14 2014
244,GSM1077842,Tumor,Z2061,IIA,Left,Male,53,Public on Mar 14 2014
245,GSM1077843,Tumor,Z2084,IIA,Left,Male,81,Public on Mar 14 2014


In [9]:
# Per-column summary. In the recorded run every column is object dtype, so this
# reports count / unique / top / freq rather than numeric statistics.
metadata_filtered.describe()

Unnamed: 0,geo_accession,sample_type,individual_id,stage,location,gender,age,status
count,246,246,246,246,246,246,246,246
unique,246,3,148,3,2,2,45,1
top,GSM1077598,Normal,F2100,--,Left,Male,79,Public on Mar 14 2014
freq,1,98,2,148,143,169,16,246


In [10]:
# Check dtypes and non-null counts. In the recorded run `age` is still stored
# as object (text), not as a number.
metadata_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   geo_accession  246 non-null    object
 1   sample_type    246 non-null    object
 2   individual_id  246 non-null    object
 3   stage          246 non-null    object
 4   location       246 non-null    object
 5   gender         246 non-null    object
 6   age            246 non-null    object
 7   status         246 non-null    object
dtypes: object(8)
memory usage: 15.5+ KB


In [11]:
# (rows, columns) of the cleaned table.
metadata_filtered.shape

(246, 8)