In [5]:
import os
import pandas as pd
import pickle

# Define serialization functions
def pickle_serialize_object(filename, obj):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten. The default pickle protocol is used.

    Args:
        filename: Path of the file to write.
        obj: The object to pickle.
    """
    with open(filename, 'wb') as sink:
        # Pickler(file).dump(obj) is the explicit form of pickle.dump(obj, file).
        pickle.Pickler(sink).dump(obj)

def pickle_deserialize_object(filename):
    """Load and return the object pickled in the file at ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object reconstructed from the file.

    Note:
        Unpickling can execute arbitrary code; only use this on files that
        come from a trusted source.
    """
    with open(filename, 'rb') as source:
        # Unpickler(file).load() is the explicit form of pickle.load(file).
        restored = pickle.Unpickler(source).load()
    return restored

# Step 1: Load the CSV files
data_path = '../data/raw/data.csv'
labels_path = '../data/raw/labels.csv'

data = pd.read_csv(data_path)
label = pd.read_csv(labels_path)

# Step 2: Prepare data for serialization
# pandas assigns the name 'Unnamed: 0' to a first CSV column whose header cell
# is empty. It is dropped from both frames with errors='ignore', so features
# and labels are handled the same way and a CSV saved without that column
# does not raise KeyError. `data` itself is left untouched; only X loses the
# column.
X = data.drop(columns=['Unnamed: 0'], errors='ignore')
label = label.drop(columns=['Unnamed: 0'], errors='ignore')

y = label['Class']  # Use the 'Class' column as the target

# Step 3: Inspect the data
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emits a stray "None" line after each summary.
print("Data information:")
data.info()
print("\nLabel information:")
label.info()

# Serialize the features and target
processed_data_dir = '../data/processed/'

# exist_ok=True creates the directory (and any missing parents) when needed
# and does nothing when it already exists, so no separate existence check
# (and no window between the check and the creation) is required.
os.makedirs(processed_data_dir, exist_ok=True)

pickle_serialize_object(os.path.join(processed_data_dir, 'rna_seq_X.pkl'), X)
pickle_serialize_object(os.path.join(processed_data_dir, 'rna_seq_y.pkl'), y)

# Step 4: Deserialize the data
# X and y are rebound to the objects read back from disk.
X = pickle_deserialize_object(os.path.join(processed_data_dir, 'rna_seq_X.pkl'))
y = pickle_deserialize_object(os.path.join(processed_data_dir, 'rna_seq_y.pkl'))

# Verify the deserialized data
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than through print(), which would add a stray "None" line.
print("\nDeserialized X:")
X.info()

print("\nDeserialized y:")
print(y.head())

Data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Columns: 20532 entries, Unnamed: 0 to gene_20530
dtypes: float64(20531), object(1)
memory usage: 125.5+ MB
None

Label information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Class   801 non-null    object
dtypes: object(1)
memory usage: 6.4+ KB
None

Deserialized X:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Columns: 20531 entries, gene_0 to gene_20530
dtypes: float64(20531)
memory usage: 125.5 MB
None

Deserialized y:
0    PRAD
1    LUAD
2    PRAD
3    PRAD
4    BRCA
Name: Class, dtype: object
