In [11]:
import pandas as pd
import numpy as np

# Define file path
input_file = "/Users/carmenshero/Desktop/Datasets/initialSet.csv"

# Load full dataset (including motifs). Every column is read as text first so
# that pandas does not guess mixed dtypes; numeric columns are converted below.
df = pd.read_csv(input_file, dtype=str, low_memory=False)


def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be.

    This is the explicit replacement for `pd.to_numeric(..., errors='ignore')`,
    which pandas has deprecated: a column is converted only when every value
    parses as a number; otherwise the original column is returned as-is.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert numeric columns to proper dtypes after initial load
df = df.apply(_to_numeric_if_possible)
print(f" Loaded dataset with {len(df)} rows.")

 Loaded dataset with 85 rows.


  df = df.apply(pd.to_numeric, errors='ignore')


In [12]:
# Keep only the rows that have a value in every one of the first 23 columns.
# Only real nulls count here; the literal string "MISSING" used in ec_numbers
# is an ordinary value and survives this filter.
_has_gap = df.iloc[:, :23].isna().any(axis=1)
df_filtered = df.loc[~_has_gap].copy()

print(f" Filtered down to {len(df_filtered)} complete rows.")

 Filtered down to 85 complete rows.


In [13]:
# Z-score the numeric feature columns in place on df_filtered.
from sklearn.preprocessing import StandardScaler

# Two positional ranges are standardized; the two columns between them
# (index 14–15) are skipped:
#   columns C–N -> index 2–13
#   columns Q–W -> index 16–22
standardize_cols = [*df_filtered.columns[2:14], *df_filtered.columns[16:23]]

scaler = StandardScaler()
_numeric_block = df_filtered[standardize_cols].astype(float)
_scaled_block = scaler.fit_transform(_numeric_block)
df_filtered[standardize_cols] = _scaled_block

print(" Standardized columns C–N and Q–W (excluding ec_numbers & sequence).")

 Standardized columns C–N and Q–W (excluding ec_numbers & sequence).


In [14]:
# PCA'ing the motif columns

from sklearn.decomposition import PCA

# Number of principal components to extract from the motif block.
# Feel free to change this single value; the PC column names and the summary
# message below are derived from the PCA result, so nothing else needs editing.
N_MOTIF_COMPONENTS = 10

# Drop PC columns left behind by an earlier run of this cell BEFORE slicing
# out the motif block. Doing it afterwards would make a re-run treat
# Processed_Motifs and the old PC columns as motif features, because the
# positional slice below relies on Processed_Motifs being the last column.
_stale_pc_cols = [
    col for col in df_filtered.columns
    if isinstance(col, str) and col.startswith("PC") and col[2:].isdecimal()
]
df_filtered = df_filtered.drop(columns=_stale_pc_cols)

# Identify motif columns (between W and Processed_Motifs)
motif_cols = df_filtered.columns[23:-1]  # Assumes column 23 starts motifs, last col is Processed_Motifs
# Non-numeric or missing motif entries are coerced to NaN and then counted as 0
motif_data = df_filtered[motif_cols].apply(pd.to_numeric, errors="coerce").fillna(0)

# Perform PCA. random_state only matters for the randomized/arpack solvers;
# it is pinned so that repeated runs give the same components.
pca = PCA(n_components=N_MOTIF_COMPONENTS, random_state=0)
motif_pca = pca.fit_transform(motif_data)

# Name the components PC1..PCk from the actual width of the result and keep
# the original row index so the concat below aligns row-for-row.
pca_cols = [f"PC{i}" for i in range(1, motif_pca.shape[1] + 1)]
pca_df = pd.DataFrame(motif_pca, columns=pca_cols, index=df_filtered.index)

# Append new PCA columns
df_filtered = pd.concat([df_filtered, pca_df], axis=1)

print(f"PCA complete — {len(pca_cols)} PC columns added.")

PCA complete — 10 PC columns added.


In [15]:
# Build the Training and Prediction tables and write them to CSV.
# - Motif columns, Processed_Motifs and sequence are dropped.
# - ec_numbers is moved to column C (index 2) so the target is easy to find.
# - Rows whose ec_numbers is the literal "MISSING" go to Prediction.csv;
#   all other rows go to Training.csv.

# Identifier and target columns, in the order they lead every output file
core_cols = ["PDB_ID", "UniProt_ID", "ec_numbers"]

# Pick up every PC<n> column that is actually present in df_filtered rather
# than assuming PC1–PC10, so a different component count in the PCA cell is
# carried through to the saved files instead of being silently dropped.
pca_cols = [
    col for col in df_filtered.columns
    if isinstance(col, str) and col.startswith("PC") and col[2:].isdecimal()
]

# Feature columns come from positions 2–22 (columns C–W), minus the raw sequence
feature_cols = [col for col in df_filtered.columns[2:23] if col != "sequence"]
columns_to_keep = core_cols + feature_cols + pca_cols

# Everything else (motifs, sequence, Processed_Motifs, extras) is discarded
columns_to_drop = [col for col in df_filtered.columns if col not in columns_to_keep]

# Final column order: core columns first, then the remaining kept columns in
# the order they appear in df_filtered
ordered_columns = core_cols + [
    col for col in df_filtered.columns
    if col in columns_to_keep and col not in core_cols
]

# Split on the label sentinel and select the ordered columns in one step
df_training = df_filtered.loc[df_filtered["ec_numbers"] != "MISSING", ordered_columns].copy()
df_prediction = df_filtered.loc[df_filtered["ec_numbers"] == "MISSING", ordered_columns].copy()

# Save to CSV
training_file = "/Users/carmenshero/Desktop/Datasets/Training.csv"
prediction_file = "/Users/carmenshero/Desktop/Datasets/Prediction.csv"

df_training.to_csv(training_file, index=False)
df_prediction.to_csv(prediction_file, index=False)

print(f"💾 Saved {len(df_training)} rows to Training.csv")
print(f"💾 Saved {len(df_prediction)} rows to Prediction.csv")


💾 Saved 43 rows to Training.csv
💾 Saved 42 rows to Prediction.csv
