In [20]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold


Preprocessing

In [72]:
# Load the tab-separated results table. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-dtype inference on
# columns of this very wide file.
data = pd.read_csv('data/filtered_results.tsv', sep='\t', low_memory=False)

In [73]:
# Sanity check: (rows, columns) of the freshly loaded table.
data.shape

(40921, 707)

In [74]:
# Inspect the raw column names as read from the file.
data.columns

Index(['rowid', 'ccms_row_id', 'Peptidoform', 'Peptidoform ID',
       'Unmod peptidoform', 'Total', 'Total- Unmodified sequence',
       'Peptidoforms- Unmodified sequence', 'Proteins', 'Mass',
       ...
       '_dyn_#Patient_M2_healthyMale.Timepoint_2_unmod',
       '_dyn_#Patient_M2_healthyMale.Timepoint_3',
       '_dyn_#Patient_M2_healthyMale.Timepoint_3_unmod',
       '_dyn_#Patient_M3_healthyMale.Timepoint_1',
       '_dyn_#Patient_M3_healthyMale.Timepoint_1_unmod',
       '_dyn_#Patient_M3_healthyMale.Timepoint_2',
       '_dyn_#Patient_M3_healthyMale.Timepoint_2_unmod',
       '_dyn_#Patient_M3_healthyMale.Timepoint_3',
       '_dyn_#Patient_M3_healthyMale.Timepoint_3_unmod', 'id'],
      dtype='object', length=707)

Cleaning

In [75]:
# Collect every column whose name contains the literal '_unmod' marker and
# remove those columns from the working table.
is_unmod = data.columns.str.contains('_unmod', regex=False)
unmod_columns = data.columns[is_unmod].tolist()
data = data.drop(columns=unmod_columns)
data.shape

(40921, 371)

In [None]:
def rename_column(col_name):
    """Rewrite a raw '_dyn_#Patient_<id>.Timepoint_<n>' header as 'Patient_<id>_Timepoint_<n>'.

    The pattern is anchored at the start of the name only, so any text that
    follows the timepoint digits is discarded. Names that do not start with
    the pattern are returned unchanged.
    """
    parsed = re.match(r'_dyn_#Patient_(\S+)\.Timepoint_(\d+)', col_name)
    if parsed is None:
        return col_name
    patient_id, timepoint = parsed.groups()
    return f"Patient_{patient_id}_Timepoint_{timepoint}"

# Run every header through rename_column, then print the resulting column index.
data.columns = list(map(rename_column, data.columns))

print(data.columns)

Index(['rowid', 'ccms_row_id', 'Peptidoform', 'Peptidoform ID',
       'Unmod peptidoform', 'Total', 'Total- Unmodified sequence',
       'Peptidoforms- Unmodified sequence', 'Proteins', 'Mass',
       ...
       'Patient_M1_healthyMale_Timepoint_1',
       'Patient_M1_healthyMale_Timepoint_2',
       'Patient_M1_healthyMale_Timepoint_3',
       'Patient_M2_healthyMale_Timepoint_1',
       'Patient_M2_healthyMale_Timepoint_2',
       'Patient_M2_healthyMale_Timepoint_3',
       'Patient_M3_healthyMale_Timepoint_1',
       'Patient_M3_healthyMale_Timepoint_2',
       'Patient_M3_healthyMale_Timepoint_3', 'id'],
      dtype='object', length=371)


In [77]:
# Regex for the renamed intensity headers: Patient_<id>_Timepoint_<n>.
pattern = r'Patient_(\S+)_Timepoint_(\d+)'
# One (patient_id, timepoint) tuple per matching column, in column order.
patient_timepoints = [
    hit.groups()
    for hit in (re.search(pattern, name) for name in data.columns)
    if hit
]
pt_timepoints = pd.DataFrame(patient_timepoints, columns=['patient_id', 'timepoint'])

In [None]:
# Columns whose name matches `pattern` are treated as the intensity columns.
intensity_cols = [name for name in data.columns if re.search(pattern, name) is not None]

# Percentage of missing entries in each intensity column.
missing_percentages = data[intensity_cols].isna().sum().div(len(data)).mul(100)

# Summary statistics of those per-column percentages.
missing_summary = {
    stat: getattr(missing_percentages, stat)()
    for stat in ('min', 'max', 'mean', 'median')
}

print("\nMissing value percentages in intensity columns:")
for stat in missing_summary:
    value = missing_summary[stat]
    print(f"{stat}: {value:.2f}%")


Missing value percentages in intensity columns:
min: 90.85%
max: 99.95%
mean: 92.25%
median: 91.82%


In [86]:
# All columns of `data` that are not intensity columns, in their original order.
# A vectorised membership mask replaces the per-column `not in <list>` scan, and
# the former leading `intensity_cols[:4]` expression is dropped because its
# value was never displayed or used.
non_intensity_cols = data.columns[~data.columns.isin(intensity_cols)].tolist()
non_intensity_cols[:4]

['rowid', 'ccms_row_id', 'Peptidoform', 'Peptidoform ID']

Fix numerical values

In [79]:
# Remove every ',' from string cells of the intensity columns (presumably
# thousands separators — confirm against the raw file), then convert each
# column with pd.to_numeric, which raises if a value still cannot be parsed.
data[intensity_cols] = data[intensity_cols].replace({',': ''}, regex=True).apply(pd.to_numeric)

Which patients contribute to the missing intensity?

In [80]:
# NOTE(review): summing over the column axis counts missing intensities per ROW
# of `data`, not per patient, despite the variable names — confirm that this is
# the intended view for the question being asked.
patient_missing_data = data[intensity_cols].isna().sum(axis="columns")
# Keep rows with more than 5 missing intensity values and tabulate how often
# each missing-count occurs.
patients_with_missing_data = patient_missing_data.loc[patient_missing_data.gt(5)]
patients_with_missing_data.value_counts()

336    18922
335     3151
329     2260
334     1919
333     1164
       ...  
69         5
113        5
141        4
86         4
78         3
Name: count, Length: 328, dtype: int64

Impute missing values with 0 (in mass spectrometry, a missing value is treated as an intensity below the detection limit).

In [82]:
# Replace every NaN in the intensity columns with 0, then display the table
# shape as a check that no rows or columns were added or removed.
data[intensity_cols] = data[intensity_cols].fillna(0)
data.shape

(40921, 371)

Which features can we remove?
- Low-variance features? They are not very informative. (No variance is low in absolute terms, but relative to the others we can drop features with variance below 1e15.)

In [83]:
# Variance of each intensity column (pandas default, i.e. sample variance).
feature_variances = data[intensity_cols].var()

# Summary statistics over the per-column variances, used to pick a threshold.
variance_summary = {
    stat: getattr(feature_variances, stat)()
    for stat in ('min', 'max', 'mean', 'median', 'std')
}

variance_summary

{'min': np.float64(281812742380039.2),
 'max': np.float64(1.3564992257466317e+17),
 'mean': np.float64(4.1469822764016424e+16),
 'median': np.float64(3.90418979479439e+16),
 'std': np.float64(2.0007461603153216e+16)}

In [95]:
# Drop intensity columns whose variance is below the 1e15 threshold.
var_threshold = VarianceThreshold(threshold=1e15)
data_f_intensity = var_threshold.fit_transform(data[intensity_cols])
# get_support() is a boolean mask aligned with intensity_cols; keep the names
# of the columns that survived the filter.
selected_intensity_cols = [col for col, keep in zip(intensity_cols, var_threshold.get_support()) if keep]
data_f_intensity = pd.DataFrame(data_f_intensity, columns=selected_intensity_cols)
# Re-attach the non-intensity columns with pd.concat rather than np.hstack:
# stacking the float intensities with the object-typed metadata in one NumPy
# array would upcast everything to `object` and strip the numeric dtype from
# every intensity column. The index is reset so both frames align row-by-row
# on a plain positional index.
data_f = pd.concat(
    [data_f_intensity, data[non_intensity_cols].reset_index(drop=True)],
    axis=1,
)

In [91]:
# Compare the column count before and after the variance filter.
num_features_before = len(intensity_cols) + len(non_intensity_cols)
_, num_features_after = data_f.shape
num_features_dropped = num_features_before - num_features_after

print(
    f"Number of features before: {num_features_before}",
    f"Number of features after: {num_features_after}",
    f"Number of features dropped: {num_features_dropped}",
    sep="\n",
)

Number of features before: 371
Number of features after: 366
Number of features dropped: 5


In [None]:
# TODO: eventually use data_f in place of data (data = data_f); kept separate for now while testing.

In [96]:
# (rows, columns) of the intensity-only frame after the variance filter.
data_f_intensity.shape

(40921, 331)

High correlation within the same patient (for intensities?)

In [None]:
# Pairwise correlation between the remaining intensity columns.
patient_data_corr = data_f_intensity.corr()

# Highly correlated pairs: |r| > 0.92, evaluated once on the whole matrix
# instead of one scalar .iloc lookup per cell. np.tril(..., k=-1) keeps only
# the strict lower triangle, so each unordered pair appears once as
# (row label, column label) and the diagonal is excluded. NaN correlations
# compare as False and are therefore never selected.
corr_labels = patient_data_corr.columns
strong_mask = np.tril(patient_data_corr.abs().to_numpy() > 0.92, k=-1)
high_corr_pairs = {
    (corr_labels[row], corr_labels[col])
    for row, col in zip(*np.nonzero(strong_mask))
}
len(high_corr_pairs)

9

In [100]:
# From every highly correlated pair, mark the second member for removal.
features_to_drop = {second for _first, second in high_corr_pairs}

data_f2_intensity = data_f_intensity.drop(columns=features_to_drop)

# Report how many intensity columns the correlation filter removed.
num_features_before = len(data_f_intensity.columns)
num_features_after = len(data_f2_intensity.columns)
num_features_dropped = num_features_before - num_features_after

print(
    f"Number of features dropped: {num_features_dropped}",
    f"Number of features remaining: {num_features_after}",
    sep="\n",
)

Number of features dropped: 9
Number of features remaining: 322


In [103]:
# Remove the correlated intensity columns from the combined frame as well.
# This cell reassigns data_f, so a second run would find the columns already
# gone and raise KeyError; errors="ignore" makes the cell safe to re-run.
data_f = data_f.drop(columns=features_to_drop, errors="ignore")
data_f.shape

(40921, 357)