In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

In [2]:
# Step 1 - load the data, build a class-balanced sample, and explore it
data3 = pd.read_csv("../raw-data/yeast_data.csv")

# drop the first column, which only serves as a row index
data3 = data3.iloc[:, 1:]

# drop amino acid identity one-hot encoding columns (first 180 after dropping the index column)
# NOTE(review): this is a positional drop; it silently removes the wrong columns
# if the CSV layout ever changes - confirm the column order of the input file.
data3 = data3.iloc[:, 180:]

# set the random seed (reused for both sampling steps and the shuffle below)
random_seed = 1

# number of rows to keep from EACH class (defined once so both draws stay in sync)
n_per_class = 7500

# separate the two classes by label
# (label 0 is treated as the majority class and label 1 as the minority class)
majority = data3[data3['target_value'] == 0]
minority = data3[data3['target_value'] == 1]

# resample(replace=False) cannot draw more rows than a class contains;
# check up front so the failure names the class sizes instead of surfacing
# as a generic error from inside scikit-learn.
if min(len(majority), len(minority)) < n_per_class:
    raise ValueError(
        f"Cannot draw {n_per_class} rows per class without replacement: "
        f"class 0 has {len(majority)} rows, class 1 has {len(minority)} rows."
    )

# downsample the majority class (label 0) without replacement
majority_downsampled = resample(
    majority,
    replace=False,
    n_samples=n_per_class,
    random_state=random_seed
)

# downsample the minority class (label 1) to the same size, also without replacement
minority_downsampled = resample(
    minority,
    replace=False,
    n_samples=n_per_class,
    random_state=random_seed
)

# combine the two downsampled classes into one balanced frame
balanced = pd.concat([majority_downsampled, minority_downsampled])

# shuffle the rows and discard the original row labels
balanced = balanced.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# confirm the class counts are equal
print(balanced['target_value'].value_counts())

balanced.describe()

target_value
1    7500
0    7500
Name: count, dtype: int64


Unnamed: 0,A_-4_pssm,R_-4_pssm,N_-4_pssm,D_-4_pssm,C_-4_pssm,Q_-4_pssm,E_-4_pssm,G_-4_pssm,H_-4_pssm,I_-4_pssm,...,KARS160118_aaindex,KARS160119_aaindex,KARS160120_aaindex,KARS160121_aaindex,KARS160122_aaindex,str__AlphaHelix,str__Coil,str__Strand,str__Turn,target_value
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,...,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,-0.009423,-0.000123,0.014152,-0.008416,-0.002473,-0.010411,-0.027207,0.034204,-0.006342,-0.029456,...,-0.057133,-0.067266,-0.04741,-0.056217,0.063451,-0.036926,0.016695,0.029942,0.016429,0.5
std,0.993031,1.007926,1.009345,0.985395,1.006521,1.006512,0.970355,1.04293,0.995706,1.002494,...,1.023094,1.010846,1.019811,1.019033,1.006637,1.02416,1.261314,1.026263,1.032122,0.500017
min,-3.702108,-2.859091,-3.347503,-2.806862,-2.932563,-2.958117,-2.867022,-2.511551,-2.623656,-2.566789,...,-4.417969,-4.630692,-5.85457,-4.591849,-2.790285,-1.888825,-0.028222,-0.444558,-0.236844,0.0
25%,-0.745324,-0.760338,-0.816681,-0.636419,-0.437336,-0.613271,-0.831815,-0.519308,-0.753772,-0.661201,...,-0.709607,-0.682777,-0.655088,-0.682178,-0.658834,0.52943,-0.028222,-0.444558,-0.236844,0.0
50%,-0.322926,-0.410545,-0.093589,-0.326355,-0.021465,-0.222463,-0.153413,-0.187267,-0.005819,-0.343603,...,-0.015409,0.028585,0.124381,0.03631,-0.024865,0.52943,-0.028222,-0.444558,-0.236844,0.5
75%,0.521869,0.289039,0.629503,0.603835,0.394407,0.559152,0.52499,0.476814,0.368158,0.609192,...,0.606765,0.685694,0.713759,0.680645,0.704375,0.52943,-0.028222,-0.444558,-0.236844,1.0
max,3.478653,3.786962,3.883417,3.394404,6.216604,4.07642,3.238599,3.465178,4.85588,3.149976,...,4.177504,2.337356,1.450158,3.436936,5.218115,0.52943,35.432936,2.249425,4.222185,1.0


In [3]:
# Split the balanced frame into the label series and the predictor columns
outcome  = balanced['target_value']
features = balanced.drop('target_value', axis=1)

# Standardise every predictor column: fit the scaler and transform in one call
# NOTE(review): the scaler is fit on all rows of `balanced`; if a train/test
# split is made from the saved file later, the test rows have already
# influenced the scaling statistics - confirm this is intended.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Wrap the scaled array back into a DataFrame under the original column names,
# then append the label as the last column (index reset so rows line up by position)
scaled_ = pd.DataFrame(data=scaled_features, columns=features.columns).assign(
    **{'target_value': outcome.reset_index(drop=True)}
)

# Quick look at the first rows, followed by summary statistics
print(scaled_.head())

scaled_.describe()

   A_-4_pssm  R_-4_pssm  N_-4_pssm  D_-4_pssm  C_-4_pssm  Q_-4_pssm  \
0  -1.166467   3.063321  -0.464957  -0.637332  -1.258442   0.177605   
1  -0.315714  -0.407209   0.251464   2.509363   0.807513   0.177605   
2  -1.166467  -1.448368  -1.539589  -1.581340  -0.432060  -1.375564   
3   0.960415   0.633950   1.326096   0.306677  -0.018869   0.954189   
4  -0.741091  -0.407209  -0.464957  -0.637332  -0.432060  -0.598979   

   E_-4_pssm  G_-4_pssm  H_-4_pssm  I_-4_pssm  ...  KARS160118_aaindex  \
0  -0.130065  -0.530745  -0.375076  -0.947012  ...           -0.183969   
1   1.967388   0.106022  -0.375076   0.003443  ...           -0.346913   
2  -1.528368  -1.485896  -1.501882   1.904353  ...            0.328332   
3   0.569086   2.016322   0.000526  -0.630194  ...            0.681985   
4  -0.479641  -0.530745   1.127332  -0.313376  ...            0.371299   

   KARS160119_aaindex  KARS160120_aaindex  KARS160121_aaindex  \
0           -0.513191            0.575881           -0.238002  

Unnamed: 0,A_-4_pssm,R_-4_pssm,N_-4_pssm,D_-4_pssm,C_-4_pssm,Q_-4_pssm,E_-4_pssm,G_-4_pssm,H_-4_pssm,I_-4_pssm,...,KARS160118_aaindex,KARS160119_aaindex,KARS160120_aaindex,KARS160121_aaindex,KARS160122_aaindex,str__AlphaHelix,str__Coil,str__Strand,str__Turn,target_value
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,...,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,6.6317320000000005e-18,-9.473902999999998e-19,-1.0658140000000001e-17,-4.3579950000000007e-17,-3.647453e-17,-1.2316070000000001e-17,-4.310626e-17,-1.894781e-18,-1.894781e-18,-6.6317320000000005e-18,...,-3.789561e-18,-2.569796e-17,7.105427e-18,1.539509e-18,-2.2737370000000003e-17,-1.117921e-16,-1.9895200000000002e-17,7.034373000000001e-17,6.347515000000001e-17,0.5
std,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,...,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,0.500017
min,-3.718725,-2.83658,-3.330643,-2.840018,-2.911205,-2.928732,-2.92667,-2.441046,-2.628689,-2.531105,...,-4.262544,-4.514611,-5.694541,-4.451064,-2.835015,-1.808274,-0.03561282,-0.4623728,-0.2453987,0.0
25%,-0.7410907,-0.7542619,-0.8231681,-0.6373316,-0.4320599,-0.5989794,-0.8292166,-0.5307452,-0.7506783,-0.6301941,...,-0.6377679,-0.6089267,-0.5958938,-0.61429,-0.717547,0.5530135,-0.03561282,-0.4623728,-0.2453987,0.0
50%,-0.3157144,-0.4072089,-0.1067468,-0.3226621,-0.01886905,-0.2106873,-0.1300654,-0.2123618,0.0005258429,-0.3133757,...,0.04078276,0.09482615,0.1684588,0.09080126,-0.08773702,0.5530135,-0.03561282,-0.4623728,-0.2453987,0.5
75%,0.5350384,0.2868972,0.6096745,0.6213463,0.3943218,0.5658969,0.5690857,0.4244051,0.3761279,0.6370796,...,0.6489334,0.7449061,0.7464071,0.7231234,0.6367197,0.5530135,-0.03561282,-0.4623728,-0.2453987,1.0
max,3.512673,3.757428,3.83357,3.453372,6.178994,4.060526,3.36569,3.289856,4.883353,3.171627,...,4.139189,2.3789,1.468525,3.428023,5.120849,0.5530135,28.07977,2.162757,4.075001,1.0


In [4]:
# Write the scaled, class-balanced table (features plus target_value) to disk,
# leaving out the DataFrame's row index
scaled_.to_csv(
    path_or_buf='../raw-data/yeast-processed_v2.csv',
    index=False,
)