# 5. Imputation of the missing values

So far, the values have been:
1. Filtered to remove those with > 30% missing values in each sample
2. VSN-normalized across the samples to account for sample-specific variance
3. Validated after normalization: the summary statistics suggest that the normalization was in fact successful

Now: 
1. Impute with the fine-tuned RandomForest algorithm
2. Check the EDA (MA, PCA)
3. Merge the 2 x 2 dataframes
4. Test for significance (t-test)
5. Correct for FDR
6. Report the significant proteins

In [1]:
import pandas as pd
import os
import yaml
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ParameterGrid

In [12]:
# Load configuration
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Directories for the VSN-normalized inputs and the imputed outputs
normalized_dir = config['datasets']['normalized_dir']
imputed_dir = config['datasets']['Imputed_dir']

# Create the output directory; exist_ok avoids the check-then-create race and
# is a no-op when the directory is already there.
# NOTE(review): the output path echoed in the next cell contains a '/ /'
# segment, which suggests 'Imputed_dir' in config.yaml carries stray
# whitespace -- confirm the config value.
os.makedirs(imputed_dir, exist_ok=True)

# Input files: VSN-normalized tables, one per group (asthma/control) and
# time point (before/after)
asthma_after = os.path.join(normalized_dir, 'df_after_asthma_VSN.csv')
asthma_before = os.path.join(normalized_dir, 'df_before_asthma_VSN.csv')
control_after = os.path.join(normalized_dir, 'df_after_control_VSN.csv')
control_before = os.path.join(normalized_dir, 'df_before_control_VSN.csv')

# Output files: one imputed table per input table
asthma_after_imputed = os.path.join(imputed_dir, 'asthma_after_imputed.csv')
asthma_before_imputed = os.path.join(imputed_dir, 'asthma_before_imputed.csv')
control_after_imputed = os.path.join(imputed_dir, 'control_after_imputed.csv')
control_before_imputed = os.path.join(imputed_dir, 'control_before_imputed.csv')

# Load the dataframes
df_asthma_after = pd.read_csv(asthma_after)
df_asthma_before = pd.read_csv(asthma_before)
df_control_after = pd.read_csv(control_after)
df_control_before = pd.read_csv(control_before)

# Pair each dataframe with the path its imputed version is written to
df_list = [
    (df_asthma_after, asthma_after_imputed),
    (df_asthma_before, asthma_before_imputed),
    (df_control_after, control_after_imputed),
    (df_control_before, control_before_imputed),
]


In [13]:
# Echo one resolved output path to check how it was built from the config.
# NOTE(review): the echoed value below contains a '/ /' segment -- verify the
# 'Imputed_dir' entry in config.yaml.
asthma_after_imputed

'./3- Imputed/ /asthma_after_imputed.csv'

In [14]:
# Disabled scratch code kept as a bare string literal: nothing inside it is
# executed; the cell only echoes the string itself.
# NOTE(review): the disabled helper references `np`, which is not imported in
# the cells shown here -- add `import numpy as np` before re-enabling it.
'''
# testing function
def replace_values_with_nan(df, columns_to_change, rows_to_change):
    df_copy = df.copy()
    
    for col_index, row_index in zip(columns_to_change, rows_to_change):
        df_copy.iloc[row_index, col_index] = np.nan

    return df_copy

# Parameters to be tested
params_grid = ParameterGrid({
    'n_estimators': [5, 10],
    'max_iter': [5, 10],
    'random_state': [25, 50]
})

'''

"\n# testing function\ndef replace_values_with_nan(df, columns_to_change, rows_to_change):\n    df_copy = df.copy()\n    \n    for col_index, row_index in zip(columns_to_change, rows_to_change):\n        df_copy.iloc[row_index, col_index] = np.nan\n\n    return df_copy\n\n# Parameters to be tested\nparams_grid = ParameterGrid({\n    'n_estimators': [5, 10],\n    'max_iter': [5, 10],\n    'random_state': [25, 50]\n})\n\n"

Before using this model for imputation, I checked and fine-tuned its parameters based on its performance (MAE scores) on the training data. 

In [15]:
def impute_missing_values(df, output, n_id_cols=1, random_state=42):
    """Impute missing values with a random-forest IterativeImputer and save them.

    The first ``n_id_cols`` columns are passed through unchanged; every
    remaining column is fed to the imputer and must therefore be numeric.

    Args:
        df: DataFrame whose leading ``n_id_cols`` columns are identifiers and
            whose remaining columns hold the values to impute.
        output: Path of the CSV file the imputed DataFrame is written to
            (written without the index).
        n_id_cols: Number of leading columns to keep out of the imputation.
            Defaults to 1, the split this function has always used.
        random_state: Seed given to both the imputer and the random forest.

    Returns:
        A DataFrame with the identifier columns followed by the imputed
        numeric columns, using the same column names and index as ``df``.

    Raises:
        ValueError: If any column after the first ``n_id_cols`` is not numeric.
    """
    # Split the DataFrame into identifier columns and the columns to impute
    id_cols = df.iloc[:, :n_id_cols]
    num_cols = df.iloc[:, n_id_cols:]

    # Fail early with a readable message; otherwise the estimator raises a
    # less helpful conversion error when it meets a text column.
    non_numeric = num_cols.select_dtypes(exclude=['number', 'bool']).columns
    if len(non_numeric) > 0:
        raise ValueError(
            'Non-numeric columns cannot be imputed: '
            f'{list(non_numeric)}; increase n_id_cols or drop them first.'
        )

    # Seed the forest explicitly so the result does not depend on whether the
    # imputer forwards its own seed to the estimator.
    forest = RandomForestRegressor(n_estimators=15, random_state=random_state)
    imputer = IterativeImputer(estimator=forest, max_iter=5, random_state=random_state)

    # Impute the missing values; the result is a plain array
    imputed_values = imputer.fit_transform(num_cols)

    # Restore the original column labels and row index
    imputed_num_cols_df = pd.DataFrame(
        imputed_values, columns=num_cols.columns, index=num_cols.index
    )

    # Re-attach the identifier columns in front of the imputed values
    imputed_df = pd.concat([id_cols, imputed_num_cols_df], axis=1)

    # Save the imputed table (not summary statistics) for the next step
    imputed_df.to_csv(output, index=False)
    return imputed_df

# Impute each loaded table and write it to its paired output path; the
# returned DataFrame is discarded here.
for df, output in df_list:
    impute_missing_values(df, output)




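The RandomForest regressor <strong>has been fine-tuned</strong>: the configuration returning the lowest MSE on the training data was selected. The values checked were:
1. n_estimators = 5, 10, 15
2. max_iter = 5, 10, 15
3. random_state = 25, 42

Selected values (as used in the imputer above): n_estimators = 15, max_iter = 5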



In [19]:
# NOTE(review): `imputed` is not defined in any cell shown here -- presumably
# a leftover from an earlier session; confirm before re-running this cell.
# to_csv is called without index=False, so the row index is written as an
# extra column.
imputed.to_csv('Control_before_VSN_imp.csv')

In [12]:
# Display the loaded asthma "after" table for inspection.
df_asthma_after

Unnamed: 0.1,Unnamed: 0,Protiens,F5..Sample.A..asthma,F6..Sample.A..asthma,F8..Sample.A..asthma,F10..Sample.A..asthma,F18..Sample.A..asthma,F19..Sample.A..asthma,F20..Sample.A..asthma,F21..Sample.A..asthma,...,F60..Sample.A..asthma,F82..Sample.A..asthma,F88..Sample.A..asthma,F94..Sample.A..asthma,F95..Sample.A..asthma,F96..Sample.A..asthma,F97..Sample.A..asthma,F98..Sample.A..asthma,F99..Sample.A..asthma,F100..Sample.A..asthma
0,0,albumin,11.867328,13.983982,8.51687,13.576901,12.709125,13.407292,13.166174,11.937969,...,10.427792,15.416472,13.261666,10.327066,12.057381,10.384369,11.065673,11.580825,12.874299,12.355676
1,1,surf A2,7.778194,12.922241,,11.500214,10.962273,9.68069,11.503674,9.140556,...,6.857821,13.213581,11.556957,8.073051,10.005652,,10.16998,9.70514,10.897783,10.621589
2,2,DENN,16.415786,15.304577,14.974684,14.02947,12.641401,13.421425,10.200717,12.500683,...,14.044141,12.879985,13.723785,,,10.469186,10.61844,11.253304,,14.420763
3,3,Keratin 1,16.464254,16.870155,17.788796,16.887898,16.305967,14.896305,15.579106,17.16271,...,16.593609,17.261702,14.903752,18.059772,17.064841,17.698952,16.426762,16.751745,16.22735,17.622387
4,4,keratin2,16.725344,16.893565,17.74341,17.475137,16.699391,15.347409,15.79644,17.504678,...,17.058318,17.089249,18.19238,17.598838,16.68275,17.99011,17.318088,17.418308,17.224872,17.243595
5,5,cyto 10,16.119531,16.681106,16.136367,17.44506,15.845344,15.312951,14.932569,16.908511,...,16.663684,17.023072,18.757408,15.637569,16.160547,17.309399,16.576804,17.544323,18.043554,15.401402
6,6,plastin2,12.691985,12.749539,11.826154,11.668078,12.279927,13.625693,11.683128,11.203004,...,,13.199158,11.316446,12.821226,10.99919,10.808601,12.305174,11.243163,9.813195,12.842261
7,7,trypsin1,12.319422,10.95006,10.044335,10.468294,12.381193,13.601461,12.081934,11.70653,...,11.258843,10.013726,10.799044,10.842493,9.456368,9.141754,11.969453,11.083495,9.622221,12.118483
8,8,calm5,12.674204,12.513353,11.338722,11.069031,11.461445,12.014209,10.814997,9.96659,...,10.646379,10.577566,9.463457,11.303662,10.318429,8.111479,10.651006,10.344422,7.917294,11.209066
9,9,compC3,11.869177,10.557226,11.42455,,11.468666,12.545026,10.915471,9.959225,...,9.933805,,10.71954,11.718831,9.205036,9.726975,,10.535747,8.327819,11.744453
