# Data Normalization

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('cleaned_data.csv')
dataset.head()

Unnamed: 0,ID,Sample_ID,Participant_ID,Dataset,Disease,Age,FT1,FT2,FT3,FT4,...,HPA049320_SBA4_rep1,HPA051620_SBA4_rep1,HPA054862_SBA4_rep1,HPA003901_SBA4_rep1,HPA035863_SBA4_rep1,HPA040052_SBA4_rep1,HPA041542_SBA4_rep1,HPA044582_SBA4_rep1,HPA045702_SBA4_rep1,HPA048982_SBA4_rep1
0,6,S237,P5,DNHS,1.0,11.72,2.538071,0.628931,0.367647,488.7,...,12.543301,12.608368,,,12.598509,12.452527,12.495444,12.800951,12.696679,12.602999
1,7,S220,P5,DNHS,1.0,13.41,2.531646,0.492611,0.275482,475.0,...,12.390816,12.571472,12.389882,12.490296,12.522449,12.519723,12.413517,12.917533,12.514924,12.618601
2,9,S91,P6,DNHS,1.0,11.76,2.237136,0.346021,0.176991,449.0,...,12.464468,12.599863,12.459217,12.509688,12.482434,12.507582,12.593243,12.733109,12.554774,12.587261
3,10,S49,P134,DNHS,0.0,,,,,,...,12.560804,12.531565,12.490397,12.419143,12.585085,12.509866,12.386933,12.820761,12.532538,12.661084
4,17,S79,P134,DNHS,0.0,,,,,,...,12.583934,12.599499,12.52721,12.636333,12.440678,12.555278,12.617759,12.909895,12.578293,12.685735


In [3]:
# Get summary statistics for whole data set
columns_to_normalize = list(range(4, dataset.shape[1]))
max_values = dataset.iloc[:,columns_to_normalize].max()
min_values = dataset.iloc[:,columns_to_normalize].min()
med_values = dataset.iloc[:,columns_to_normalize].median()

In [4]:
print(max_values)

Disease                 1.000000
Age                    13.500000
FT1                     3.215434
FT2                     0.628931
FT3                     0.552486
                         ...    
HPA040052_SBA4_rep1    12.907514
HPA041542_SBA4_rep1    12.814063
HPA044582_SBA4_rep1    13.232916
HPA045702_SBA4_rep1    12.821364
HPA048982_SBA4_rep1    12.904314
Length: 972, dtype: float64


In [5]:
print(min_values)

Disease                 0.000000
Age                     4.020000
FT1                     0.000000
FT2                     0.000000
FT3                     0.000000
                         ...    
HPA040052_SBA4_rep1    12.280399
HPA041542_SBA4_rep1    12.188121
HPA044582_SBA4_rep1    12.580456
HPA045702_SBA4_rep1    12.267231
HPA048982_SBA4_rep1    12.209317
Length: 972, dtype: float64


In [6]:
print(med_values)

Disease                 1.000000
Age                     7.010000
FT1                     0.512755
FT2                     0.251256
FT3                     0.196078
                         ...    
HPA040052_SBA4_rep1    12.540655
HPA041542_SBA4_rep1    12.558999
HPA044582_SBA4_rep1    12.808425
HPA045702_SBA4_rep1    12.534300
HPA048982_SBA4_rep1    12.635727
Length: 972, dtype: float64


In [7]:
# Normalize dataset
def normalize(df, columns, min_vals, max_vals):
    """
    Normalize given columns in the given DataFrame using the provided dequences of minimum and maximum values.

    X_normalized = (X - X_min) / (X_max - X_min)

    :param df: A pandas DataFrame to normalize.
    :param columns: A list of column indices to normalize.
    :param min_vals: A pandas Series of minimum values for each column.
    :param max_vals: A pandas Series of maximum values for each column.
    :return: A normalized pandas DataFrame.
    """
    # Create a copy of the DataFrame
    df_normalized = df.copy()

    # Normalize the specified columns
    for col in columns:
        # if col == 296:
        #     break
        col_range = max_vals[col-4] - min_vals[col-4]
        # Normalize the current column using scaling formula
        df_normalized.iloc[:, col] = (df.iloc[:, col] - min_vals[col-4]) / col_range
        # print(df_normalized.iloc[:, col])

    return df_normalized

In [8]:
dataset_normalized = normalize(dataset, columns_to_normalize, min_values, max_values)

  col_range = max_vals[col-4] - min_vals[col-4]
  df_normalized.iloc[:, col] = (df.iloc[:, col] - min_vals[col-4]) / col_range


In [9]:
# Export cleaned data to a new CSV file
dataset_normalized.to_csv('normalized_data.csv', index=False)
dataset_normalized

Unnamed: 0,ID,Sample_ID,Participant_ID,Dataset,Disease,Age,FT1,FT2,FT3,FT4,...,HPA049320_SBA4_rep1,HPA051620_SBA4_rep1,HPA054862_SBA4_rep1,HPA003901_SBA4_rep1,HPA035863_SBA4_rep1,HPA040052_SBA4_rep1,HPA041542_SBA4_rep1,HPA044582_SBA4_rep1,HPA045702_SBA4_rep1,HPA048982_SBA4_rep1
0,6,S237,P5,DNHS,1.0,0.812236,0.789340,1.000000,0.665441,0.516760,...,0.652301,0.415807,,,0.462314,0.274476,0.490977,0.337944,0.774992,0.566452
1,7,S220,P5,DNHS,1.0,0.990506,0.787342,0.783251,0.498623,0.502273,...,0.354054,0.364635,0.431287,0.472856,0.312034,0.381628,0.360091,0.516626,0.446992,0.588900
2,9,S91,P6,DNHS,1.0,0.816456,0.695749,0.550173,0.320354,0.474781,...,0.498109,0.404011,0.531496,0.494356,0.232971,0.362268,0.647220,0.233966,0.518906,0.543806
3,10,S49,P134,DNHS,0.0,,,,,,...,0.686534,0.309285,0.576559,0.393971,0.435790,0.365910,0.317621,0.368307,0.478779,0.650028
4,17,S79,P134,DNHS,0.0,,,,,,...,0.731774,0.403507,0.629765,0.634763,0.150468,0.438323,0.686387,0.504920,0.561350,0.685496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,380,S306,P29,FORDMD,1.0,0.338389,0.086389,,0.624138,0.555144,...,,0.241090,,0.491827,0.195225,,,0.526869,0.572850,
297,381,S300,P29,FORDMD,1.0,0.443513,0.081842,,0.670370,0.404991,...,0.464478,0.364822,,,0.564109,0.134264,0.573000,0.304281,,
298,382,S174,P30,FORDMD,1.0,0.396438,0.042027,,0.251389,0.370096,...,0.750643,0.227890,0.569516,0.569107,0.347931,0.299770,0.595741,0.278654,0.406736,0.635230
299,383,S94,P30,FORDMD,1.0,0.503584,0.037927,,0.238158,0.352120,...,0.939856,0.359113,0.435914,0.600841,0.104144,0.486267,0.701801,0.495797,0.581823,0.660074
