# Data Normalization

In [1]:
import pandas as pd

In [2]:
# Read the cleaned CSV into a DataFrame and preview its first rows
dataset = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
dataset.head()

In [3]:
# Per-column max, min and median for every column from index 4 onward
columns_to_normalize = list(range(4, dataset.shape[1]))
# Slice the selected columns once and reuse the slice for each statistic
_selected_columns = dataset.iloc[:, columns_to_normalize]
max_values = _selected_columns.max()
min_values = _selected_columns.min()
med_values = _selected_columns.median()

In [4]:
# Show the per-column maximums of the columns selected for normalization
print(max_values)

In [5]:
# Show the per-column minimums of the columns selected for normalization
print(min_values)

In [6]:
# Show the per-column medians of the columns selected for normalization
print(med_values)

In [7]:
# Normalize dataset
def normalize(df, columns, min_vals, max_vals, offset=4):
    """
    Min-max scale the given columns of a DataFrame using precomputed bounds.

    X_normalized = (X - X_min) / (X_max - X_min)

    A column whose maximum equals its minimum has a zero range; it is scaled
    with a range of 1 instead, so its non-missing values become 0.0 rather
    than NaN from a division by zero.

    :param df: A pandas DataFrame to normalize. It is not modified.
    :param columns: An iterable of positional column indices to normalize.
    :param min_vals: A sequence (e.g. a pandas Series) of per-column minimum
        values. The entry for column index ``col`` is read at position
        ``col - offset``.
    :param max_vals: A sequence of per-column maximum values, laid out the
        same way as ``min_vals``.
    :param offset: The column index that corresponds to position 0 of
        ``min_vals`` and ``max_vals``. Defaults to 4.
    :return: A copy of ``df`` with the selected columns normalized.
    :raises IndexError: If ``col - offset`` falls outside ``min_vals`` or
        ``max_vals`` for any requested column.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df_normalized = df.copy()

    # Read the bounds strictly by position. Indexing a label-indexed Series
    # with an integer key depends on pandas' deprecated positional fallback.
    lows = list(min_vals)
    highs = list(max_vals)
    n_bounds = min(len(lows), len(highs))

    for col in columns:
        pos = col - offset
        if not 0 <= pos < n_bounds:
            raise IndexError(
                f"column index {col} has no min/max entry at position {pos}"
            )

        low = lows[pos]
        col_range = highs[pos] - low
        if col_range == 0:
            # Constant column: avoid 0/0 and map its values to 0.0
            col_range = 1

        # Replace the whole column by position so integer columns can become
        # float without relying on implicit upcasting through .iloc assignment
        df_normalized.isetitem(col, (df.iloc[:, col] - low) / col_range)

    return df_normalized

In [8]:
# Scale the selected columns using the dataset-wide minimums and maximums
dataset_normalized = normalize(
    df=dataset,
    columns=columns_to_normalize,
    min_vals=min_values,
    max_vals=max_values,
)

In [9]:
# Write the normalized data to a new CSV file (row index omitted), then display it
dataset_normalized.to_csv(path_or_buf='normalized_data.csv', index=False)
dataset_normalized