In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def normalize_dataframe(df, scaler=None, inverse=False, row=None):
    """
    Normalize or denormalize a pandas DataFrame using MinMaxScaler.

    Parameters:
    - df (pd.DataFrame): DataFrame to normalize or denormalize. Its columns label the
      result and its index is carried over to any returned DataFrame.
    - scaler (MinMaxScaler, optional): Pre-fitted scaler; required when ``inverse`` is True.
      It is ignored during normalization, where a new MinMaxScaler is always fitted on ``df``.
    - inverse (bool): If True, performs denormalization instead.
    - row (pd.Series or array-like, optional): Only used when ``inverse`` is True. If provided,
      this single row is denormalized instead of the entire DataFrame; ``df`` is then used
      only for its column labels.

    Returns:
    - Normalization: a tuple ``(pd.DataFrame, MinMaxScaler)`` holding the scaled data and
      the scaler that was fitted on ``df``.
    - Denormalization: a pd.DataFrame, or a pd.Series indexed by ``df.columns`` when
      ``row`` is given. No scaler is returned in this mode.

    Raises:
    - ValueError: If ``inverse`` is True and no scaler is supplied.
    """
    if not inverse:
        scaler = MinMaxScaler()
        normalized_data = scaler.fit_transform(df)
        # Keep the caller's index so the scaled rows still align with the source frame.
        return pd.DataFrame(normalized_data, columns=df.columns, index=df.index), scaler

    if scaler is None:
        raise ValueError("Scaler must be provided for denormalization.")

    if row is not None:
        # Local import keeps this function self-contained; numpy is a pandas dependency.
        import numpy as np

        # The scaler expects a 2D (samples, features) array; asarray also accepts
        # plain lists/tuples in addition to Series and ndarrays.
        row_reshaped = np.asarray(row).reshape(1, -1)
        denormalized_row = scaler.inverse_transform(row_reshaped)
        return pd.Series(denormalized_row.flatten(), index=df.columns)  # Convert back to Series

    # Otherwise, denormalize the entire DataFrame, preserving its index
    denormalized_data = scaler.inverse_transform(df)
    return pd.DataFrame(denormalized_data, columns=df.columns, index=df.index)


In [14]:
# Small two-column frame used to demonstrate normalization
data = pd.DataFrame(
    {
        "A": [100, 200, 300, 400, 500],
        "B": [5, 15, 25, 35, 45],
    }
)

# Scale the sample; the fitted scaler is returned alongside the scaled frame
normalized_df, scaler = normalize_dataframe(data)

print("Normalized DataFrame:", normalized_df, sep="\n")


Normalized DataFrame:
      A     B
0  0.00  0.00
1  0.25  0.25
2  0.50  0.50
3  0.75  0.75
4  1.00  1.00


In [15]:
# Convert normalized data back to original scale using the scaler fitted above
denormalized_df = normalize_dataframe(normalized_df, scaler=scaler, inverse=True)

print("\nDenormalized DataFrame:")
# Round before casting: the inverse transform returns floats, and a bare astype(int)
# truncates, so a value such as 199.99999999999997 would be displayed as 199.
print(denormalized_df.round().astype(int))



Denormalized DataFrame:
     A   B
0  100   5
1  200  15
2  300  25
3  400  35
4  500  45


In [16]:
def read_data(path):
    """
    Load the CSV at ``path`` and return a cleaned DataFrame.

    The 'start_ts' and 'session_duration' columns are dropped, missing values are
    replaced with 0, and the 'role' and 'user' columns are replaced by their
    integer category codes.
    """
    frame = (
        pd.read_csv(path)
        .drop(columns=['start_ts', 'session_duration'])
        .fillna(0)
    )
    # Encode the categorical columns as integer codes, keeping their column positions
    for column in ('role', 'user'):
        frame[column] = frame[column].astype('category').cat.codes
    return frame

In [17]:
import os

# Location of the dataset CSV. Set the UEBA_DATA_PATH environment variable to point at
# another copy without editing the notebook; the default is the original location.
path = os.environ.get("UEBA_DATA_PATH", "/home/sathish/UEBA/data/data.csv")
df = read_data(path)

# Positional split: the first TRAIN_ROWS rows go to train_data, the rest to test_data.
TRAIN_ROWS = 276388
train_data, test_data = df.iloc[:TRAIN_ROWS], df.iloc[TRAIN_ROWS:]

In [None]:
# Scale the loaded dataset; the fitted scaler is kept for later denormalization.
# NOTE(review): the scaler is fitted on all of `df`, not only `train_data` — confirm this is intended.
normalized_df, scaler = normalize_dataframe(df)

print("Normalized DataFrame:", normalized_df, sep="\n")


In [None]:
# Map the scaled dataset back to its original scale with the scaler fitted on it
denormalized_df = normalize_dataframe(normalized_df, scaler=scaler, inverse=True)

print("\nDenormalized DataFrame:")
# Round before casting: the inverse transform returns floats, and a bare astype(int)
# truncates, so a value just below an integer would be displayed one too low.
print(denormalized_df.round().astype(int))

In [21]:
# Select a single row (e.g., first row)
row_to_denormalize = normalized_df.iloc[0]

# Denormalize only that row; normalized_df is passed for its column labels
denormalized_row = normalize_dataframe(normalized_df, scaler=scaler, inverse=True, row=row_to_denormalize)

print("\nDenormalized Row:")
# Round before casting: the inverse transform returns floats, and a bare astype(int)
# truncates, so a value just below an integer would be displayed one too low.
print(denormalized_row.round().astype(int))


Denormalized Row:
user                                    707
logon_on_own_pc_normal                    1
logon_on_other_pc_normal                  0
logon_on_own_pc_off_hour                  1
logon_on_other_pc_off_hour                0
logon_hour                                6
day_of_a_week                             5
device_connects_on_own_pc                 0
device_connects_on_other_pc               0
device_connects_on_own_pc_off_hour        0
device_connects_on_other_pc_off_hour      0
documents_copy_own_pc                     0
documents_copy_other_pc                   0
exe_files_copy_own_pc                     0
exe_files_copy_other_pc                   0
documents_copy_own_pc_off_hour            0
documents_copy_other_pc_off_hour          0
exe_files_copy_own_pc_off_hour            0
exe_files_copy_other_pc_off_hour          0
neutral_sites                            81
job_search                                0
hacking_sites                             0
neutral_sites

In [23]:
# Display the first row of the loaded data cast to int
# (presumably for comparison with the denormalized row printed earlier — confirm).
df.iloc[0].astype(int)

user                                    707
logon_on_own_pc_normal                    1
logon_on_other_pc_normal                  0
logon_on_own_pc_off_hour                  1
logon_on_other_pc_off_hour                0
logon_hour                                6
day_of_a_week                             5
device_connects_on_own_pc                 0
device_connects_on_other_pc               0
device_connects_on_own_pc_off_hour        0
device_connects_on_other_pc_off_hour      0
documents_copy_own_pc                     0
documents_copy_other_pc                   0
exe_files_copy_own_pc                     0
exe_files_copy_other_pc                   0
documents_copy_own_pc_off_hour            0
documents_copy_other_pc_off_hour          0
exe_files_copy_own_pc_off_hour            0
exe_files_copy_other_pc_off_hour          0
neutral_sites                            81
job_search                                0
hacking_sites                             0
neutral_sites_off_hour          