## Load the dataset

In [16]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


# Tab-separated source file, addressed relative to the notebook's working directory.
a1_synthetic_path = '../Data/A1-synthetic.txt'
a1_synthetic_data = pd.read_csv(a1_synthetic_path, delimiter='\t')

# Preview the top of the table as a quick sanity check on the parse.
print("First few rows of the dataset:")
display(a1_synthetic_data.head())

# Report the (rows, columns) shape and the header names exactly as read from the file.
print(f"Dataset shape: {a1_synthetic_data.shape}")
print(f"Column names: {list(a1_synthetic_data.columns)}")



First few rows of the dataset:


Unnamed: 0,#v1,v2,v3,v4,v5,v6,v7,v8,v9,z
0,37.34411,10.542156,0.969185,3.568534,96.798733,3.429026,75.810196,0,20.002459,11.805369
1,4.089849,11.894301,0.467775,1.279044,100.149383,3.190073,76.423095,0,12.702628,5.125025
2,-32.333439,10.968631,0.238486,1.410745,100.642075,3.093934,78.758727,1,10.723848,3.218553
3,-45.632977,11.509606,0.924938,3.404069,105.963016,2.884269,83.02775,0,19.946593,12.955092
4,-41.543394,10.117186,0.31518,1.02012,97.371423,2.81582,77.194463,0,11.105024,1.919094


Dataset shape: (1000, 10)
Column names: ['#v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'z']


## Normalize the data

In [17]:
# Min-max scaler with default settings; the statistics printed below confirm
# that every feature ends up with min 0 and max 1.
min_max_scaler = MinMaxScaler()

# Split the frame: 'z' is the target, every remaining column is a feature.
X = a1_synthetic_data.drop('z', axis=1)
y = a1_synthetic_data['z']

# fit_transform returns a bare NumPy array, so the DataFrame is rebuilt with
# X's column labels AND X's index. Preserving the index keeps X_normalized
# row-aligned with y: index-aligned operations such as pd.concat(..., axis=1)
# would otherwise pair the wrong rows (or introduce NaNs) whenever the source
# frame does not carry a default 0..n-1 index.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test
# split is performed later, confirm that fitting on all rows is intended.
X_normalized = pd.DataFrame(
    min_max_scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)
print("Features normalized successfully.")

# Show the first rows of the scaled feature table.
print("First few rows of normalized features:")
display(X_normalized.head())

# Summary statistics make the new value range of each column visible.
print("Statistics of the normalized features:")
display(X_normalized.describe())





Features normalized successfully.
First few rows of normalized features:


Unnamed: 0,#v1,v2,v3,v4,v5,v6,v7,v8,v9
0,0.874335,0.27115,0.969445,0.856127,0.374743,0.894346,0.414925,0.0,0.829772
1,0.541234,0.947532,0.467824,0.092297,0.482507,0.747468,0.434109,0.0,0.218086
2,0.17639,0.484485,0.238439,0.136236,0.498353,0.688374,0.507217,1.0,0.052275
3,0.043172,0.755097,0.92518,0.801257,0.669486,0.559499,0.640841,0.0,0.825091
4,0.084136,0.058568,0.315165,0.005914,0.393162,0.517425,0.458254,0.0,0.084215


Statistics of the normalized features:


Unnamed: 0,#v1,v2,v3,v4,v5,v6,v7,v8,v9
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.509967,0.502863,0.511535,0.472011,0.481202,0.462648,0.488116,0.313,0.315763
std,0.284657,0.292178,0.283411,0.382762,0.157389,0.169153,0.157022,0.463946,0.272606
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.260483,0.255133,0.267556,0.085399,0.379107,0.347958,0.383749,0.0,0.063845
50%,0.52884,0.495642,0.515506,0.680507,0.475146,0.452818,0.486644,0.0,0.250535
75%,0.755635,0.75908,0.757586,0.832666,0.589703,0.570753,0.596294,1.0,0.525065
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Save the normalized data

In [18]:
# Re-attach the target column to the scaled features, side by side.
normalized_data = pd.concat((X_normalized, y), axis="columns")

# Persist the combined table as CSV, leaving the row index out of the file.
normalized_data.to_csv('A1-synthetic-normalized.csv', index=False)
print("Normalized dataset saved as 'A1-synthetic-normalized.csv'.")


Normalized dataset saved as 'A1-synthetic-normalized.csv'.
