In [25]:
# This notebook demonstrates various data transformation techniques.
# Import pandas for data manipulation, numpy for numerical operations, and preprocessing from scikit-learn.
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [26]:
# Read the source table; the first line of the file supplies the column labels.
df = pd.read_csv(filepath_or_buffer='datatransform.csv', header=0)
# Preview the leading ten records to get a feel for the raw values.
df.head(n=10)

Unnamed: 0,Col 1,col 2,col 3,col 4
0,9707,245,495276,0.019599
1,8593,204,513860,0.016722
2,9529,424,976523,0.009758
3,4553,293,150696,0.030213
4,1418,285,142106,0.009978
5,2177,562,291927,0.007457
6,6420,425,376294,0.017061
7,3428,419,41079,0.083449
8,8162,332,552404,0.014775
9,6804,64,210861,0.032268


In [27]:
# Min-max scale 'Col 1' into the range [0, 1]: subtract the column minimum,
# then divide by the column range (max - min).
# A constant column (max == min) would divide zero by zero and yield NaN.
col1 = df['Col 1']
col1_min = col1.min()  # evaluated once and reused in numerator and denominator
df['normalized column 1'] = (col1 - col1_min) / (col1.max() - col1_min)
# Persist the frame, including the new column, without writing the row index.
# `index` is a boolean flag, so pass False explicitly instead of relying on 0
# being falsy.
# NOTE(review): the file name reads 'mixmaxed' - presumably 'minmaxed' was
# intended; left unchanged so anything that reads this file keeps working.
df.to_csv(r'dt_mixmaxed_old.csv', index=False)
# Preview the first five rows to check the result.
df.head()

Unnamed: 0,Col 1,col 2,col 3,col 4,normalized column 1
0,9707,245,495276,0.019599,0.970188
1,8593,204,513860,0.016722,0.845802
2,9529,424,976523,0.009758,0.950313
3,4553,293,150696,0.030213,0.394707
4,1418,285,142106,0.009978,0.044663


In [28]:
# Z-score standardise 'col 2': centre the values on the column mean and divide
# by the column standard deviation, giving a column with mean 0 and unit spread.
# Note: pandas' Series.std() defaults to the sample standard deviation (ddof=1).
col2 = df['col 2']
df['standardized col 2'] = (col2 - col2.mean()) / col2.std()
# Persist the frame, including the new column, without writing the row index.
# `index` is a boolean flag, so pass False explicitly instead of 0.
df.to_csv(r'dt_zscore_col2.csv', index=False)

In [29]:
# Show the frame (pandas elides the middle rows of a long frame) to confirm
# that 'standardized col 2' now sits next to 'normalized column 1'.
df

Unnamed: 0,Col 1,col 2,col 3,col 4,normalized column 1,standardized col 2
0,9707,245,495276,0.019599,0.970188,-0.408101
1,8593,204,513860,0.016722,0.845802,-0.671431
2,9529,424,976523,0.009758,0.950313,0.741558
3,4553,293,150696,0.030213,0.394707,-0.099813
4,1418,285,142106,0.009978,0.044663,-0.151194
...,...,...,...,...,...,...
204,3422,109,832699,0.004110,0.268423,-1.281585
205,5761,416,160218,0.035957,0.529589,0.690177
206,9702,399,421017,0.023044,0.969629,0.580991
207,4813,531,633224,0.007601,0.423738,1.428785


In [30]:
# Natural-log transform of 'col 3'; compressing large values this way can
# reduce right skew.
# NOTE(review): assumes every 'col 3' value is strictly positive - zeros would
# become -inf and negatives NaN. Confirm against the full dataset.
df["natural_log_amount col 3"] = np.log(df["col 3"])
# Persist the frame, including the new column, without writing the row index.
# `index` is a boolean flag, so pass False explicitly instead of 0.
df.to_csv(r'dt_logarithmic_column3.csv', index=False)

In [31]:
# Show the frame (middle rows elided by pandas) to inspect the new
# 'natural_log_amount col 3' column alongside the earlier derived columns.
df

Unnamed: 0,Col 1,col 2,col 3,col 4,normalized column 1,standardized col 2,natural_log_amount col 3
0,9707,245,495276,0.019599,0.970188,-0.408101,13.112870
1,8593,204,513860,0.016722,0.845802,-0.671431,13.149706
2,9529,424,976523,0.009758,0.950313,0.741558,13.791754
3,4553,293,150696,0.030213,0.394707,-0.099813,11.923020
4,1418,285,142106,0.009978,0.044663,-0.151194,11.864329
...,...,...,...,...,...,...,...
204,3422,109,832699,0.004110,0.268423,-1.281585,13.632428
205,5761,416,160218,0.035957,0.529589,0.690177,11.984291
206,9702,399,421017,0.023044,0.969629,0.580991,12.950428
207,4813,531,633224,0.007601,0.423738,1.428785,13.358580


In [32]:
# Base-10 log transform of 'col 3', stored next to the natural-log version so
# the two scalings can be compared.
# NOTE(review): assumes every 'col 3' value is strictly positive - confirm.
df["log10_amount col 3"] = np.log10(df["col 3"])
# Persist the frame, including the new column, without writing the row index.
# `index` is a boolean flag, so pass False explicitly instead of 0.
df.to_csv(r'dt_log10_col3.csv', index=False)

In [33]:
# Show the frame (middle rows elided by pandas) so the natural-log and
# base-10-log columns derived from 'col 3' can be compared side by side.
df

Unnamed: 0,Col 1,col 2,col 3,col 4,normalized column 1,standardized col 2,natural_log_amount col 3,log10_amount col 3
0,9707,245,495276,0.019599,0.970188,-0.408101,13.112870,5.694847
1,8593,204,513860,0.016722,0.845802,-0.671431,13.149706,5.710845
2,9529,424,976523,0.009758,0.950313,0.741558,13.791754,5.989682
3,4553,293,150696,0.030213,0.394707,-0.099813,11.923020,5.178102
4,1418,285,142106,0.009978,0.044663,-0.151194,11.864329,5.152612
...,...,...,...,...,...,...,...,...
204,3422,109,832699,0.004110,0.268423,-1.281585,13.632428,5.920488
205,5761,416,160218,0.035957,0.529589,0.690177,11.984291,5.204711
206,9702,399,421017,0.023044,0.969629,0.580991,12.950428,5.624300
207,4813,531,633224,0.007601,0.423738,1.428785,13.358580,5.801557


In [34]:
# Square-root transform of 'col 4'.
# NOTE(review): assumes 'col 4' has no negative values - np.sqrt would return
# NaN for them. Confirm against the full dataset.
df["sqrt_amount col 4"] = np.sqrt(df["col 4"])
# Persist the frame, including the new column, without writing the row index.
# `index` is a boolean flag, so pass False explicitly instead of 0.
df.to_csv(r'dt_sqrt_col4.csv', index=False)

In [35]:
# Show the frame (middle rows elided by pandas) with the four original columns
# followed by every derived column, ending with 'sqrt_amount col 4'.
df

Unnamed: 0,Col 1,col 2,col 3,col 4,normalized column 1,standardized col 2,natural_log_amount col 3,log10_amount col 3,sqrt_amount col 4
0,9707,245,495276,0.019599,0.970188,-0.408101,13.112870,5.694847,0.139997
1,8593,204,513860,0.016722,0.845802,-0.671431,13.149706,5.710845,0.129315
2,9529,424,976523,0.009758,0.950313,0.741558,13.791754,5.989682,0.098783
3,4553,293,150696,0.030213,0.394707,-0.099813,11.923020,5.178102,0.173819
4,1418,285,142106,0.009978,0.044663,-0.151194,11.864329,5.152612,0.099892
...,...,...,...,...,...,...,...,...,...
204,3422,109,832699,0.004110,0.268423,-1.281585,13.632428,5.920488,0.064106
205,5761,416,160218,0.035957,0.529589,0.690177,11.984291,5.204711,0.189624
206,9702,399,421017,0.023044,0.969629,0.580991,12.950428,5.624300,0.151803
207,4813,531,633224,0.007601,0.423738,1.428785,13.358580,5.801557,0.087182


# Another approach: scaling with scikit-learn

In [36]:
# Re-read the source file so this section starts from the original columns
# only, without the derived columns added to the previous frame.
df = pd.read_csv(filepath_or_buffer='datatransform.csv', header=0)
# Preview the first five rows (head's default count) of the fresh frame.
df.head(n=5)

Unnamed: 0,Col 1,col 2,col 3,col 4
0,9707,245,495276,0.019599
1,8593,204,513860,0.016722
2,9529,424,976523,0.009758
3,4553,293,150696,0.030213
4,1418,285,142106,0.009978


In [37]:
# Min-max scale every column of the frame at once with scikit-learn's
# MinMaxScaler, mapping each column independently onto [0, 1].
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
# Keep the column labels: the scaler output is rebuilt into a DataFrame below,
# and a later cell reuses `names` as well.
names = df.columns
# Learn each column's min/max and apply the scaling in one step.
d = scaler.fit_transform(df)
scaled_df = pd.DataFrame(d, columns=names)
# Persist the scaled frame without writing the row index.
# `index` is a boolean flag, so pass False explicitly instead of 0.
scaled_df.to_csv(r'dt_minmax_scaler.csv', index=False)
# Preview the first five rows of the scaled data.
scaled_df.head()

Unnamed: 0,Col 1,col 2,col 3,col 4
0,0.970188,0.38018,0.489992,0.038325
1,0.845802,0.306306,0.508836,0.03228
2,0.950313,0.702703,0.977979,0.017646
3,0.394707,0.466667,0.140586,0.060627
4,0.044663,0.452252,0.131875,0.018109


In [38]:
# Rescale 'Col 1' with scikit-learn's normalize(). The column is handed over as
# a single row (shape 1 x n), so with the function's defaults (L2 norm, applied
# per row) the whole column is divided by its Euclidean length.
x_array = np.array(df['Col 1'])
normalized_arr = preprocessing.normalize(x_array.reshape(1, -1))
print(normalized_arr)

[[0.11131226 0.09853778 0.10927109 0.05221023 0.01626051 0.02496413
  0.07361952 0.03930961 0.09359541 0.07802293 0.09392796 0.0849262
  0.04270391 0.11295207 0.10415672 0.04225669 0.10140458 0.07529374
  0.08846957 0.01737283 0.03594972 0.07968568 0.10743634 0.06428521
  0.05020347 0.0466601  0.02389768 0.07583269 0.04158012 0.01648986
  0.04210761 0.06984681 0.03159218 0.05890708 0.07857336 0.05804704
  0.03219994 0.07066098 0.0397339  0.06981241 0.02542282 0.09038459
  0.06578741 0.06889503 0.06840194 0.0161229  0.08860717 0.0168224
  0.08783887 0.03761247 0.11099118 0.0690785  0.0842611  0.10555572
  0.05899882 0.07066098 0.08631373 0.07544281 0.03965363 0.08663481
  0.04727933 0.09967304 0.09569391 0.07617671 0.1121723  0.08337812
  0.04416025 0.05629256 0.10420259 0.05857454 0.10268891 0.01642105
  0.0760047  0.03054866 0.10422552 0.06595942 0.07223199 0.05378124
  0.07230079 0.05678565 0.02778506 0.10055601 0.02031991 0.06302382
  0.02722317 0.06426227 0.07758718 0.05164834 0.07

In [39]:
# Normalize every column of the frame to unit length: axis=0 makes normalize()
# rescale each column (feature) rather than each row.
d = preprocessing.normalize(df, axis=0)
# Take the labels straight from `df` so this cell does not depend on a
# variable left behind by an earlier cell.
scaled_df1 = pd.DataFrame(d, columns=df.columns)
# Persist the normalized frame without writing the row index.
# `index` is a boolean flag, so pass False explicitly instead of 0.
scaled_df1.to_csv(r'dt_normalized.csv', index=False)
scaled_df1

Unnamed: 0,Col 1,col 2,col 3,col 4
0,0.111312,0.049060,0.061574,0.018667
1,0.098538,0.040850,0.063885,0.015927
2,0.109271,0.084904,0.121404,0.009294
3,0.052210,0.058672,0.018735,0.028776
4,0.016261,0.057070,0.017667,0.009504
...,...,...,...,...
204,0.039241,0.021827,0.103524,0.003914
205,0.066063,0.083302,0.019919,0.034247
206,0.111255,0.079898,0.052342,0.021948
207,0.055192,0.106331,0.078724,0.007239
