In [1]:
import math
import pandas as pd

In [39]:
def discretization_equal_width(df, attributes):
    """Bin numeric columns into k equal-width intervals.

    The number of bins is ``k = max(10, int(1 + (10/3) * log10(n)))`` (a
    Sturges-style rule with a floor of 10), where ``n`` is the row count.
    For every name in ``attributes`` a new column ``<name>_discretized`` is
    added whose entries are ``(interval_min, interval_max)`` tuples.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    attributes : iterable of str
        Names of the numeric columns to discretize.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra ``*_discretized`` column per attribute.

    Notes
    -----
    * Intervals are half-open ``[min, max)``; a value that matches none of
      them (the column maximum) is placed in the last interval.
    * Missing values (NaN/None) are mapped to the interval ``(nan, nan)``
      instead of being silently counted in the last bin. The column therefore
      still contains only tuples, and a midpoint computed from such an entry
      is NaN.
    * An empty frame uses the minimum of 10 bins (``log10(0)`` is undefined).
    """
    n = len(df)
    # math.log10(0) raises ValueError, so guard the empty-frame case.
    k = max(10, int(1 + (10/3) * math.log10(n))) if n > 0 else 10

    print("k value:"+str(k))
    discretized_df = df.copy()

    # Placeholder interval for rows whose value is missing.
    missing_interval = (math.nan, math.nan)

    for attribute in attributes:
        # Width of each interval for this attribute
        min_value = df[attribute].min()
        max_value = df[attribute].max()
        width = (max_value - min_value) / k
        print("min_value for the attribute :"+attribute+" "+str(min_value)+"\n")
        print("max_value for the attribute :"+attribute+" "+str(max_value)+"\n")
        print("width for the attribute :"+attribute+" "+str(width))

        # Consecutive intervals share their boundary expression, so the upper
        # edge of bin i is exactly the lower edge of bin i + 1.
        intervals = [
            (min_value + i * width, min_value + (i + 1) * width)
            for i in range(k)
        ]

        discretized_values = []
        for value in df[attribute]:
            if pd.isna(value):
                # NaN compares False against every bound; without this guard
                # it would fall through to the last interval.
                discretized_values.append(missing_interval)
                continue

            for interval in intervals:
                interval_min, interval_max = interval
                if interval_min <= value < interval_max:
                    discretized_values.append(interval)
                    break
            else:
                # Only the column maximum (or a zero-width column) gets here.
                discretized_values.append(intervals[-1])

        discretized_df[attribute + "_discretized"] = discretized_values

    return discretized_df


In [None]:
def average(discretized_df, attributes):
    """Replace interval tuples with their midpoints, in place.

    Parameters
    ----------
    discretized_df : pandas.DataFrame
        Frame whose ``attributes`` columns hold ``(interval_min, interval_max)``
        tuples. The frame is modified in place: each listed column is
        overwritten with the midpoint of its interval.
    attributes : iterable of str
        Names of the interval columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``discretized_df`` object that was passed in.

    Notes
    -----
    An entry that is not a tuple triggers a warning and becomes NaN, so the
    result always has one value per row. Skipping such entries would leave
    the list shorter than the frame and make the column assignment fail.
    """
    for attribute in attributes:
        midpoints = []

        for interval in discretized_df[attribute]:
            if isinstance(interval, tuple):
                interval_min, interval_max = interval
                midpoints.append((interval_min + interval_max) / 2)
            else:
                print(f"Warning: Expected tuple but got {interval} in {attribute}")
                # Keep row alignment: one output value per input row.
                midpoints.append(float("nan"))

        discretized_df[attribute] = midpoints

    return discretized_df


In [25]:
def min_max_normalization(df, attributes):
    """Rescale numeric columns to the [0, 1] range with min-max scaling.

    For every name in ``attributes`` a new column ``<name>_normalized`` is
    added, computed as ``(value - min) / (max - min)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    attributes : iterable of str
        Names of the numeric columns to normalize.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra ``*_normalized`` column per attribute.

    Notes
    -----
    A warning is printed when the column range is below 1e-5. When the range
    is exactly zero (constant column) the division is skipped and every
    non-missing value maps to 0.0, instead of producing NaN from 0/0.
    Missing values stay missing.
    """
    normalized_df = df.copy()

    for attribute in attributes:
        min_value = df[attribute].min()
        max_value = df[attribute].max()
        value_range = max_value - min_value

        if value_range < 1e-5:
            print(f"Warning: Narrow range detected in {attribute} (Min: {min_value}, Max: {max_value})")

        shifted = df[attribute] - min_value
        if value_range == 0:
            # Constant column: shifted is already 0 for every present value,
            # and dividing by zero would turn all of them into NaN.
            normalized_df[attribute + "_normalized"] = shifted.astype(float)
        else:
            normalized_df[attribute + "_normalized"] = shifted / value_range

    return normalized_df


In [9]:
def load_dataset(path):
    """Read the comma-separated file at ``path`` into a DataFrame.

    The file is parsed with pandas' Python parser engine.
    """
    return pd.read_csv(path, sep=',', engine='python')

In [41]:
# Forward slash instead of "..\D...": "\D" is an invalid escape sequence
# (a SyntaxWarning on recent Python versions), and "/" resolves on every OS.
path = "../DatasetExos.csv"
df = load_dataset(path)

In [42]:
# Summarize each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9009 entries, 0 to 9008
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ep (ms)   8996 non-null   object 
 1   Acc_x     9005 non-null   float64
 2   Acc_y     9009 non-null   float64
 3   Acc_z     9008 non-null   float64
 4   Gyro_x    9008 non-null   float64
 5   Gyro_y    9002 non-null   float64
 6   Gyro_z    9009 non-null   float64
 7   ID        9009 non-null   object 
 8   Exercise  9009 non-null   object 
 9   Category  9003 non-null   object 
 10  Set       8999 non-null   float64
dtypes: float64(7), object(4)
memory usage: 774.3+ KB


In [43]:

# Attributes to normalize
attributes = ['Acc_x', 'Acc_y', 'Acc_z', 'Gyro_x', 'Gyro_y', 'Gyro_z']
normalized_df = min_max_normalization(df, attributes)
# Only the last expression of a cell is rendered automatically, so the head
# preview has to be printed explicitly or its result is silently discarded.
print(normalized_df.head(20))
normalized_df.tail(20)

Unnamed: 0,ep (ms),Acc_x,Acc_y,Acc_z,Gyro_x,Gyro_y,Gyro_z,ID,Exercise,Category,Set,Acc_x_normalized,Acc_y_normalized,Acc_z_normalized,Gyro_x_normalized,Gyro_y_normalized,Gyro_z_normalized
8989,2019-01-20 17:33:24.000,0.062,-0.838,0.151667,21.0002,-13.1094,-5.6952,E,row,medium,40.0,0.090141,0.2221,0.462138,0.629772,0.415169,0.496215
8990,2019-01-20 17:33:24.200,0.0095,-0.389,0.257,-9.0732,5.6828,1.1828,E,row,medium,40.0,0.09014,0.363051,0.525275,0.555941,0.459791,0.496566
8991,2019-01-20 17:33:24.400,0.066,-0.954333,0.140333,-40.2074,12.8658,10.7682,E,row,medium,40.0,0.090141,0.18558,0.455345,0.479507,0.476848,0.497054
8992,2019-01-20 17:33:24.600,0.028,-1.2185,-0.0115,-31.9024,12.427,18.817,E,row,medium,40.0,0.09014,0.102653,0.364336,0.499895,0.475806,0.497464
8993,2019-01-20 17:33:24.800,-0.038667,-1.25,-0.127667,2.4024,-11.951,0.9634,E,row,medium,40.0,0.09014,0.092764,0.294705,0.584114,0.417919,0.496555
8994,2019-01-20 17:33:25.000,-0.021,-0.9235,-0.0255,3.8172,-8.8048,0.8048,E,row,medium,40.0,0.09014,0.19526,0.355944,0.587588,0.42539,0.496547
8995,2019-01-20 17:33:25.200,-0.045,-1.047333,-0.068,-1.5852,-4.2438,2.061,E,row,medium,40.0,0.09014,0.156386,0.33047,0.574325,0.43622,0.496611
8996,2019-01-20 17:33:25.400,-0.0415,-1.311,-0.135,25.866,-8.2072,-17.329,E,row,medium,40.0,0.09014,0.073615,0.29031,0.641718,0.426809,0.495623
8997,2019-01-20 17:33:25.600,0.066667,-1.221333,0.043,34.0366,-1.134,-19.5,E,row,medium,40.0,0.090141,0.101763,0.397003,0.661777,0.443605,0.495512
8998,2019-01-20 17:33:25.800,0.0705,-0.7295,0.1725,20.305,-16.2316,2.8536,E,row,medium,40.0,0.090141,0.256161,0.474625,0.628065,0.407755,0.496651


In [44]:
# Discretize the "*_normalized" column of every attribute
discretized_attributes = [f"{attr}_normalized" for attr in attributes]
discretized_df = discretization_equal_width(
    normalized_df,
    discretized_attributes,
)
print(discretized_df.head(20))
print(discretized_df.tail(20))

k value:14
min_value for the attribute :Acc_x_normalized 0.0

max_value for the attribute :Acc_x_normalized 1.0

width for the attribute :Acc_x_normalized 0.07142857142857142
min_value for the attribute :Acc_y_normalized 0.0

max_value for the attribute :Acc_y_normalized 1.0

width for the attribute :Acc_y_normalized 0.07142857142857142
min_value for the attribute :Acc_z_normalized 0.0

max_value for the attribute :Acc_z_normalized 1.0

width for the attribute :Acc_z_normalized 0.07142857142857142
min_value for the attribute :Gyro_x_normalized 0.0

max_value for the attribute :Gyro_x_normalized 1.0

width for the attribute :Gyro_x_normalized 0.07142857142857142
min_value for the attribute :Gyro_y_normalized 0.0

max_value for the attribute :Gyro_y_normalized 1.0

width for the attribute :Gyro_y_normalized 0.07142857142857142
min_value for the attribute :Gyro_z_normalized 0.0

max_value for the attribute :Gyro_z_normalized 1.0

width for the attribute :Gyro_z_normalized 0.07142857142857

In [50]:
# Averaging the discretized intervals

discretized_attributes2 = [attr + "_discretized" for attr in discretized_attributes]
print(discretized_attributes2)
# average() overwrites the interval columns of the frame it receives, so it is
# given a copy: discretized_df keeps its interval tuples and this cell can be
# re-run without feeding already-averaged floats back into average().
discretized_df2 = average(discretized_df.copy(), discretized_attributes2)

# Saving results
discretized_df2.to_csv("final_discretized_normalized_data.csv", index=False)

['Acc_x_normalized_discretized', 'Acc_y_normalized_discretized', 'Acc_z_normalized_discretized', 'Gyro_x_normalized_discretized', 'Gyro_y_normalized_discretized', 'Gyro_z_normalized_discretized']


In [51]:
# Preview the first 20 rows of the frame returned by average().
print(discretized_df2.head(20))

                    ep (ms)     Acc_x     Acc_y     Acc_z   Gyro_x   Gyro_y  \
0   2019-01-11 15:08:05.200  0.013500  0.977000 -0.071000  -1.8904   2.4392   
1   2019-01-11 15:08:05.400 -0.001500  0.970500 -0.079500  -1.6826  -0.8904   
2   2019-01-11 15:08:05.600  0.001333  0.971667 -0.064333   2.5608  -0.2560   
3   2019-01-11 15:08:05.800 -0.024000  0.957000 -0.073500   8.0610  -4.5244   
4   2019-01-11 15:08:06.000 -0.028000  0.957667 -0.115000   2.4390  -1.5486   
5   2019-01-11 15:08:06.200 -0.026000  0.965000 -0.118000   0.4634   5.2194   
6   2019-01-11 15:08:06.400 -0.048667  0.790000 -0.145333  21.6950   8.1708   
7   2019-01-11 15:08:06.600 -0.170000  0.899500 -0.250000  17.5246   1.5976   
8   2019-01-11 15:08:06.800 -0.222667  0.907000 -0.204333  -7.2318  -1.3536   
9   2019-01-11 15:08:07.000 -0.204500  0.930000 -0.149000 -28.6830 -10.2076   
10  2019-01-11 15:08:07.200 -0.205000  1.404667 -0.095000  -4.1098  -9.3172   
11  2019-01-11 15:08:07.400 -0.163500  0.996000 -0.1