In [1]:
import pandas as pd
import numpy as np

In [2]:
dataframe= pd.read_csv('ML_assignment02_dataset_2021.csv')
dataframe.head()

Unnamed: 0,Colour,Radius (cm),Weight (grams),Fruit (class)
0,Green,4.3,122,Pear
1,Green,4.6,152,Pear
2,Green,3.1,85,Apple
3,Green,3.6,173,Pear
4,Green,2.5,65,Lemon


In [3]:
def del_duplicates(dataframe):
    
    df=dataframe.drop_duplicates(keep='first')
    print('DF after dropping duplicates : ',df)
    
    #writing to csv 
    df.to_csv('Output_1.csv')
    
    return df

In [4]:
def class_mean(dataframe: pd.DataFrame, col: str, label: str) -> float:
    return round(dataframe.loc[(dataframe['Fruit (class)'] == label) & (dataframe[col].notna()), col].mean(), 1)

def fill_values(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    This function replaces 0 values with the mean of the corresponding class of the attribute.
    """
    dataframe.replace(0, np.NaN, inplace=True)
    
    cols = dataframe.columns[dataframe.isna().any()].tolist()
    labels = set(dataframe['Fruit (class)'].tolist())
    
    for col in cols:
        for label in labels:
            dataframe.loc[(dataframe[col].isna()) & (dataframe['Fruit (class)'] == label), col] = dataframe.apply(lambda x: class_mean(dataframe, col, label), axis=1)
    
    print('Dataset after filling values ', dataframe)
    dataframe.to_csv('Output_2.csv')
    
    return dataframe

In [5]:
def transform_nominal(dataframe):
    df_nominal = pd.get_dummies(dataframe, columns=["Colour"], drop_first=True)
    print("Dataset after transforming nominal attributes:")
    print(df_nominal)
    df_nominal.to_csv("Output_3.csv", index=False)
    return df_nominal

In [6]:
def normalise_minmax(dataframe):
    columns= dataframe.select_dtypes(include=np.number).columns.to_list()
    for col in columns: 
        dataframe[col] = (dataframe[col] - dataframe[col].min()) / (dataframe[col].max() - dataframe[col].min())
    print('Dataset after min-max normalization:')
    print(dataframe)
    dataframe.to_csv('Output_4.csv', index=False)
    return dataframe

In [7]:
dataframe=del_duplicates(dataframe)

DF after dropping duplicates :      Colour  Radius (cm)  Weight (grams) Fruit (class)
0    Green          4.3             122          Pear
1    Green          4.6             152          Pear
2    Green          3.1              85         Apple
3    Green          3.6             173          Pear
4    Green          2.5              65         Lemon
5    Green          2.5              70         Apple
6    Green          2.7              73         Apple
7    Green          4.5             110          Pear
8    Green          2.5              86         Apple
9    Green          2.4              68         Lemon
10   Green          4.2             126          Pear
11     Red          3.7             101         Apple
12     Red          3.7             100         Apple
13     Red          3.4              80         Apple
14  Yellow          3.2              79         Apple
15  Yellow          3.1              69         Lemon
16  Yellow          2.1              64         Ap

In [8]:
df=fill_values(dataframe)

Dataset after filling values      Colour  Radius (cm)  Weight (grams) Fruit (class)
0    Green          4.3           122.0          Pear
1    Green          4.6           152.0          Pear
2    Green          3.1            85.0         Apple
3    Green          3.6           173.0          Pear
4    Green          2.5            65.0         Lemon
5    Green          2.5            70.0         Apple
6    Green          2.7            73.0         Apple
7    Green          4.5           110.0          Pear
8    Green          2.5            86.0         Apple
9    Green          2.4            68.0         Lemon
10   Green          4.2           126.0          Pear
11     Red          3.7           101.0         Apple
12     Red          3.7           100.0         Apple
13     Red          3.4            80.0         Apple
14  Yellow          3.2            79.0         Apple
15  Yellow          3.1            69.0         Lemon
16  Yellow          2.1            64.0         Appl

In [9]:
dataframe = transform_nominal(dataframe)

Dataset after transforming nominal attributes:
    Radius (cm)  Weight (grams) Fruit (class)  Colour_Red  Colour_Yellow
0           4.3           122.0          Pear           0              0
1           4.6           152.0          Pear           0              0
2           3.1            85.0         Apple           0              0
3           3.6           173.0          Pear           0              0
4           2.5            65.0         Lemon           0              0
5           2.5            70.0         Apple           0              0
6           2.7            73.0         Apple           0              0
7           4.5           110.0          Pear           0              0
8           2.5            86.0         Apple           0              0
9           2.4            68.0         Lemon           0              0
10          4.2           126.0          Pear           0              0
11          3.7           101.0         Apple           1              0
12  

In [10]:
dataframe=normalise_minmax(dataframe)
dataframe

Dataset after min-max normalization:
    Radius (cm)  Weight (grams) Fruit (class)  Colour_Red  Colour_Yellow
0      0.709677        0.616541          Pear         0.0            0.0
1      0.806452        0.842105          Pear         0.0            0.0
2      0.322581        0.338346         Apple         0.0            0.0
3      0.483871        1.000000          Pear         0.0            0.0
4      0.129032        0.187970         Lemon         0.0            0.0
5      0.129032        0.225564         Apple         0.0            0.0
6      0.193548        0.248120         Apple         0.0            0.0
7      0.774194        0.526316          Pear         0.0            0.0
8      0.129032        0.345865         Apple         0.0            0.0
9      0.096774        0.210526         Lemon         0.0            0.0
10     0.677419        0.646617          Pear         0.0            0.0
11     0.516129        0.458647         Apple         1.0            0.0
12     0.51612

Unnamed: 0,Radius (cm),Weight (grams),Fruit (class),Colour_Red,Colour_Yellow
0,0.709677,0.616541,Pear,0.0,0.0
1,0.806452,0.842105,Pear,0.0,0.0
2,0.322581,0.338346,Apple,0.0,0.0
3,0.483871,1.0,Pear,0.0,0.0
4,0.129032,0.18797,Lemon,0.0,0.0
5,0.129032,0.225564,Apple,0.0,0.0
6,0.193548,0.24812,Apple,0.0,0.0
7,0.774194,0.526316,Pear,0.0,0.0
8,0.129032,0.345865,Apple,0.0,0.0
9,0.096774,0.210526,Lemon,0.0,0.0


In [11]:
print("min :",dataframe['Radius (cm)'].min())
print("max :",dataframe['Radius (cm)'].max())
print(4.3-2.1/5.2-2.1)

min : 0.0
max : 1.0
1.796153846153846
