In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder

In [3]:
data = pd.read_csv('data/penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
print(data.isnull().sum())

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64


In [6]:
data = data.dropna()
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [7]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            334 non-null    object 
 1   island             334 non-null    object 
 2   culmen_length_mm   334 non-null    float64
 3   culmen_depth_mm    334 non-null    float64
 4   flipper_length_mm  334 non-null    float64
 5   body_mass_g        334 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.9+ KB


In [9]:
data = pd.read_csv('./data/penguins_size.csv')
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [13]:
mean_impute_strategy = SimpleImputer(strategy="mean")
median_impute_strategy = SimpleImputer(strategy="median")
mode_impute_strategy = SimpleImputer(strategy="most_frequent")

data['culmen_length_mm'] = mean_impute_strategy.fit_transform(data[['culmen_length_mm']])
data['culmen_depth_mm'] = median_impute_strategy.fit_transform(data[['culmen_depth_mm']])
data['flipper_length_mm'] = mean_impute_strategy.fit_transform(data[['flipper_length_mm']])
data['body_mass_g'] = mean_impute_strategy.fit_transform(data[['body_mass_g']])
data['sex'] = mode_impute_strategy.fit_transform(data[['sex']])

In [14]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   344 non-null    float64
 3   culmen_depth_mm    344 non-null    float64
 4   flipper_length_mm  344 non-null    float64
 5   body_mass_g        344 non-null    float64
 6   sex                344 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [16]:
data["sex"].value_counts()

MALE      178
FEMALE    165
.           1
Name: sex, dtype: int64

In [17]:
data.loc[~data["sex"].isin(["MALE", "FEMALE"])]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [31]:
data = data.loc[data["sex"].isin(["MALE", "FEMALE"])].copy()

In [32]:
data["sex"].value_counts()

MALE      178
FEMALE    165
Name: sex, dtype: int64

In [33]:
data["species"] = data["species"].astype('category')
data["island"] = data["island"].astype('category')
data["sex"] = data["sex"].astype('category')
data.dtypes

species              category
island               category
culmen_length_mm      float64
culmen_depth_mm       float64
flipper_length_mm     float64
body_mass_g           float64
sex                  category
dtype: object

In [34]:
categorical_data = data.drop(['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm','body_mass_g', "species"], axis=1)
categorical_data.head()

Unnamed: 0,island,sex
0,Torgersen,MALE
1,Torgersen,FEMALE
2,Torgersen,FEMALE
3,Torgersen,MALE
4,Torgersen,FEMALE


In [35]:
le = LabelEncoder()
data["species"] = le.fit_transform(data["species"])
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,0,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,0,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,0,Torgersen,43.92193,17.3,200.915205,4201.754386,MALE
4,0,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [36]:
data["species"].value_counts()

0    152
2    123
1     68
Name: species, dtype: int64

In [39]:
ohe = OneHotEncoder(sparse=False, drop="first")
cat_encoded = ohe.fit_transform(categorical_data)

In [40]:
cat_df = pd.DataFrame(cat_encoded, columns=ohe.get_feature_names_out())
cat_df

Unnamed: 0,island_Dream,island_Torgersen,sex_MALE
0,0.0,1.0,1.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,1.0
4,0.0,1.0,0.0
...,...,...,...
338,0.0,0.0,1.0
339,0.0,0.0,0.0
340,0.0,0.0,1.0
341,0.0,0.0,0.0


In [43]:
scaled_data = data[['body_mass_g']].copy()

print('Mean:', scaled_data['body_mass_g'].mean())
print('Standard Deviation:', scaled_data['body_mass_g'].std())

Mean: 4199.791570763644
Standard Deviation: 799.9508688401579


In [44]:
standard_scaler = StandardScaler()
scaled_data['body_mass_scaled'] = standard_scaler.fit_transform(scaled_data[['body_mass_g']])

print('Mean:', scaled_data['body_mass_scaled'].mean())
print('Standard Deviation:', scaled_data['body_mass_scaled'].std())
scaled_data['body_mass_scaled']

Mean: -1.6313481178165566e-16
Standard Deviation: 1.0014609211587777


0     -0.563095
1     -0.500500
2     -1.189047
3      0.002457
4     -0.938666
         ...   
339    0.002457
340    0.813998
341    1.940711
342    1.252164
343    1.502545
Name: body_mass_scaled, Length: 343, dtype: float64

In [45]:
scaled_data

Unnamed: 0,body_mass_g,body_mass_scaled
0,3750.000000,-0.563095
1,3800.000000,-0.500500
2,3250.000000,-1.189047
3,4201.754386,0.002457
4,3450.000000,-0.938666
...,...,...
339,4201.754386,0.002457
340,4850.000000,0.813998
341,5750.000000,1.940711
342,5200.000000,1.252164


In [46]:
minmax_scaler = MinMaxScaler()
scaled_data['body_mass_min_max_scaled'] = minmax_scaler.fit_transform(scaled_data[['body_mass_g']])

print('Mean:', scaled_data['body_mass_min_max_scaled'].mean())
print('Standard Deviation:', scaled_data['body_mass_min_max_scaled'].std())

Mean: 0.4166087696565679
Standard Deviation: 0.2222085746778217


In [47]:
scaled_data

Unnamed: 0,body_mass_g,body_mass_scaled,body_mass_min_max_scaled
0,3750.000000,-0.563095,0.291667
1,3800.000000,-0.500500,0.305556
2,3250.000000,-1.189047,0.152778
3,4201.754386,0.002457,0.417154
4,3450.000000,-0.938666,0.208333
...,...,...,...
339,4201.754386,0.002457,0.417154
340,4850.000000,0.813998,0.597222
341,5750.000000,1.940711,0.847222
342,5200.000000,1.252164,0.694444
