In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer, LabelEncoder, OneHotEncoder, OrdinalEncoder

In [2]:
data = pd.read_csv('data/penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [4]:
print(data.isnull().sum())

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64


In [5]:
mean_impute_strategy = SimpleImputer()
median_impute_strategy = SimpleImputer(strategy="median")
mode_impute_strategy = SimpleImputer(strategy="most_frequent")

In [6]:
data.dtypes

species               object
island                object
culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [8]:
data['culmen_length_mm'] = mean_impute_strategy.fit_transform(data['culmen_length_mm'].to_numpy().reshape(-1,1))
data['culmen_depth_mm'] = median_impute_strategy.fit_transform(data['culmen_depth_mm'].to_numpy().reshape(-1,1))
data['flipper_length_mm'] = mean_impute_strategy.fit_transform(data['flipper_length_mm'].to_numpy().reshape(-1,1))
data['body_mass_g'] = mean_impute_strategy.fit_transform(data['body_mass_g'].to_numpy().reshape(-1,1))
data['sex'] = mode_impute_strategy.fit_transform(data['sex'].to_numpy().reshape(-1,1))

In [9]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [10]:
data['sex'].value_counts()

MALE      178
FEMALE    165
.           1
Name: sex, dtype: int64

In [13]:
data.loc[~data['sex'].isin(['FEMALE', 'MALE'])]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [20]:
data = data.loc[data['sex'].isin(['FEMALE', 'MALE'])]

In [22]:
data.loc[~data['sex'].isin(['FEMALE', 'MALE'])]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex


In [23]:
data.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [24]:
data.dtypes

species               object
island                object
culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object

In [25]:
data["species"] = data["species"].astype('category')
data["island"] = data["island"].astype('category')
data["sex"] = data["sex"].astype('category')

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 343 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            343 non-null    category
 1   island             343 non-null    category
 2   culmen_length_mm   343 non-null    float64 
 3   culmen_depth_mm    343 non-null    float64 
 4   flipper_length_mm  343 non-null    float64 
 5   body_mass_g        343 non-null    float64 
 6   sex                343 non-null    category
dtypes: category(3), float64(4)
memory usage: 14.8 KB


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 343 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            343 non-null    category
 1   island             343 non-null    category
 2   culmen_length_mm   343 non-null    float64 
 3   culmen_depth_mm    343 non-null    float64 
 4   flipper_length_mm  343 non-null    float64 
 5   body_mass_g        343 non-null    float64 
 6   sex                343 non-null    category
dtypes: category(3), float64(4)
memory usage: 14.8 KB


In [28]:
categorical_data = data.drop(['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm','body_mass_g', "species"], axis=1)
categorical_data.head()

Unnamed: 0,island,sex
0,Torgersen,MALE
1,Torgersen,FEMALE
2,Torgersen,FEMALE
3,Torgersen,MALE
4,Torgersen,FEMALE


In [30]:
data["species"].value_counts()

Adelie       152
Gentoo       123
Chinstrap     68
Name: species, dtype: int64

In [31]:
le = LabelEncoder()
data["species"] = le.fit_transform(data["species"])
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,0,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,0,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,0,Torgersen,43.92193,17.3,200.915205,4201.754386,MALE
4,0,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [32]:
ohe = OneHotEncoder(sparse=False, drop="first")
cat_encoded = ohe.fit_transform(categorical_data)
cat_df = pd.DataFrame(cat_encoded, columns=ohe.get_feature_names_out())
cat_df

Unnamed: 0,island_Dream,island_Torgersen,sex_MALE
0,0.0,1.0,1.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,1.0
4,0.0,1.0,0.0
...,...,...,...
338,0.0,0.0,1.0
339,0.0,0.0,0.0
340,0.0,0.0,1.0
341,0.0,0.0,0.0


In [36]:
bin_data = data[['culmen_length_mm']].copy()
bin_data['culmen_length_bin'] = pd.cut(data['culmen_length_mm'], bins=[0, 50, 60, 100],
                                       labels=["Low", "Mid", "High"])
bin_data

Unnamed: 0,culmen_length_mm,culmen_length_bin
0,39.10000,Low
1,39.50000,Low
2,40.30000,Low
3,43.92193,Low
4,36.70000,Low
...,...,...
339,43.92193,Low
340,46.80000,Low
341,50.40000,Mid
342,45.20000,Low


In [37]:
bin_data['culmen_length_bin'].value_counts()

Low     291
Mid      52
High      0
Name: culmen_length_bin, dtype: int64

In [39]:
scaled_data = data[['body_mass_g']]

print('Mean:', scaled_data['body_mass_g'].mean())
print('Standard Deviation:', scaled_data['body_mass_g'].std())

Mean: 4199.791570763644
Standard Deviation: 799.9508688401579


In [40]:
standard_scaler = StandardScaler()
scaled_data['body_mass_scaled'] = standard_scaler.fit_transform(scaled_data[['body_mass_g']])

print('Mean:', scaled_data['body_mass_scaled'].mean())
print('Standard Deviation:', scaled_data['body_mass_scaled'].std())
scaled_data['body_mass_scaled']

Mean: -1.6313481178165566e-16
Standard Deviation: 1.0014609211587777


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scaled_data['body_mass_scaled'] = standard_scaler.fit_transform(scaled_data[['body_mass_g']])


0     -0.563095
1     -0.500500
2     -1.189047
3      0.002457
4     -0.938666
         ...   
339    0.002457
340    0.813998
341    1.940711
342    1.252164
343    1.502545
Name: body_mass_scaled, Length: 343, dtype: float64

In [41]:
scaled_data['body_mass_g'].max()

6300.0

In [43]:
scaled_data['body_mass_scaled'].max()

2.629257308286938

In [44]:
minmax_scaler = MinMaxScaler()
scaled_data['body_mass_min_max_scaled'] = minmax_scaler.fit_transform(scaled_data[['body_mass_g']])

print('Mean:', scaled_data['body_mass_min_max_scaled'].mean())
print('Standard Deviation:', scaled_data['body_mass_min_max_scaled'].std())

Mean: 0.4166087696565679
Standard Deviation: 0.2222085746778217


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scaled_data['body_mass_min_max_scaled'] = minmax_scaler.fit_transform(scaled_data[['body_mass_g']])
