<a href="https://colab.research.google.com/github/strzelnat/machine_learning_study/blob/main/supervised/basics/02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit-learn



In [2]:
import numpy as np
import pandas as pd
import sklearn

In [3]:
data = {
    'size': ['XL', 'L', 'M', np.nan, 'M', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red', 'green'],
    'gender': ['female', 'male', np.nan, 'female', 'female', 'male'],
    'price': [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    'weight': [500, 450, 300, np.nan, 410, np.nan],
    'bought': ['yes', 'no', 'yes', 'no', 'yes', 'no']
}

In [5]:
df_raw = pd.DataFrame(data = data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [6]:
df = df_raw.copy()

In [7]:
df.isnull().sum()

Unnamed: 0,0
size,1
color,0
gender,1
price,1
weight,2
bought,0


In [8]:
df.isnull().sum().sum()

np.int64(5)

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
imputer = SimpleImputer(missing_values=np.nan, strategy = 'mean')

In [14]:
imputer.fit_transform(df[['weight']])

array([[500.],
       [450.],
       [300.],
       [415.],
       [410.],
       [415.]])

In [17]:
df['weight'] = imputer.fit_transform(df[['weight']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [20]:
imputer2 = SimpleImputer(missing_values=np.nan,strategy = 'constant', fill_value = 99.8)
df['price'] = imputer.fit_transform(df[['price']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,117.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [24]:
imputer3 = SimpleImputer(missing_values=np.nan,strategy = 'constant', fill_value = 'L')
df['size'] = imputer3.fit_transform(df[['size']]).ravel()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,117.0,300.0,yes
3,L,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [28]:
df['gender'].fillna(value = 'male', inplace = True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gender'].fillna(value = 'male', inplace = True)


Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,117.0,300.0,yes
3,L,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [29]:
df.isnull().sum().sum()

np.int64(0)

In [32]:
df[~df.notnull()]

Unnamed: 0,size,color,gender,price,weight,bought
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
5,,,,,,


In [35]:
df[~df['bought'].notnull()]

Unnamed: 0,size,color,gender,price,weight,bought
