# 遺漏值處理

In [96]:
import pandas as pd
from io import StringIO
# Small in-memory CSV with two deliberately missing cells:
# column C is empty on the second data row, column D on the third.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])
# StringIO lets read_csv treat the string as if it were a file.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [97]:
# Three quick views of the missing data, each printed with a trailing blank line:
#   1. isnull().sum()    -> count of missing values per column
#   2. dropna(axis=0)    -> drop every row that contains a missing value
#                           (other options: how='all', thresh=4, subset=['C'];
#                           dropping data is generally not recommended)
#   3. fillna(df.mean()) -> quick fix: fill each gap with its column mean
for view in (df.isnull().sum(), df.dropna(axis=0), df.fillna(df.mean())):
    print(view, '\n')

A    0
B    0
C    1
D    1
dtype: int64 

     A    B    C    D
0  1.0  2.0  3.0  4.0 

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   7.5  8.0
2  10.0  11.0  12.0  6.0 



In [98]:
from sklearn.impute import SimpleImputer
import numpy as np
# Mean imputation with scikit-learn: NaN entries are replaced by the column mean.
# (strategy='most_frequent' is another available option.)
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
# Learn the column statistics and fill the gaps on the same array in one call.
df_fill = imr.fit_transform(df.values)
df_fill

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [99]:
import pandas as pd 
# Toy frame for the categorical-encoding examples, built column by column:
# a nominal feature (color), an ordinal feature (size), a numeric feature
# (price) and the class label.
# NOTE(review): 'bule' looks like a typo for 'blue'; it is kept as-is because
# the recorded outputs in this notebook show that spelling.
df = pd.DataFrame({
    'color': ['green', 'red', 'bule'],
    'size': ['M', 'L', 'XL'],
    'price': [100, 130, 150],
    'classlabel': ['class2', 'class1', 'class2'],
})
df

Unnamed: 0,color,size,price,classlabel
0,green,M,100,class2
1,red,L,130,class1
2,bule,XL,150,class2


In [100]:
# Ordinal feature encoding: map each size label to an integer so that the
# order M < L < XL is preserved.
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1
}

# Each integer code is also mapped to itself, so re-running this cell on the
# already-encoded column keeps 1/2/3. A plain .map(size_mapping) has no entry
# for the integers and would turn the whole column into NaN on a second run.
# Labels that are neither a known size nor a known code still become NaN.
df['size'] = df['size'].map(
    {**size_mapping, **{code: code for code in size_mapping.values()}}
)
print(df)

# To convert the codes back to the original strings, invert the mapping:
#   inv_size_mapping = {v: k for k, v in size_mapping.items()}
#   df['size'] = df['size'].map(inv_size_mapping)
#   print(df)

   color  size  price classlabel
0  green     1    100     class2
1    red     2    130     class1
2   bule     3    150     class2


In [101]:
#類別標籤編碼
import numpy as np

# Build a label -> integer code dictionary from the unique class labels
# (np.unique returns them sorted, so 'class1' -> 0, 'class2' -> 1).
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
print(class_mapping)
df['classlabel'] = df['classlabel'].map(class_mapping)

# Convert the integer codes back to the original string labels by inverting
# the dictionary.
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)

# A bare expression is only rendered when it is the last statement of a cell,
# so the frame is displayed here, after the labels have been restored.
df

{'class1': 0, 'class2': 1}


Unnamed: 0,color,size,price,classlabel
0,green,1,100,class2
1,red,2,130,class1
2,bule,3,150,class2


In [106]:
from sklearn.preprocessing import LabelEncoder
# Same label encoding done with scikit-learn: learn the classes first,
# then turn the labels into integer codes.
class_le = LabelEncoder().fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
# First line: the integer codes; second line: the labels recovered from them.
print(y, class_le.inverse_transform(y), sep='\n')

[1 0 1]
['class2' 'class1' 'class2']


# OneHot

In [120]:
#OneHot
from sklearn.preprocessing import OneHotEncoder
# Feature matrix with the columns color, size, price (in that order).
X = df.loc[:, ['color', 'size', 'price']].values
color_oht = OneHotEncoder()
# Encode only column 0 (color), passed as an (n, 1) array; the encoder's
# result is converted to a dense array for display.
color_oht.fit_transform(X[:, [0]]).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [136]:
#OneHot 轉換多個特徵
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 (dropping its first category) and pass
# columns 1 and 2 through unchanged, all in a single transformer.
onehot = OneHotEncoder(drop='first', categories='auto')
c_transf = ColumnTransformer(
    transformers=[
        ('onehot', onehot, [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)
c_transf.fit_transform(X).astype(float)

array([[  1.,   0.,   1., 100.],
       [  0.,   1.,   2., 130.],
       [  0.,   0.,   3., 150.]])

In [128]:
# pandas shortcut: get_dummies expands the string column (color) into
# indicator columns and leaves the numeric columns (size, price) as they are.
pd.get_dummies(df.loc[:, ['color', 'size', 'price']])

Unnamed: 0,size,price,color_bule,color_green,color_red
0,1,100,0,1,0
1,2,130,0,0,1
2,3,150,1,0,0
