In [1]:
import pandas as pd
from io import StringIO

In [2]:
csv_data = '''
A, B, C, D
1.0, 2.0, 3.0, 4.0
5.0, 6.0,, 8.0
9.0, 10.0, 11.0,
13.0, 14.0, 15.0, 16.0
'''

In [3]:
df = pd.read_csv(StringIO(csv_data))

In [4]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True
3,False,False,False,False


In [5]:
df.isnull().sum(axis=0)

A     0
 B    0
 C    1
 D    1
dtype: int64

In [6]:
df.isnull().sum(axis=1) #cross 행

0    0
1    1
2    1
3    0
dtype: int64

### Eliminating samples of features with missing values

In [7]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


In [8]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,9.0,10.0
3,13.0,14.0


In [9]:
#remove rows where all columns are NaN
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,
3,13.0,14.0,15.0,16.0


In [10]:
#remove rows that have less than 4 real values
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,13.0,14.0,15.0,16.0


In [12]:
#only remove rows where NaN appear in specific columns
df.dropna(subset=['C'])

KeyError: ['C']

### Imputing missing values

In [14]:
from sklearn.preprocessing import Imputer

In [15]:
#axis =0 calculates the column mean
imr = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)



In [16]:
imr = imr.fit(df.values)

In [17]:
imputed_data = imr.transform(df.values)

In [18]:
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  9.66666667,  8.        ],
       [ 9.        , 10.        , 11.        ,  9.33333333],
       [13.        , 14.        , 15.        , 16.        ]])

### Mapping ordinal features

In [20]:
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1']
])

In [21]:
df.columns = ['color', 'size', 'price', 'label']

In [22]:
df

Unnamed: 0,color,size,price,label
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [23]:
size_mapping = {
    'XL':3,
    'L':2,
    'M':1
}

In [24]:
df['size'] = df['size'].map(size_mapping)

In [25]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [26]:
inv_size_mapping = {v: k for k, v in size_mapping.items()}

In [27]:
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

### Encoding class labels

In [28]:
import numpy as np

In [29]:
unique_labels = np.unique(df['label'])

In [30]:
class_mapping = {label: idx for idx, label in enumerate(unique_labels)}

In [31]:
class_mapping

{'class1': 0, 'class2': 1}

In [32]:
df['label'] = df['label'].map(class_mapping)

In [33]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [34]:
inv_class_mapping = {v:k for k,v in class_mapping.items()}

In [35]:
df['label'].map(inv_class_mapping)

0    class1
1    class2
2    class1
Name: label, dtype: object

In [36]:
#sklearn LabelEncoder

In [37]:
from sklearn.preprocessing import LabelEncoder

In [38]:
class_le = LabelEncoder()

In [39]:
#fit_transform method is a shortcut
y = class_le.fit_transform(df['label'].values)

In [40]:
y

array([0, 1, 0])

### Performing one-hot encoding on nominal features

In [41]:
from sklearn.preprocessing import OneHotEncoder

In [42]:
df

Unnamed: 0,color,size,price,label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [44]:
X=df[['color','size','price']].values
X

array([['green', 1, 10.1],
       ['red', 2, 13.5],
       ['blue', 3, 15.3]], dtype=object)

In [45]:
color_le = LabelEncoder()

In [47]:
X[:,0] = color_le.fit_transform(X[:,0]) ### 열의 인덱스가 0인 값 추출
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [49]:
from sklearn.preprocessing import OneHotEncoder

In [51]:
ohe = OneHotEncoder(categorical_features=[0])

In [52]:
ohe.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [53]:
pd.get_dummies(df[['color','size','price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


In [54]:
#multicollinearity
pd.get_dummies(df[['color','size','price']], drop_first = True)

Unnamed: 0,size,price,color_green,color_red
0,1,10.1,1,0
1,2,13.5,0,1
2,3,15.3,0,0
