In [74]:
import pandas as pd
import numpy as np
from io import StringIO

# Small in-memory CSV used as a toy dataset. Two fields are left empty on
# purpose (column C in the second data row, column D in the third) so that
# pandas parses them as missing values.
csv_data = """A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,"""

# read_csv expects a path or file-like object, so wrap the text in a buffer.
with StringIO(csv_data) as csv_buffer:
    df = pd.read_csv(csv_buffer)

In [75]:
# Show the parsed frame; the two empty CSV fields show up as missing values.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [76]:
# Count the missing values per column (isna is the same check as isnull).
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [77]:
# Although scikit-learn was developed for working with NumPy arrays, it can
# sometimes be more convenient to preprocess data using pandas' DataFrame.
# We can always access the underlying NumPy array of the DataFrame via the
# `values` attribute before we feed it into a scikit-learn estimator:
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [78]:
# Row-wise deletion: discard every row that contains at least one NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [79]:
# Column-wise deletion: discard every column that contains at least one NaN.
df.dropna(axis='columns', how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [80]:
# Only drop rows in which every column is NaN; no row of df is entirely
# empty, so the frame comes back unchanged.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [81]:
# Keep only the rows that have at least 4 non-NaN values.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [82]:
# Drop only the rows whose entry in column 'C' is NaN; missing values in
# the other columns are ignored.
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [83]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer

# Mean imputation: every NaN is replaced by the mean of its column.
# SimpleImputer always works column-wise, so the old `axis=0` argument is gone.
# Other options for the strategy parameter are 'median' or 'most_frequent'.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit and transform on the same NumPy array (the original fitted on the
# DataFrame but transformed its .values).
imp = imp.fit(df.values)
imputed_data = imp.transform(df.values)

In [84]:
# Inspect the imputed array: each former NaN now holds the mean of its column.
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [85]:
# Toy frame with one string-valued color column, one string-valued size
# column, a numeric price column and a string-valued class label.
dff = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

In [86]:
# Show the frame as constructed, before any column is encoded.
dff

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [87]:
# Ordinal encoding for the 'size' feature: XL > L > M is expressed as 3 > 2 > 1.
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1,
}

# Series.map(dict) turns every value that is not a key of the dict into NaN,
# so re-running this cell on the already-encoded column would wipe it out.
# Falling back to the current value keeps the cell safe to run repeatedly.
dff['size'] = dff['size'].map(lambda size: size_mapping.get(size, size))

In [88]:
# 'size' now holds the integer codes taken from size_mapping.
dff

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [89]:
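# Optional: restore the original size labels by inverting size_mapping.
# Left disabled so that the integer encoding is kept for the cells below.
#inv_size_mapping = {v: k for k, v in size_mapping.items()}
#dff['size']=dff['size'].map(inv_size_mapping)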

In [90]:
# Same content as the previous display: the inverse size mapping in the
# cell above is commented out, so nothing has changed.
dff

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [91]:
# Lookup table from class-label string to integer code.
# Data-driven alternative, kept for reference:
#class_mapping={label:idx for idx ,label in enumerate(np.unique(dff['classlabel']))}
class_mapping = {
    label: code
    for code, label in enumerate(['class1', 'class2'])
}

In [92]:
# Show the label -> integer lookup table.
class_mapping

{'class1': 0, 'class2': 1}

In [93]:
# We can reverse the key-value pairs in the mapping dictionary as follows to
# map the converted class labels back to the original string representation:
#   inv_class_mapping = {v: k for k, v in class_mapping.items()}
#   dff['classlabel'] = dff['classlabel'].map(inv_class_mapping)
#   dff

# Encode the class labels as integers. Series.map(dict) would turn every
# value that is not a key of class_mapping into NaN, which destroys the column
# if this cell is run a second time; falling back to the current value keeps
# the cell safe to re-run.
dff['classlabel'] = dff['classlabel'].map(
    lambda label: class_mapping.get(label, label)
)


In [94]:
# 'classlabel' now holds the integer codes taken from class_mapping.
dff

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [108]:
from sklearn.preprocessing import LabelEncoder

# NOTE: dff['classlabel'] was already converted to integers by the manual
# class_mapping step above, so the encoder is fitted on integers and
# inverse_transform hands those integers back rather than the label strings.
class_le = LabelEncoder().fit(dff['classlabel'].values)
y = class_le.transform(dff['classlabel'].values)
class_le.inverse_transform(y)

  if diff:


array([0, 1, 0])

In [109]:
# Pull the three feature columns out as a NumPy array.
X = dff[['color', 'size', 'price']].values

# Replace the color strings in column 0 with integer codes.
color_le = LabelEncoder().fit(X[:, 0])
X[:, 0] = color_le.transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [110]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22; choosing which columns to encode is
# now done with a ColumnTransformer.
ohe = OneHotEncoder()
color_ct = ColumnTransformer(
    [('color_onehot', ohe, [0])],  # one-hot encode column 0 (the color codes)
    remainder='passthrough',       # append the remaining columns untouched
    sparse_threshold=0,            # always return a dense array
)

# The encoded columns come first, followed by the passthrough columns.
# X is an object array, so cast the stacked result to float for display.
color_ct.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [111]:
# One-hot encode via pandas: the string-valued 'color' column is expanded
# into indicator columns, the numeric columns are carried over as they are.
dff1 = pd.get_dummies(
    dff.loc[:, ['price', 'color', 'size', 'classlabel']]
)

In [112]:
# 'color' has been replaced by the color_blue / color_green / color_red
# indicator columns; price, size and classlabel are unchanged.
dff1

Unnamed: 0,price,size,classlabel,color_blue,color_green,color_red
0,10.1,1,0,0,1,0
1,13.5,2,1,0,0,1
2,15.3,3,0,1,0,0
