### Identifying missing values in tabular data

Load the CSV file into a `pandas.DataFrame`.

In [32]:
import pandas as pd 
from io import StringIO

# Location of the CSV file to load (a path, not the CSV text itself).
csv_data = "csv-file"

# Parse the file into a DataFrame and show the parsed table.
df = pd.read_csv(filepath_or_buffer=csv_data)
print(df)

         A     B     C    D
type1  1.0   2.0   3.0  4.0
type2  5.0   6.0   NaN  8.0
type3  9.0  10.0  11.0  NaN


##### Omit invalid data

##### Impute missing values

Common imputation strategies replace each missing value with the mean, the median, or the most frequent value of its column.

In [33]:
from sklearn.impute import SimpleImputer
import numpy as np
# Replace every NaN with the mean of its column.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit and transform on the same plain array so the imputer sees one
# consistent input (fitting on an array and transforming the DataFrame
# makes scikit-learn warn about feature names). The result goes into a
# new name: `df` stays a DataFrame, so this cell can be re-run safely.
imputed_data = imr.fit_transform(df.values)

print(imputed_data)

[[ 1.  2.  3.  4.]
 [ 5.  6.  7.  8.]
 [ 9. 10. 11.  6.]]




## Handle categorical data

Transform categories into numbers.
* Use a plain Python dictionary (when the mapping is simple enough to write by hand)
* Call `LabelEncoder()`

In [34]:
# Build the toy data set inline instead of reading it from a file:
# one row per jersey, with a nominal, an ordinal and a numeric feature
# plus the class label.
jersey_df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
print(jersey_df)

   color size  price classlabel
0  green    M   10.1     class1
1    red    L   13.5     class2
2   blue   XL   15.3     class1


In [35]:
# Ordinal encoding for `size`: the integer codes keep the order M < L < XL.
size_mapping = dict(M=1, L=2, XL=3)
# Reverse lookup (integer code -> size label) used to undo the encoding.
inv_size_mapping = {code: label for label, code in size_mapping.items()}

print(jersey_df['size'], end='\n\n')

# Round trip: encode the column, show it, decode it again, show it.
jersey_df['size'] = jersey_df['size'].map(size_mapping)
print(jersey_df, end='\n\n')

jersey_df['size'] = jersey_df['size'].map(inv_size_mapping)
print(jersey_df)

# Leave the column in its encoded (integer) form for the cells below.
# NOTE: this cell expects the raw string labels; running it again on the
# already-encoded column maps every value to NaN.
jersey_df['size'] = jersey_df['size'].map(size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

   color  size  price classlabel
0  green     1   10.1     class1
1    red     2   13.5     class2
2   blue     3   15.3     class1

   color size  price classlabel
0  green    M   10.1     class1
1    red    L   13.5     class2
2   blue   XL   15.3     class1


In [36]:
# Derive the label -> integer lookup from the data instead of typing it
# out by hand; codes are handed out in order of first appearance.
distinct_labels = jersey_df['classlabel'].unique()
class_mapping = dict(zip(distinct_labels, range(len(distinct_labels))))
print(class_mapping)

# Swap the string class labels for their integer codes.
jersey_df['classlabel'] = jersey_df['classlabel'].map(class_mapping)
print(jersey_df)

{'class1': 0, 'class2': 1}
   color  size  price  classlabel
0  green     1   10.1           0
1    red     2   13.5           1
2   blue     3   15.3           0


In [43]:
# One-hot encoding is meant for nominal features, i.e. categories that
# have no order (red is neither greater nor smaller than green).
# First look at what a plain integer encoding of `color` produces.

from sklearn.preprocessing import LabelEncoder

# Feature matrix (everything except the class label) as a NumPy array.
X = jersey_df.loc[:, ['color', 'size', 'price']].values
print(X)

color_le = LabelEncoder()
color_le.fit_transform(jersey_df.loc[:, 'color'].values)

# The call above evaluates to array([1, 2, 0]) for green, red, blue.
# Integer codes like these suggest an ordering that colors do not have,
# which is why a one-hot encoder is the better choice here.

[['green' 1 10.1]
 ['red' 2 13.5]
 ['blue' 3 15.3]]


array([1, 2, 0])

In [64]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal `color` feature. The double brackets select
# a one-column DataFrame, i.e. the 2-D input the encoder works on.
ohe = OneHotEncoder()

# Keep the encoded matrix instead of discarding it. With default settings
# the encoder returns a SciPy sparse matrix, so convert it to a dense
# array that is easy to inspect (one column per entry of categories_).
color_onehot = ohe.fit_transform(jersey_df[['color']]).toarray()

print(ohe.categories_)
print(ohe.get_feature_names_out())

[array(['blue', 'green', 'red'], dtype=object)]
['color_blue' 'color_green' 'color_red']


In [67]:
# Dummy-encode the selected features with pandas. drop_first=True drops
# the first dummy column of each encoded feature; that column is implied
# by the remaining ones, so dropping it avoids linearly dependent columns
# and keeps the matrix at full rank.
pd.get_dummies(
    jersey_df.loc[:, ['price', 'color', 'size']],
    drop_first=True,
)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,True,False
1,13.5,2,False,True
2,15.3,3,False,False
