In [48]:
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer


from IPython import display

In [146]:
'''
Toy data set stored as a nested list: every inner list is one
instance (row), and values at the same position across the inner
lists belong to the same attribute (column).  For example, 'DM',
'AM' and 'F' are all values of the first attribute.

There are three attributes and six instances; np.nan marks a
missing value.
'''
data = [
    # attribute 0, attribute 1, attribute 2
    ['DM', np.nan, 4.5],
    ['AM', 6, 3.0],
    ['LM', 1, 9.0],
    [np.nan, 9, 10],
    ['F', 1, np.nan],
    ['F', 10, 5.3],
]

In [22]:
'''
A NumPy array holds a single dtype, so when the nested list
mixes strings and numbers, every element is converted to a
string (numbers can always be written as strings, but strings
cannot always be turned into numbers).  The NaN markers become
the literal string 'nan' in the process.

Here the resulting numpy.dtype is '<U3', i.e. a unicode string
of length 3.

The array is bound to `arr` rather than `df`: it is not a
DataFrame, and `df` is the name used for the pandas frames
built in the other cells.
'''

arr = np.array(data)

print(arr)
print()
print(arr.dtype)

[['DM' 'nan' '4.5']
 ['AM' '6' '3.0']
 ['LM' '1' '9.0']
 ['nan' '9' '10']
 ['F' '1' 'nan']
 ['F' '10' '5.3']]

<U3


In [149]:
'''
Wrapping a string-typed numpy.array in a pandas.DataFrame
gives columns whose numpy.dtype is object, i.e. `dtype('O')`.
The last print shows the Python type of every cell.
'''

df = pd.DataFrame(np.array(data))

# The frame and its per-column dtypes, each followed by a blank line.
for view in (df, df.dtypes):
    print(view)
    print()

display.display(df.values.dtype)
print()
print(df.applymap(type))

     0    1    2
0   DM  nan  4.5
1   AM    6  3.0
2   LM    1  9.0
3  nan    9   10
4    F    1  nan
5    F   10  5.3

0    object
1    object
2    object
dtype: object



dtype('O')


               0              1              2
0  <class 'str'>  <class 'str'>  <class 'str'>
1  <class 'str'>  <class 'str'>  <class 'str'>
2  <class 'str'>  <class 'str'>  <class 'str'>
3  <class 'str'>  <class 'str'>  <class 'str'>
4  <class 'str'>  <class 'str'>  <class 'str'>
5  <class 'str'>  <class 'str'>  <class 'str'>


In [46]:
'''
Building the DataFrame straight from the nested list lets
pandas pick a numpy.dtype per column based on that column's
elements.

A column holding strings, or a mixture of floats and strings,
gets numpy.dtype = object.  A column whose elements are all
floats gets numpy.dtype = float64.  And so on.
'''

df = pd.DataFrame(data)

# One print call; a blank line separates consecutive items.
print(df, df.dtypes, df.values.dtype, df[0].dtypes, sep='\n\n')
print()
print(df.applymap(type))

     0     1     2
0   DM   NaN   4.5
1   AM   6.0   3.0
2   LM   1.0   9.0
3  NaN   9.0  10.0
4    F   1.0   NaN
5    F  10.0   5.3

0     object
1    float64
2    float64
dtype: object

object

object

                 0                1                2
0    <class 'str'>  <class 'float'>  <class 'float'>
1    <class 'str'>  <class 'float'>  <class 'float'>
2    <class 'str'>  <class 'float'>  <class 'float'>
3  <class 'float'>  <class 'float'>  <class 'float'>
4    <class 'str'>  <class 'float'>  <class 'float'>
5    <class 'str'>  <class 'float'>  <class 'float'>


In [115]:
'''
A numpy.dtype is rendered differently by print() and by
display.display().

For the object dtype, print() shows `object`, whereas
display.display() shows `dtype('O')`.
'''

df = pd.DataFrame(data)

display.display(df)
for view in (df.dtypes, df.values.dtype, df[0].dtypes, df.applymap(type)):
    print()  # blank line before each further display
    display.display(view)

Unnamed: 0,0,1,2
0,DM,,4.5
1,AM,6.0,3.0
2,LM,1.0,9.0
3,,9.0,10.0
4,F,1.0,
5,F,10.0,5.3





0     object
1    float64
2    float64
dtype: object




dtype('O')




dtype('O')




Unnamed: 0,0,1,2
0,<class 'str'>,<class 'float'>,<class 'float'>
1,<class 'str'>,<class 'float'>,<class 'float'>
2,<class 'str'>,<class 'float'>,<class 'float'>
3,<class 'float'>,<class 'float'>,<class 'float'>
4,<class 'str'>,<class 'float'>,<class 'float'>
5,<class 'str'>,<class 'float'>,<class 'float'>


In [155]:
# Display the Python type of the frame and of each dtype-related
# object derived from it.
df = pd.DataFrame(data)

inspected = [
    df,
    df.dtypes,
    df.values.dtype,
    df[0].dtypes,
    df.applymap(type),
]
for position, obj in enumerate(inspected):
    if position:
        print()  # blank line between consecutive displays
    display.display(type(obj))

pandas.core.frame.DataFrame




pandas.core.series.Series




numpy.dtype




numpy.dtype




pandas.core.frame.DataFrame

In [151]:
# Section heading for the LabelBinarizer cells, stored as a string;
# `s` is not read by any other cell shown in this notebook.
s = '''
dtype and sklearn.preprocessing.LabelBinarizer

Though, this probably also applies to many other sklearn estimators
in general.
'''

In [156]:
# Show the frame built from the nested list, for reference in the
# encoder cells.
df

Unnamed: 0,0,1,2
0,DM,,4.5
1,AM,6.0,3.0
2,LM,1.0,9.0
3,,9.0,10.0
4,F,1.0,
5,F,10.0,5.3


In [157]:
'''
The encoder only works when there are no missing values
(such as NaN), so they are replaced with a placeholder
label first.
'''

labels_filled = df[0].fillna('GK')

encoder = LabelBinarizer()
encoder.fit_transform(labels_filled)

array([[0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0]])

In [159]:
'''
This column has numpy.dtype = float64, and all of its
elements are also of type float.

Its missing value likewise needs to be filled first.
'''

col_1 = df[1]
print(col_1.dtypes, (col_1.apply(type) == float).all(), sep='\n\n')


encoder = LabelBinarizer()
encoder.fit_transform(col_1.fillna(100))

float64

True


array([[0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

In [161]:
'''
Values of type float cannot have any decimals, so this
column is multiplied by 100 before it is encoded.
'''

col_2 = df[2]
print(col_2.dtypes, (col_2.apply(type) == float).all(), sep='\n\n')


scaled = (col_2 * 1e2).fillna(0)

encoder = LabelBinarizer()
encoder.fit_transform(scaled)

float64

True


array([[0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0]])

In [162]:
'''
Fill in the missing values of all columns here,
using a separate replacement value for each column.
'''

# Column label -> value that replaces NaN in that column.
fill_values = {0: 'GK', 1: 1, 2: 100}

dff = df.copy()
for column, value in fill_values.items():
    dff[column] = dff[column].fillna(value)

In [163]:
'''
Note that if, instead of using .copy(), the same DataFrame
is rebuilt through .values, every resulting column has
numpy.dtype = object: dff.values is a single numpy array,
and a numpy array necessarily has just ONE numpy.dtype.
'''

df1 = pd.DataFrame(
    data=dff.values,
    columns=dff.columns,
    index=dff.index,
)

df1.dtypes

0    object
1    object
2    object
dtype: object

In [165]:
'''
Note that even though the column has numpy.dtype = object,
its elements can all be of type float.
'''

print(df1[1].dtype, df1[1].apply(type), sep='\n\n')

object

0    <class 'float'>
1    <class 'float'>
2    <class 'float'>
3    <class 'float'>
4    <class 'float'>
5    <class 'float'>
Name: 1, dtype: object


In [166]:
'''
When the numpy.dtype is object and the elements' type is
float, the encoder does not work: fit_transform raises a
ValueError.

The error is the point of this cell, so it is caught and
printed.  Left uncaught, it would stop a "Restart & Run All"
at this cell and the remaining cells would never execute.
'''

encoder = LabelBinarizer()
try:
    encoded = encoder.fit_transform(df1[1].values)
except ValueError as err:
    # Same "ValueError: <message>" form as the last line of a traceback.
    print('ValueError: {}'.format(err))
else:
    # Not expected; show the result rather than dropping it silently.
    display.display(encoded)

ValueError: Unknown label type: (array([1.0, 6.0, 1.0, 9.0, 1.0, 10.0], dtype=object),)

In [170]:
'''
pandas.to_numeric() can be used to convert the numpy.dtype
of a column to float64.  Then the encoder works.

So it would appear that the column's dtype needs to be
consistent with the type of its elements for the encoder
to work.
'''
# Convert once and reuse the result for both the report and the encoding.
numeric_col = pd.to_numeric(df1[1])
print(numeric_col.dtype, numeric_col.apply(type), sep='\n\n')

# Create the encoder here, as the other encoder cells do, so this cell
# does not rely on an instance left over from a different cell.
encoder = LabelBinarizer()
encoder.fit_transform(numeric_col)

float64

0    <class 'float'>
1    <class 'float'>
2    <class 'float'>
3    <class 'float'>
4    <class 'float'>
5    <class 'float'>
Name: 1, dtype: object


array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1]])

In [174]:
'''
The numpy.dtype of multiple columns of a dataframe can be
converted at the same time.  Only columns whose values can
actually be converted to numbers may be selected, because a
column containing strings would raise an error.
'''

convertible = [1, 2]
df1[convertible].apply(pd.to_numeric).dtypes

1    float64
2    float64
dtype: object