## Outline

[Missing values](#missing-values)

[Decision tree classifier](#decision-tree)

[Apply a mask](#mask)

[Imputation](#imputation)

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Toy frame with missing entries in both columns, used to explore how pandas
# represents and detects missing values.
# `np.nan` is the canonical spelling: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df = pd.DataFrame({"feature_1": [0.1, np.nan, np.nan, 0.4],
                   "feature_2": [1.1, 2.2, np.nan, np.nan]
                  })
df

Unnamed: 0,feature_1,feature_2
0,0.1,1.1
1,,2.2
2,,
3,0.4,


In [8]:
# Element-wise membership test: True only where a cell equals a listed value.
values_to_find = [0.1]
df.isin(values_to_find)

Unnamed: 0,feature_1,feature_2
0,True,False
1,False,False
2,False,False
3,False,False


In [11]:
# Compare the two missing-value masks cell by cell; an all-True result shows
# that isna() and isnull() flag exactly the same entries.
df.isna().eq(df.isnull())

Unnamed: 0,feature_1,feature_2
0,True,True
1,True,True
2,True,True
3,True,True


In [12]:
# Small all-boolean frame for demonstrating reductions over True/False values.
boolean_columns = {
    "col_1": [True, True, False],
    "col_2": [True, False, False],
}
df_booleans = pd.DataFrame(boolean_columns)
df_booleans

Unnamed: 0,col_1,col_2
0,True,True
1,True,False
2,False,False


In [15]:
# Reduce across the columns: a row is True when at least one of its cells is True.
df_booleans.any(axis="columns")

0     True
1     True
2    False
dtype: bool

In [17]:
# Each True counts as 1, so summing the column counts its True entries.
int(df_booleans["col_2"].sum())

1

In [18]:
# Toy training set: one numeric feature and one binary label per row.
feature_values = [0, 1, 2, 3]
labels = [0, 0, 1, 1]
X = pd.DataFrame({"feature_1": feature_values})
y = pd.Series(labels)

In [19]:
# Display the features and the labels together as one tuple.
(X, y)

(   feature_1
 0          0
 1          1
 2          2
 3          3,
 0    0
 1    0
 2    1
 3    1
 dtype: int64)

In [20]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
# fit() returns the estimator itself, so building and training the tree
# can be chained into a single assignment.
dt = DecisionTreeClassifier().fit(X, y)
dt

DecisionTreeClassifier()

In [24]:
# Hyperparameters collected in one dictionary so they can be unpacked
# into the classifier constructor as keyword arguments.
tree_parameters = dict(
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
)

In [25]:
# `**` unpacks the dictionary so each key/value pair is passed as a keyword
# argument, i.e. criterion='entropy', max_depth=10, min_samples_split=2.
dt = DecisionTreeClassifier(**tree_parameters)
dt

DecisionTreeClassifier(criterion='entropy', max_depth=10)

In [26]:
# Single-column frame holding the integers 0 through 4, used for the masking demo.
feature_values = list(range(5))
df = pd.DataFrame({"feature_1": feature_values})
df

Unnamed: 0,feature_1
0,0
1,1
2,2
3,3
4,4


In [27]:
# Boolean Series: True on the rows whose feature_1 value is at least 3.
mask = df["feature_1"].ge(3)
mask

0    False
1    False
2    False
3     True
4     True
Name: feature_1, dtype: bool

In [29]:
# Keep only the rows where the mask is True.
df.loc[mask]

Unnamed: 0,feature_1
3,3
4,4


### How to combine two logical operators for Series
What we want is to look at the same row of each of the two series, and compare each pair of items, one row at a time. To do this, use:
- the `&` operator instead of `and`
- the `|` operator instead of `or`.
- Also, you'll need to surround each comparison with parentheses `(...)`, because `&` and `|` bind more tightly than comparison operators such as `>=`.

In [32]:
# Build each comparison as its own boolean Series, then combine the two
# row by row with `&` (element-wise AND).
at_least_two = df["feature_1"] >= 2
at_most_three = df["feature_1"] <= 3
at_least_two & at_most_three

0    False
1    False
2     True
3     True
4    False
Name: feature_1, dtype: bool

In [33]:
# Frame for the imputation demo: feature_2 equals 10 * feature_1 except for
# two missing entries (rows 1 and 9).
# `np.nan` is the canonical spelling: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df = pd.DataFrame({"feature_1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   "feature_2": [0, np.nan, 20, 30, 40, 50, 60, 70, 80, np.nan, 100],
                  })
df

Unnamed: 0,feature_1,feature_2
0,0,0.0
1,1,
2,2,20.0
3,3,30.0
4,4,40.0
5,5,50.0
6,6,60.0
7,7,70.0
8,8,80.0
9,9,
10,10,100.0


In [34]:
from sklearn.impute import SimpleImputer

In [35]:
# Imputer that replaces every NaN with the mean of its column.
# `np.nan` is the canonical spelling: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputer

SimpleImputer()

In [36]:
# Fit the imputer on df so it learns the fill value for each column
# (the column mean, per strategy='mean'). Nothing is replaced yet.
mean_imputer.fit(df)

SimpleImputer()

In [37]:
# Replace the missing entries of df using the fitted imputer. The result is a
# NumPy array rather than a DataFrame, so the column names are not carried over.
nparray_imputed_mean = mean_imputer.transform(df)
nparray_imputed_mean

array([[  0.,   0.],
       [  1.,  50.],
       [  2.,  20.],
       [  3.,  30.],
       [  4.,  40.],
       [  5.,  50.],
       [  6.,  60.],
       [  7.,  70.],
       [  8.,  80.],
       [  9.,  50.],
       [ 10., 100.]])

In [39]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [40]:
# Iterative imputer with all-default settings. Importing it depends on the
# `enable_iterative_imputer` import from sklearn.experimental run beforehand.
reg_imputer = IterativeImputer()
reg_imputer

IterativeImputer()

In [41]:
# Fit the imputer on df and fill its missing entries in a single call.
# The result is a NumPy array, not a DataFrame.
nparray_imputed_reg = reg_imputer.fit_transform(df)
nparray_imputed_reg

array([[  0.,   0.],
       [  1.,  10.],
       [  2.,  20.],
       [  3.,  30.],
       [  4.,  40.],
       [  5.,  50.],
       [  6.,  60.],
       [  7.,  70.],
       [  8.,  80.],
       [  9.,  90.],
       [ 10., 100.]])