
# Exercises on Missing Values


In [11]:
import pandas as pd
import numpy as np
import os
os.chdir('C:/Users/cepedazk/Jupyter Notebook/Datasets') # you can change to your own working directory

* `data_missing.csv` is on Moodle.
  * Download the dataset and read it in to Python, making sure all missing values are read in correctly.
  * Are there any unexpected values in the OWN_OCCUPIED column? how about NUM_BEDROOMS? If so, replace them with NaN.
* Use Python code to show which columns have missing values.
* Find the number of missing values in each column.
* Filter the dataset so we only see the rows with no missing values.
* Filter the dataset so we only see the rows with two or less missing values.
* Show the columns ST_NUM and ST_NAME for the rows with missing values in OWN_OCCUPIED.
* Show the columns ST_NUM and ST_NAME for the rows that do not have missing values in OWN_OCCUPIED.
* Fill in the missing values of NUM_BEDROOMS with the previous value in the column.
* Fill in the missing values of NUM_BEDROOMS with the median value of the column. Store this as the new dataset.

#### Quick Notes on Missing Values

* Replacing missing values with substituted values is called imputation.
* This can be a very advanced topic.
* There are many methods for imputation, for example mean imputation, regression imputation.
* We have used `fillna()` and `replace()` for imputation.
* There are other functions in Python for imputation eg: df.interpolate(method = 'linear', axis = 0)
* “Interpolation is a type of estimation, a method of constructing new data points within the range of a discrete set of known data points.”

In [1]:
# You can start the exercises here or create a new notebook


Once you are ready, solutions are available below. Just expand the block of cells.

### Solutions

In [93]:
# We will not be looking at any of the columns after Referee. 
mycsv = pd.read_csv("data_missing.csv")

In [94]:
# Are there any unexpected values in the OWN_OCCUPIED column? If so, replace them with NaN.
mycsv

Unnamed: 0,ST_NUM,ST_NAME,NUM_BEDROOMS,OWN_OCCUPIED
0,104.0,PUTNAM,3,Y
1,197.0,LEXINGTON,3,N
2,,LEXINGTON,,N
3,201.0,BERKELEY,1,12
4,203.0,BERKELEY,3,Y
5,207.0,BERKELEY,,Y
6,,WASHINGTON,2,
7,213.0,TREMONT,--,Y
8,215.0,TREMONT,na,Y


In [95]:
# Are there any unexpected values in the OWN_OCCUPIED column? If so, replace them with NaN.
mycsv["OWN_OCCUPIED"].replace('12', np.nan, inplace=True)
mycsv["NUM_BEDROOMS"].replace('--', np.nan, inplace=True)
mycsv["NUM_BEDROOMS"].replace('na', np.nan, inplace=True)

# or
# mycsv = pd.read_csv("data_missing.csv", na_values=['12','--','na'])

In [96]:
display(mycsv)

Unnamed: 0,ST_NUM,ST_NAME,NUM_BEDROOMS,OWN_OCCUPIED
0,104.0,PUTNAM,3.0,Y
1,197.0,LEXINGTON,3.0,N
2,,LEXINGTON,,N
3,201.0,BERKELEY,1.0,
4,203.0,BERKELEY,3.0,Y
5,207.0,BERKELEY,,Y
6,,WASHINGTON,2.0,
7,213.0,TREMONT,,Y
8,215.0,TREMONT,,Y


In [52]:
# Use Python code to show which columns have missing values.
mycsv.isna().any()

ST_NUM           True
ST_NAME         False
NUM_BEDROOMS     True
OWN_OCCUPIED     True
dtype: bool

In [53]:
# Find the number of missing values in each column.
mycsv.isna().sum()

ST_NUM          2
ST_NAME         0
NUM_BEDROOMS    4
OWN_OCCUPIED    2
dtype: int64

In [58]:
# Filter the dataset so we only see the rows with no missing values.

mycsv[mycsv.notna().all(axis=1)] # where "all" returns all rows (axis=1) that does not contain missing values

# another alternative
# mycsv.dropna() # it does not alter the original dataframe, so it is ok to use it to see the rows with no missing values.

Unnamed: 0,ST_NUM,ST_NAME,NUM_BEDROOMS,OWN_OCCUPIED
0,104.0,PUTNAM,3,Y
1,197.0,LEXINGTON,3,N
4,203.0,BERKELEY,3,Y


In [59]:
# Filter the dataset so we only see the rows with two or less missing values.
mycsv[mycsv.isna().any(axis=1)]

Unnamed: 0,ST_NUM,ST_NAME,NUM_BEDROOMS,OWN_OCCUPIED
2,,LEXINGTON,,N
3,201.0,BERKELEY,1.0,
5,207.0,BERKELEY,,Y
6,,WASHINGTON,2.0,
7,213.0,TREMONT,,Y
8,215.0,TREMONT,,Y


In [68]:
# Show the columns ST_NUM and ST_NAME for the rows with missing values in OWN_OCCUPIED.
mycsv.loc[mycsv["OWN_OCCUPIED"].isna(), ["ST_NUM", "ST_NAME"]]

Unnamed: 0,ST_NUM,ST_NAME
3,201.0,BERKELEY
6,,WASHINGTON


In [78]:
# Show the columns ST_NUM and ST_NAME for the rows that do not have missing values in OWN_OCCUPIED.
mycsv.loc[mycsv["OWN_OCCUPIED"].notna(), ["ST_NUM", "ST_NAME"]]

Unnamed: 0,ST_NUM,ST_NAME
0,104.0,PUTNAM
1,197.0,LEXINGTON
2,,LEXINGTON
4,203.0,BERKELEY
5,207.0,BERKELEY
7,213.0,TREMONT
8,215.0,TREMONT


In [87]:
# Fill in the missing values of NUM_BEDROOMS with the previous value in the column.
mycsv[['NUM_BEDROOMS']].fillna(method='ffill', axis=0)

Unnamed: 0,NUM_BEDROOMS
0,3
1,3
2,3
3,1
4,3
5,3
6,2
7,2
8,2


In [109]:
# Fill in the missing values of NUM_BEDROOMS with the median value of the column. Store this as the new dataset.
num_bed_median = mycsv['NUM_BEDROOMS'].median()
mycsv['NUM_BEDROOMS'] = mycsv.loc[:,['NUM_BEDROOMS']].fillna(num_bed_median)
display(mycsv)

Unnamed: 0,ST_NUM,ST_NAME,NUM_BEDROOMS,OWN_OCCUPIED
0,104.0,PUTNAM,3.0,Y
1,197.0,LEXINGTON,3.0,N
2,,LEXINGTON,3.0,N
3,201.0,BERKELEY,1.0,
4,203.0,BERKELEY,3.0,Y
5,207.0,BERKELEY,3.0,Y
6,,WASHINGTON,2.0,
7,213.0,TREMONT,3.0,Y
8,215.0,TREMONT,3.0,Y


In [119]:
# Extra
# If you inspect the data type of NUM_BEDROOMS, you will see it as "object"
mycsv.dtypes

ST_NUM          float64
ST_NAME          object
NUM_BEDROOMS     object
OWN_OCCUPIED     object
dtype: object

In [131]:
# to change the data type, use astype(int) to convert object to int type
mycsv.NUM_BEDROOMS = mycsv.NUM_BEDROOMS.astype(int)

In [129]:
mycsv.dtypes

ST_NUM          float64
ST_NAME          object
NUM_BEDROOMS      int32
OWN_OCCUPIED     object
dtype: object