# Expose explicitly missing values with `complete`

In [1]:
import pandas as pd
import numpy as np 
import janitor

In [2]:
# Sample data taken from http://imachordata.com/2016/02/05/you-complete-me/
# Three Saccharina records followed by two Agarum records.
df = pd.DataFrame(
    {
        "Year": [1999, 2000, 2004, 1999, 2004],
        "Taxon": ["Saccharina"] * 3 + ["Agarum"] * 2,
        "Abundance": [4, 5, 2, 1, 8],
    }
)

df

Unnamed: 0,Year,Taxon,Abundance
0,1999,Saccharina,4
1,2000,Saccharina,5
2,2004,Saccharina,2
3,1999,Agarum,1
4,2004,Agarum,8


Note that the pairing of Year 2000 with Agarum is missing from the DataFrame above. Let’s make it explicit:

In [3]:
# Add a row for every Year/Taxon combination that is absent from `df`.
df.complete("Year", "Taxon")

Unnamed: 0,Year,Taxon,Abundance
0,1999,Saccharina,4.0
1,2000,Saccharina,5.0
2,2004,Saccharina,2.0
3,1999,Agarum,1.0
4,2004,Agarum,8.0
5,2000,Agarum,


In [4]:
# Same completion, with the rows sorted so the gaps are easier to spot
df.complete("Year", "Taxon", sort=True)

Unnamed: 0,Year,Taxon,Abundance
0,1999,Agarum,1.0
1,1999,Saccharina,4.0
2,2000,Agarum,
3,2000,Saccharina,5.0
4,2004,Agarum,8.0
5,2004,Saccharina,2.0


What if we wanted the explicit missing values for all the years from 1999 to 2004? Easy - simply pass a dictionary pairing the column name with the new values:

In [5]:
# Candidate years: every year from the earliest to the latest one observed, inclusive.
new_year_values = {"Year": range(df["Year"].min(), df["Year"].max() + 1)}

df.complete(new_year_values, "Taxon")

Unnamed: 0,Year,Taxon,Abundance
0,1999,Saccharina,4.0
1,2000,Saccharina,5.0
2,2004,Saccharina,2.0
3,1999,Agarum,1.0
4,2004,Agarum,8.0
5,2000,Agarum,
6,2001,Saccharina,
7,2001,Agarum,
8,2002,Saccharina,
9,2002,Agarum,


You can pass a callable as values in the dictionary:

In [6]:
def new_year_values(year):
    """Return every value from the smallest to the largest entry of ``year``, inclusive.

    Parameters
    ----------
    year
        Object exposing ``.min()`` and ``.max()`` -- presumably the ``Year``
        column that ``complete`` hands to the callable (confirm against the
        pyjanitor documentation).

    Returns
    -------
    range
        ``range(year.min(), year.max() + 1)``.
    """
    return range(year.min(), year.max() + 1)


# A named function is used instead of a lambda bound to a name (PEP 8).
df.complete({"Year": new_year_values}, "Taxon", sort=True)

Unnamed: 0,Year,Taxon,Abundance
0,1999,Agarum,1.0
1,1999,Saccharina,4.0
2,2000,Agarum,
3,2000,Saccharina,5.0
4,2001,Agarum,
5,2001,Saccharina,
6,2002,Agarum,
7,2002,Saccharina,
8,2003,Agarum,
9,2003,Saccharina,


You can get explicit rows, based only on existing data:

In [7]:
# Example data from https://stackoverflow.com/q/62266057/7175713
# Build the frame in one step so `df` is only ever bound to a DataFrame,
# never to the intermediate dict.
df = pd.DataFrame(
    {
        "Name": ["Bob", "Bob", "Emma"],
        "Age": [23, 23, 78],
        "Gender": ["Male", "Male", "Female"],
        "Item": ["house", "car", "house"],
        "Value": [5, 1, 3],
    }
)
df

Unnamed: 0,Name,Age,Gender,Item,Value
0,Bob,23,Male,house,5
1,Bob,23,Male,car,1
2,Emma,78,Female,house,3


In the DataFrame above, there is no `car` Item value for the `Name`, `Age`, `Gender` combination `(Emma, 78, Female)`. Pass `(Name, Age, Gender)` and `Item` to explicitly expose the missing row:

In [8]:
# Treat (Name, Age, Gender) as a single unit, so only the combinations that
# already occur in the data are paired with each Item.
df.complete(("Name", "Age", "Gender"), "Item")

Unnamed: 0,Name,Age,Gender,Item,Value
0,Bob,23,Male,house,5.0
1,Bob,23,Male,car,1.0
2,Emma,78,Female,house,3.0
3,Emma,78,Female,car,


The example above showed how to expose missing rows on a group basis. There is also the option of exposing missing rows with the `by` parameter: 

In [9]:
# Values recorded per state and year; every state has gaps in its run of years.
df = pd.DataFrame(
    {
        "state": ["CA"] * 2 + ["HI"] * 3 + ["NY"] * 2,
        "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013],
        "value": [1, 3, 1, 2, 3, 2, 5],
    }
)

df

Unnamed: 0,state,year,value
0,CA,2010,1
1,CA,2013,3
2,HI,2010,1
3,HI,2012,2
4,HI,2016,3
5,NY,2009,2
6,NY,2013,5


Let's expose all the missing years, based on the minimum and maximum year, for each state:

In [10]:
# Within each state, add a row for every year between that state's first and
# last recorded year; `new_year_values` supplies the range for each group.
result = df.complete({"year": new_year_values}, by="state", sort=True)

result

Unnamed: 0,state,year,value
0,CA,2010,1.0
1,CA,2011,
2,CA,2012,
3,CA,2013,3.0
4,HI,2010,1.0
5,HI,2011,
6,HI,2012,2.0
7,HI,2013,
8,HI,2014,
9,HI,2015,


You can fill the nulls with Pandas' `fillna`:

In [11]:
# `fillna`'s `downcast` argument is deprecated (pandas 2.2+), so fill the gaps
# first and then restore the integer dtype of `value` explicitly.
result.fillna(0).astype({"value": "int64"})

Unnamed: 0,state,year,value
0,CA,2010,1
1,CA,2011,0
2,CA,2012,0
3,CA,2013,3
4,HI,2010,1
5,HI,2011,0
6,HI,2012,2
7,HI,2013,0
8,HI,2014,0
9,HI,2015,0
