# Expose explicitly missing values with `complete`

In [1]:
import pandas as pd
import numpy as np 
import janitor


In [2]:
# Seed the legacy global NumPy RNG so the sampled data is reproducible.
np.random.seed(5)
size = 25

# Named `raw_data` rather than `dict` so the builtin `dict` is not shadowed
# for the rest of the kernel session.
raw_data = {
    'Customer': np.random.choice(['Bob'], size),
    'Grouping': np.random.choice(['Corn', 'Wheat', 'Soy'], size),
    'Date': np.random.choice(pd.date_range('1/1/2018', '12/12/2022', freq='D'), size),
    'Data': np.random.randint(20, 100, size=size),
}
df = pd.DataFrame(raw_data)

# create the Sub-Group column
# Start from an object-dtype column: writing strings into a float NaN column
# relies on silent upcasting, which newer pandas versions warn about.
df['Sub-Group'] = pd.Series(np.nan, index=df.index, dtype=object)

# Candidate sub-groups per grouping. The iteration order (Corn, Wheat, Soy)
# fixes the order of the random draws, so it must not be rearranged.
sub_groups = {
    'Corn': ['White', 'Dry'],
    'Wheat': ['SRW', 'HRW', 'SWW'],
    'Soy': ['Beans', 'Meal'],
}
for grouping, choices in sub_groups.items():
    mask = df['Grouping'] == grouping
    # One random sub-group per row belonging to this grouping.
    df.loc[mask, 'Sub-Group'] = np.random.choice(choices, size=mask.sum())

df['Year'] = df.Date.dt.year

# Sum `Data` per customer / grouping / sub-group / month / year, then move the
# innermost index level (Year) into the columns.
out = (
    df.groupby(['Customer', 'Grouping', 'Sub-Group', df['Date'].dt.month, 'Year'])
    .agg(Units=('Data', 'sum'))
    .unstack()
)

out

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Units,Units,Units,Units,Units
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Year,2018,2019,2020,2021,2022
Customer,Grouping,Sub-Group,Date,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bob,Corn,Dry,1,,,,,87.0
Bob,Corn,Dry,2,,,,25.0,
Bob,Corn,Dry,3,,,,23.0,
Bob,Corn,Dry,6,,,53.0,,
Bob,Corn,White,1,47.0,,,,
Bob,Corn,White,2,,,,27.0,
Bob,Corn,White,3,,,29.0,,
Bob,Corn,White,4,75.0,,,,
Bob,Corn,White,6,,51.0,,,
Bob,Soy,Beans,1,,27.0,,,


In [3]:
# Inspect the column labels of `out`; the output below shows a two-level
# MultiIndex of ('Units', <Year>) pairs.
out.columns

MultiIndex([('Units', 2018),
            ('Units', 2019),
            ('Units', 2020),
            ('Units', 2021),
            ('Units', 2022)],
           names=[None, 'Year'])

In [4]:
# Calls `complete` with a one-element list holding the MultiIndex column label
# ('Units', 2018).
# NOTE(review): the recorded output below is a nested list, not a DataFrame —
# confirm the intended result of this call.
out.complete([('Units', 2018)])

[[('Units', 2018)]]

In [5]:
# Sample data from http://imachordata.com/2016/02/05/you-complete-me/
df = pd.DataFrame(
    {
        "Year": [1999, 2000, 2004, 1999, 2004],
        # Three Saccharina observations followed by two Agarum observations.
        "Taxon": ["Saccharina"] * 3 + ["Agarum"] * 2,
        "Abundance": [4, 5, 2, 1, 8],
    }
)

df

Unnamed: 0,Year,Taxon,Abundance
0,1999,Saccharina,4
1,2000,Saccharina,5
2,2004,Saccharina,2
3,1999,Agarum,1
4,2004,Agarum,8


Note that the pairing of Year 2000 with Agarum is missing from the DataFrame above. Let’s make it explicit:

In [6]:
# Complete the frame over the "Year" and "Taxon" columns.
df.complete("Year", "Taxon")

['Year', 'Taxon']

What if we wanted the explicit missing values for all the years from 1999 to 2004? Easy - simply pass a dictionary pairing the column name with the new values:

In [7]:
# Map the column name to the full span of years; `+ 1` because `range`
# excludes its stop value.
new_year_values = {"Year": range(df["Year"].min(), df["Year"].max() + 1)}

df.complete(new_year_values, "Taxon", sort=True)

[{'Year': range(1999, 2005)}, 'Taxon']

You can also pass a callable as the value in the dictionary:

In [8]:
def new_year_values(year):
    """Return every year from the smallest to the largest in ``year``, inclusive.

    ``year`` only needs to provide ``min()`` and ``max()``; presumably
    ``complete`` passes in the existing column named by the dictionary key
    (TODO confirm against the pyjanitor documentation).
    """
    # `+ 1` because `range` excludes its stop value.
    return range(year.min(), year.max() + 1)


df.complete({"Year": new_year_values}, "Taxon")

[{'Year': <function __main__.<lambda>(year)>}, 'Taxon']

You can also expose missing rows using only the combinations that already exist in the data:

In [9]:
# Example data from https://stackoverflow.com/q/62266057/7175713
df = pd.DataFrame(
    {
        "Name": ("Bob", "Bob", "Emma"),
        "Age": (23, 23, 78),
        "Gender": ("Male", "Male", "Female"),
        "Item": ("house", "car", "house"),
        "Value": (5, 1, 3),
    }
)
df

Unnamed: 0,Name,Age,Gender,Item,Value
0,Bob,23,Male,house,5
1,Bob,23,Male,car,1
2,Emma,78,Female,house,3


In the DataFrame above, the `(Name, Age, Gender)` combination `(Emma, 78, Female)` has no `car` value for `Item`. Pass `(Name, Age, Gender)` and `Item` to explicitly expose the missing row:

In [10]:
# The tuple keeps Name/Age/Gender together as one unit, paired against Item.
df.complete(("Name", "Age", "Gender"), "Item")

[['Name', 'Age', 'Gender'], 'Item']

The example above showed how to expose missing rows on a group basis. Alternatively, you can expose missing rows within each group by using the `by` parameter:

In [11]:
# One (state, year, value) observation per row.
observations = [
    ("CA", 2010, 1),
    ("CA", 2013, 3),
    ("HI", 2010, 1),
    ("HI", 2012, 2),
    ("HI", 2016, 3),
    ("NY", 2009, 2),
    ("NY", 2013, 5),
]
# Transpose the rows into one list per column before building the frame.
states, years, values = (list(column) for column in zip(*observations))
df = pd.DataFrame({"state": states, "year": years, "value": values})

df

Unnamed: 0,state,year,value
0,CA,2010,1
1,CA,2013,3
2,HI,2010,1
3,HI,2012,2
4,HI,2016,3
5,NY,2009,2
6,NY,2013,5


Let's expose all the missing years, based on the minimum and maximum year, for each state:

In [12]:
# `new_year_values` is the min-to-max year callable already defined in the
# earlier callable example, so it is reused here instead of being re-bound.
# `by="state"` performs the completion for each state.
df.complete(
    {"year": new_year_values},
    by="state",
)



[{'year': <function __main__.<lambda>(year)>}]

You can fill the nulls with `fill_value`:

In [13]:
# Same per-state completion, with the resulting nulls filled with 0.
df.complete({"year": new_year_values}, by="state", fill_value=0)

[{'year': <function __main__.<lambda>(year)>}]