In [11]:
from pathlib import Path
import pandas as pd
import janitor


# Reload imported modules automatically before executing each cell, so edits
# to source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'retina' figure format for inline figures.
%config InlineBackend.figure_format = 'retina'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data

## Wrangling data takes time

## Wrangling data can sometimes mean messy code

In [12]:
import datetime as dt

import numpy as np

# Hand-rolled pandas cleanup of the spreadsheet: every step below reassigns
# (or adds a column to) `df`, and the final bare `df` displays the result.
df = pd.read_excel("../../examples/notebooks/dirty_data.xlsx")
# Remove the empty column and the empty row (index label 7).
df = df.drop("do not edit! --->", axis=1).drop(7, axis=0)
# Rename columns to something nicer.
df = df.rename(
    mapper={
        "First Name": "first_name",
        "Last Name": "last_name",
        "Employee Status": "employee_status",
        "Subject": "subject",
        "Hire Date": "hire_date",
        "% Allocated": "percentage_allocated",
        "Full time?": "is_full_time",
        "Certification": "certification",
    },
    axis=1,
)
# Get the "hire date" into shape: the raw values are day counts, so convert
# them to timedeltas and add them to the 1899-12-30 base date.
# pd.to_timedelta is used instead of pd.TimedeltaIndex(..., unit=...) because
# the `unit` keyword of the TimedeltaIndex constructor is deprecated in
# recent pandas releases.
df["hire_date"] = pd.to_timedelta(df["hire_date"], unit="D") + dt.datetime(1899, 12, 30)

# Those certification columns don't look particularly good. Should just have
# one of them: fill gaps in "certification" from "Certification.1", then drop
# the leftover certification columns.
df["certification"] = df["certification"].combine_first(df["Certification.1"])
df = df.drop(["Certification.1", "Certification.2"], axis=1)

# Add a column for "gratitude points" given by students to the teachers.
df = df.assign(gratitude_points=[10, 50, 20, 1000, 392, 115, 12, 182, 1190, 582, 25, 317])

# Finally, log10 transform the gratitude_points column (the ufunc is applied
# to the whole Series at once rather than through .apply).
df["gratitude_points_log"] = np.log10(df["gratitude_points"])

df

Unnamed: 0,first_name,last_name,employee_status,subject,hire_date,percentage_allocated,is_full_time,certification,gratitude_points,gratitude_points_log
0,Jason,Bourne,Teacher,PE,2008-08-30,0.75,Yes,Physical ed,10,1.0
1,Jason,Bourne,Teacher,Drafting,2008-08-30,0.25,Yes,Physical ed,50,1.69897
2,Alicia,Keys,Teacher,Music,2001-08-15,1.0,Yes,Instr. music,20,1.30103
3,Ada,Lovelace,Teacher,,1975-05-01,1.0,Yes,PENDING,1000,3.0
4,Desus,Nice,Administration,Dean,2013-06-06,1.0,Yes,PENDING,392,2.593286
5,Chien-Shiung,Wu,Teacher,Physics,1930-03-20,0.5,Yes,Science 6-12,115,2.060698
6,Chien-Shiung,Wu,Teacher,Chemistry,1930-03-20,0.5,Yes,Science 6-12,12,1.079181
8,James,Joyce,Teacher,English,1990-05-01,0.5,No,English 6-12,182,2.260071
9,Hedy,Lamarr,Teacher,Science,1976-06-08,0.5,No,PENDING,1190,3.075547
10,Carlos,Boozer,Coach,Basketball,2015-08-05,,No,Physical ed,582,2.764923


In [13]:
# The same cleanup as the manual pandas version, written as one pyjanitor
# method chain; `np` comes from the import in the previous cell.
df = (
    pd.read_excel("../../examples/notebooks/dirty_data.xlsx")
    # Discard the empty row/column instead of dropping them by label.
    .remove_empty()
    # Normalise the headers (e.g. "First Name" -> first_name).
    .clean_names(strip_underscores=True)
    # Collapse the certification columns into a single one.
    .coalesce(["certification", "certification_1"])
    # Turn the Excel day numbers in hire_date into dates.
    .convert_excel_date("hire_date")
    .rename_column("%_allocated", "percent_allocated")
    # "Gratitude points" given by students to the teachers.
    .add_column(
        "gratitude_points",
        [10, 50, 20, 1000, 392, 115, 12, 182, 1190, 582, 25, 317],
    )
    # Store log10(gratitude_points) in a new gratitude_log column.
    .transform_column(
        "gratitude_points",
        np.log10,
        "gratitude_log",
    )
)
df

Unnamed: 0,first_name,last_name,employee_status,subject,hire_date,percent_allocated,full_time,certification,gratitude_points,gratitude_log
0,Jason,Bourne,Teacher,PE,2008-08-30,0.75,Yes,Physical ed,10,1.0
1,Jason,Bourne,Teacher,Drafting,2008-08-30,0.25,Yes,Physical ed,50,1.69897
2,Alicia,Keys,Teacher,Music,2001-08-15,1.0,Yes,Instr. music,20,1.30103
3,Ada,Lovelace,Teacher,,1975-05-01,1.0,Yes,PENDING,1000,3.0
4,Desus,Nice,Administration,Dean,2013-06-06,1.0,Yes,PENDING,392,2.593286
5,Chien-Shiung,Wu,Teacher,Physics,1930-03-20,0.5,Yes,Science 6-12,115,2.060698
6,Chien-Shiung,Wu,Teacher,Chemistry,1930-03-20,0.5,Yes,Science 6-12,12,1.079181
7,James,Joyce,Teacher,English,1990-05-01,0.5,No,English 6-12,182,2.260071
8,Hedy,Lamarr,Teacher,Science,1976-06-08,0.5,No,PENDING,1190,3.075547
9,Carlos,Boozer,Coach,Basketball,2015-08-05,,No,Physical ed,582,2.764923


In [9]:
# Display the frame with "Administration" replaced by "Admin" in the
# employee_status column. The result is only shown as the cell output; it is
# not assigned back to `df`.
# NOTE(review): the saved output below has no gratitude_log column and this
# cell's execution count (9) is lower than the previous cell's (13), so the
# output looks stale; re-run from a fresh kernel to confirm.
df.find_replace("employee_status", {"Administration": "Admin"})

Unnamed: 0,first_name,last_name,employee_status,subject,hire_date,percent_allocated,full_time,certification,gratitude_points
0,Jason,Bourne,Teacher,PE,2008-08-30,0.75,Yes,Physical ed,10
1,Jason,Bourne,Teacher,Drafting,2008-08-30,0.25,Yes,Physical ed,50
2,Alicia,Keys,Teacher,Music,2001-08-15,1.0,Yes,Instr. music,20
3,Ada,Lovelace,Teacher,,1975-05-01,1.0,Yes,PENDING,1000
4,Desus,Nice,Admin,Dean,2013-06-06,1.0,Yes,PENDING,392
5,Chien-Shiung,Wu,Teacher,Physics,1930-03-20,0.5,Yes,Science 6-12,115
6,Chien-Shiung,Wu,Teacher,Chemistry,1930-03-20,0.5,Yes,Science 6-12,12
7,James,Joyce,Teacher,English,1990-05-01,0.5,No,English 6-12,182
8,Hedy,Lamarr,Teacher,Science,1976-06-08,0.5,No,PENDING,1190
9,Carlos,Boozer,Coach,Basketball,2015-08-05,,No,Physical ed,582
