In [53]:
import numpy as np 
import pandas as pd 
import holidays

## Load/combine files, remove unnecessary columns, fix data types, add columns to help with date & times

In [3]:
# Load the files into dataframes
early = pd.read_csv("_data/macianos.csv")
later = pd.read_csv("_data/macianos2.csv")

In [4]:
early.head()

Unnamed: 0,Job,Clocked In,Clocked Out,Duration,Hourly Rate,Earnings,Comment,Breaks,Adjustments,TotalTimeAdjustment,TotalEarningsAdjustment,TotalMileage
0,Macianos,8/3/20 4:40 PM,8/3/20 6:45 PM,2.08,0,59,,,59 tips,0,59,0
1,Macianos,8/4/20 4:08 PM,8/4/20 9:52 PM,5.73,0,108,,,108 tips,0,108,0
2,Macianos,8/5/20 5:01 PM,8/5/20 9:15 PM,4.23,0,81,,,81 tips,0,81,0
3,Macianos,8/6/20 4:25 PM,8/6/20 10:17 PM,5.87,0,124,,,124 tips,0,124,0
4,Macianos,8/7/20 4:25 PM,8/7/20 9:30 PM,5.08,0,122,,,122 tips,0,122,0


In [21]:
later.head()

Unnamed: 0,Job,Clocked In,Clocked Out,Duration,Hourly Rate,Earnings,Comment,Breaks,Adjustments,TotalTimeAdjustment,TotalEarningsAdjustment,TotalMileage
0,Maciano 2,7/10/21 4:19 PM,7/10/21 8:45 PM,4.43,0,175,,,175 tips,0.0,175,0
1,Maciano 2,7/11/21 12:47 PM,7/11/21 9:03 PM,8.27,0,338,,,338 tips,0.0,338,0
2,Maciano 2,7/15/21 4:26 PM,7/15/21 8:38 PM,4.2,0,147,,,147 tips,0.0,147,0
3,Maciano 2,7/16/21 4:16 PM,7/16/21 9:56 PM,5.67,0,162,,,162 tips,0.0,162,0
4,Maciano 2,7/17/21 4:24 PM,7/17/21 8:30 PM,4.1,0,109,,,109 tips,0.0,109,0


In [12]:
# concat both dataframes into a single dataframe
df = pd.concat([early, later])
df.head()

Unnamed: 0,Job,Clocked In,Clocked Out,Duration,Hourly Rate,Earnings,Comment,Breaks,Adjustments,TotalTimeAdjustment,TotalEarningsAdjustment,TotalMileage
0,Macianos,8/3/20 4:40 PM,8/3/20 6:45 PM,2.08,0,59,,,59 tips,0.0,59,0
1,Macianos,8/4/20 4:08 PM,8/4/20 9:52 PM,5.73,0,108,,,108 tips,0.0,108,0
2,Macianos,8/5/20 5:01 PM,8/5/20 9:15 PM,4.23,0,81,,,81 tips,0.0,81,0
3,Macianos,8/6/20 4:25 PM,8/6/20 10:17 PM,5.87,0,124,,,124 tips,0.0,124,0
4,Macianos,8/7/20 4:25 PM,8/7/20 9:30 PM,5.08,0,122,,,122 tips,0.0,122,0


In [25]:
# drop columns: Hourly Rate, Comments, Breaks, Adjustments, TotalTimeAdjustment, TotalEarningsAdjustment, TotalMileage
cols_to_drop = ['Hourly Rate', 'Comment', 'Breaks', 'Adjustments', 
                'TotalTimeAdjustment', 'TotalEarningsAdjustment', 'TotalMileage']
df = df.drop(cols_to_drop, axis = 1)

In [29]:
# check the info of the dataframe to check the column's value types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 308 entries, 0 to 127
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Job          308 non-null    object 
 1   Clocked In   308 non-null    object 
 2   Clocked Out  308 non-null    object 
 3   Duration     308 non-null    float64
 4   Earnings     308 non-null    int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 14.4+ KB


In [34]:
# convert Clocked In & Clocked Out columns to datetime
df['Clocked In']   = pd.to_datetime(df['Clocked In'])
df['Clocked Out']  = pd.to_datetime(df['Clocked Out'])

In [38]:
# create date column by extracting data and time from a type datetime64 column
df['Date'] = df['Clocked In'].dt.date

# overwrite Clocked In/Clocked Out columns to ONLY have times, NOT dates
df['Clocked In']   = df['Clocked In'].dt.time
df['Clocked Out']  = df['Clocked Out'].dt.time

In [51]:
# create Month, Day, and Year columns
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Year'] = df['Date'].dt.year

## Locate the holidays

To Do:
- 

In [57]:
# create a column specifying the holiday for a specific day (if it was one)
us_holidays = holidays.UnitedStates()

# boolean: holiday or not?
df['Is Holiday'] = df['Date'].apply(lambda x: x in us_holidays)

# Specify WHICH holiday it is
df['Holiday Name'] = df['Date'].apply(lambda x: us_holidays.get(x))

In [68]:
df['Holiday Name'].value_counts()

Holiday Name
New Year's Day                                     2
Memorial Day                                       1
Juneteenth National Independence Day (Observed)    1
Juneteenth National Independence Day               1
Independence Day                                   1
Christmas Day (Observed)                           1
New Year's Day (Observed)                          1
Name: count, dtype: int64

In [78]:
# checking to see which holidays the 'holiday' package was able to identify

for row in range(len(df)):
  if(df.iloc[row]['Holiday Name'] != None):
    print(row, df.iloc[row]['Holiday Name'], df.iloc[row]['Month'], df.iloc[row]['Day'])

85 New Year's Day 1 1
160 Memorial Day 5 31
170 Juneteenth National Independence Day (Observed) 6 18
171 Juneteenth National Independence Day 6 19
179 Independence Day 7 4
231 Christmas Day (Observed) 12 24
233 New Year's Day (Observed) 12 31
234 New Year's Day 1 1


In [95]:
# hard code for holidays (those whose dates do not change)

# for row in range(len(df)):
  
#   if df.loc[row, 'Month'] == 2 and df.loc[row, 'Day'] == 14:
#     df.loc[row, 'Holiday Name'] = "Valentine's Day"

  # # Valentine's Day
  # if( df.iloc[row]['Month'] == 2 and df.iloc[row]['Day'] == 14):
  #   df.iloc[row]['Holiday Name'] = "Valentine's Day"

  # # St Patrick's Day
  # elif( df.iloc[row]['Month'] == 3 and df.iloc[row]['Day'] == 17):
  #   df.iloc[row]['Holiday Name'] = "St. Patrick's Day"

  # # Halloween
  # elif( df.iloc[row]['Month'] == 10 and df.iloc[row]['Day'] == 31):
  #   df.iloc[row]['Holiday Name'] = "Halloween"

  # # Christmas Eve
  # elif( df.iloc[row]['Month'] == 12 and df.iloc[row]['Day'] == 24):
  #   df.iloc[row]['Holiday Name'] = "Christmas Eve"

  # # New Year's Eve
  # elif( df.iloc[row]['Month'] == 12 and df.iloc[row]['Day'] == 31):
  #   df.iloc[row]['Holiday Name'] = "New Year's Eve"

  # Valentine's Day
  # if df.loc[row, 'Month'] == 2 and df.loc[row, 'Day'] == 14:
  #   df.loc[row, 'Holiday Name'] = "Valentine's Day"

  # # St Patrick's Day
  # elif df.loc[row, 'Month'] == 3 and df.loc[row, 'Day'] == 17:
  #   df.loc[row, 'Holiday Name'] = "St. Patrick's Day"

  # # Halloween
  # elif df.loc[row, 'Month'] == 10 and df.loc[row, 'Day'] == 31:
  #   df.loc[row, 'Holiday Name'] = "Halloween"

  # # Christmas Eve
  # elif df.loc[row, 'Month'] == 12 and df.loc[row, 'Day'] == 24:
  #   df.loc[row, 'Holiday Name'] = "Christmas Eve"

  # # New Year's Eve
  # elif df.loc[row, 'Month'] == 12 and df.loc[row, 'Day'] == 31:
  #   df.loc[row, 'Holiday Name'] = "New Year's Eve"


for i in range(5):
  print(type(df.iloc[row]['Holiday Name']))
    



<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>


In [86]:
df['Holiday Name'].value_counts()

Holiday Name
New Year's Day                                     2
Memorial Day                                       1
Juneteenth National Independence Day (Observed)    1
Juneteenth National Independence Day               1
Independence Day                                   1
Christmas Day (Observed)                           1
New Year's Day (Observed)                          1
Name: count, dtype: int64

In [80]:
# fix the dataframe to account for more holidays, those not listed in the 'holidays' package



In [None]:
# Add a column to indicate whether or not here is Sunday/Monday football. Also, spot the super bowls