# Tutorial 2: Data Manipulation
In this tutorial we will simulate how I converted a manual, month-to-month check for changes in the data into an automated process.

In [20]:
import pandas as pd
import numpy as np

In [21]:
# load the July and August snapshots (in that order) from the data folder
july, august = map(pd.read_csv, ('./data/2023-07-04.csv', './data/2023-08-04.csv'))

## Prepping Data
First, we need to modify the data so that our process has changes to detect.

In [22]:
# randomly pick one tenth of the August employee ids (rounded down) to modify
changes = august['employee_id'].sample(n=august.shape[0] // 10)

In [23]:
# preview the rows of the selected employees before anything is modified
august[august['employee_id'].isin(changes.values)]


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73,0
36,64062,Technology,region_28,Bachelor's,m,sourcing,1,33,4.0,4,0,75,0
52,38330,Procurement,region_15,Bachelor's,m,sourcing,1,40,3.0,12,0,65,0
62,9150,Analytics,region_22,Bachelor's,f,other,1,28,,1,0,80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54756,33335,Operations,region_32,Bachelor's,f,sourcing,1,35,5.0,4,0,63,0
54764,52939,Sales & Marketing,region_22,Bachelor's,m,sourcing,1,46,1.0,4,0,47,0
54781,76497,HR,region_27,Bachelor's,f,sourcing,1,38,5.0,3,0,50,0
54795,13477,Procurement,region_15,Master's & above,m,other,1,35,3.0,7,0,70,0


In [24]:
# Assign a randomly drawn department to each of the chosen employees.
unique_departments = august['department'].unique()
# Build the row mask once and size the random draw from the mask itself:
# if an employee_id occurs on more than one row, the mask selects more rows
# than len(changes) and the assignment would fail with a length mismatch.
changed_rows = august['employee_id'].isin(changes.values)
# NOTE: the draw covers every department, so a row can be assigned the
# department it already has.
august.loc[changed_rows, 'department'] = np.random.choice(
    unique_departments, size=int(changed_rows.sum()), replace=True
)

In [25]:
# bump the training count by one for every selected employee
august['no_of_trainings'] += august['employee_id'].isin(changes.values).astype(int)

In [26]:
# flag every selected employee as promoted
august['is_promoted'] = august['is_promoted'].mask(august['employee_id'].isin(changes.values), 1)

In [27]:
# show the same employees again, now with the modifications applied
august[august['employee_id'].isin(changes.values)]

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
3,2542,Analytics,region_23,Bachelor's,m,other,3,39,1.0,10,0,50,1
4,48945,Procurement,region_26,Bachelor's,m,other,2,45,3.0,2,0,73,1
36,64062,Finance,region_28,Bachelor's,m,sourcing,2,33,4.0,4,0,75,1
52,38330,HR,region_15,Bachelor's,m,sourcing,2,40,3.0,12,0,65,1
62,9150,Procurement,region_22,Bachelor's,f,other,2,28,,1,0,80,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54756,33335,Legal,region_32,Bachelor's,f,sourcing,2,35,5.0,4,0,63,1
54764,52939,Sales & Marketing,region_22,Bachelor's,m,sourcing,2,46,1.0,4,0,47,1
54781,76497,Procurement,region_27,Bachelor's,f,sourcing,2,38,5.0,3,0,50,1
54795,13477,Legal,region_15,Master's & above,m,other,2,35,3.0,7,0,70,1


In [29]:
# write the modified August data to disk so it can be loaded back the way a real monthly file would be
august.to_csv(path_or_buf='./data/august.csv', index=False)

# store July under a simpler file name as well
july.to_csv(path_or_buf='./data/july.csv', index=False)

## Automate Checking for Changes
Here we will automate what used to be a manual process for tracking and looking at changes.