In [48]:
import pandas as pd
import numpy as np

In [80]:
# Load the four raw CSV extracts from the local data/ folder.
# The unpacking order matches the order of the paths below.
benefits_data, employee_data, feedback_data, usage_data = map(
    pd.read_csv,
    (
        'data/benefits_data.csv',
        'data/employee_data.csv',
        'data/feedback_data.csv',
        'data/usage_data.csv',
    ),
)


In [104]:
#data profiling

def profile_dataset(df):
    """Print a plain-text profile of ``df`` and return ``df`` unchanged.

    Sections printed, in order: shape, dtypes, columns with missing values,
    ``describe()`` summary, IQR-based outlier counts for numeric columns,
    top-10 value counts for object/category columns, duplicate-row count and
    per-column cardinality.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).
    """

    #Shape
    print("="*50)
    print(f"Dataset shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print("="*50)

    #Data Type Info
    print("\n--- Data Types ---")
    print(df.dtypes)

    #Missing values (computed once, then filtered to columns that have any)
    print("\n--- Missing Values ---")
    missing_counts = df.isnull().sum()
    print(missing_counts[missing_counts > 0])

    #Numerical Data Summ
    print("\n--- Numeric Columns Summary ---")
    if df.shape[1] == 0:
        # describe() raises ValueError on a frame without columns.
        print("No columns to summarise")
    else:
        print(df.describe().T)

    #Outlier Checks: a value is flagged when it lies more than 1.5 * IQR
    #below the first quartile or above the third quartile.
    print("\n--- Outlier Check Using IQR ---")
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outlier_count = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        if outlier_count > 0:
            # Report the count for this column (previously an empty dict was printed).
            print(f"{col}: {outlier_count} potential outliers")
        else:
            print(f"{col}: No significant outliers detected")

    #Categorical Data Summ
    #'category' is included so columns already cast with astype('category')
    #or produced by pd.cut are summarised as well.
    print("\n--- Categorical Columns Summary ---")
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        print(f"\nColumn: {col}")
        print(df[col].value_counts(dropna=False).head(10))  # top 10 frequent

    #Duplicate Check (fully identical rows)
    print("\n--- Duplicate Rows ---")
    dup_count = df.duplicated().sum()
    print(f"Number of duplicate rows: {dup_count}")

    #Column Cardinality (nunique ignores NaN)
    print("\n--- Cardinality of Columns ---")
    for col in df.columns:
        print(f"{col}: {df[col].nunique()} unique values")

    return df


In [105]:
# Print the profile for the benefits table. profile_dataset returns the frame it
# was given, so benefits_df refers to the same object as benefits_data.
benefits_df = profile_dataset(benefits_data)

Dataset shape: 30 rows × 4 columns

--- Data Types ---
BenefitID           int64
BenefitType        object
BenefitSubType     object
BenefitCost       float64
dtype: object

--- Missing Values ---
Series([], dtype: int64)

--- Numeric Columns Summary ---
             count     mean         std   min     25%      50%       75%  \
BenefitID     30.0   15.500    8.803408   1.0    8.25   15.500   22.7500   
BenefitCost   30.0  523.836  302.652678  65.0  277.33  514.655  779.3125   

                max  
BenefitID     30.00  
BenefitCost  969.28  

--- Outlier Check Using IQR ---
BenefitID: No significant outliers detected
BenefitCost: No significant outliers detected

--- Categorical Columns Summary ---

Column: BenefitType
BenefitType
Retirement Plan              6
Health Insurance             4
Tuition Reimbursement        4
Gym Membership               4
Life Insurance               4
Childcare                    2
Commuter Benefits            1
Technology Stipend           1
Flexible 

In [106]:
# Print the profile for the employee table. profile_dataset returns the frame it
# was given, so employee_df refers to the same object as employee_data.
employee_df = profile_dataset(employee_data)

Dataset shape: 5000 rows × 5 columns

--- Data Types ---
EmployeeID     int64
Age            int64
Gender        object
Department    object
Tenure         int64
dtype: object

--- Missing Values ---
Series([], dtype: int64)

--- Numeric Columns Summary ---
             count       mean          std   min      25%     50%      75%  \
EmployeeID  5000.0  2500.5000  1443.520003   1.0  1250.75  2500.5  3750.25   
Age         5000.0    43.4662    12.813278  22.0    32.00    43.0    55.00   
Tenure      5000.0    12.8460     9.915863   1.0     5.00    10.0    19.00   

               max  
EmployeeID  5000.0  
Age           65.0  
Tenure        40.0  

--- Outlier Check Using IQR ---
EmployeeID: No significant outliers detected
Age: No significant outliers detected
Tenure: No significant outliers detected

--- Categorical Columns Summary ---

Column: Gender
Gender
Male          2489
Female        2413
Non-Binary      98
Name: count, dtype: int64

Column: Department
Department
IT           1

In [107]:
# Print the profile for the feedback table. profile_dataset returns the frame it
# was given, so feedback_df refers to the same object as feedback_data.
feedback_df = profile_dataset(feedback_data)

Dataset shape: 30000 rows × 4 columns

--- Data Types ---
EmployeeID            int64
BenefitID             int64
SatisfactionScore     int64
Comments             object
dtype: object

--- Missing Values ---
Series([], dtype: int64)

--- Numeric Columns Summary ---
                     count         mean          std  min     25%     50%  \
EmployeeID         30000.0  2496.277467  1448.017411  1.0  1235.0  2504.0   
BenefitID          30000.0    15.513200     8.673062  1.0     8.0    16.0   
SatisfactionScore  30000.0     3.004300     1.420921  1.0     2.0     3.0   

                      75%     max  
EmployeeID         3751.0  5000.0  
BenefitID            23.0    30.0  
SatisfactionScore     4.0     5.0  

--- Outlier Check Using IQR ---
EmployeeID: No significant outliers detected
BenefitID: No significant outliers detected
SatisfactionScore: No significant outliers detected

--- Categorical Columns Summary ---

Column: Comments
Comments
Unhappy with this benefit.             1258

In [108]:
# Print the profile for the usage table. profile_dataset returns the frame it
# was given, so usage_df refers to the same object as usage_data.
usage_df = profile_dataset(usage_data)

Dataset shape: 50000 rows × 4 columns

--- Data Types ---
EmployeeID         int64
BenefitID          int64
UsageFrequency     int64
LastUsedDate      object
dtype: object

--- Missing Values ---
Series([], dtype: int64)

--- Numeric Columns Summary ---
                  count        mean          std  min     25%     50%     75%  \
EmployeeID      50000.0  2513.83360  1445.499017  1.0  1263.0  2520.0  3772.0   
BenefitID       50000.0    15.49882     8.680902  1.0     8.0    16.0    23.0   
UsageFrequency  50000.0     3.34836     3.139892  0.0     0.0     3.0     5.0   

                   max  
EmployeeID      5000.0  
BenefitID         30.0  
UsageFrequency    10.0  

--- Outlier Check Using IQR ---
EmployeeID: No significant outliers detected
BenefitID: No significant outliers detected
UsageFrequency: No significant outliers detected

--- Categorical Columns Summary ---

Column: LastUsedDate
LastUsedDate
2024-02-11    170
2024-02-14    169
2024-01-15    168
2024-05-07    167
2024-0

In [86]:
# from ydata_profiling import ProfileReport
# import pandas as pd
# profile = ProfileReport(benefits_data, explorative=True)
# profile.to_file("profile_report.html")

In [87]:
# Build one analysis table by chaining three inner joins:
#   usage -> employee attributes (EmployeeID)
#         -> benefit attributes  (BenefitID)
#         -> feedback            (EmployeeID + BenefitID)
# Inner joins keep only rows that have a match on every side.
# NOTE(review): the (EmployeeID, BenefitID) pair looks non-unique in both usage
# and feedback, which would make the last join many-to-many - confirm intended.
usage_employee = usage_data.merge(employee_data, how='inner', on='EmployeeID')
on_ben = usage_employee.merge(benefits_data, how='inner', on='BenefitID')
merged = on_ben.merge(feedback_data, how='inner', on=['EmployeeID', 'BenefitID'])

In [88]:
# Standardise column types on the merged frame:
#   LastUsedDate       -> datetime (values that cannot be parsed become NaT)
#   Gender, Department -> pandas categorical
merged = merged.assign(
    LastUsedDate=pd.to_datetime(merged['LastUsedDate'], errors='coerce')
).astype({'Gender': 'category', 'Department': 'category'})

In [89]:
# Display the merged frame to inspect the result of the joins and type casts.
merged

Unnamed: 0,EmployeeID,BenefitID,UsageFrequency,LastUsedDate,Age,Gender,Department,Tenure,BenefitType,BenefitSubType,BenefitCost,SatisfactionScore,Comments
0,220,20,4,2024-05-03,64,Male,HR,35,Tuition Reimbursement,Undergraduate Degree,489.96,1,Not worth the hassle.
1,1820,26,1,2024-02-08,53,Male,Finance,2,Gym Membership,Family Membership,519.66,2,Not many locations available.
2,285,16,2,2023-10-27,64,Male,Marketing,35,Health Insurance,HDHP Individual,84.55,3,"Satisfactory, but could improve."
3,4536,8,8,2024-07-03,32,Female,Sales,10,Wellness Programs,Premium Discount Tier 1,125.00,1,Barely any time off given.
4,1262,12,3,2024-04-13,42,Male,Finance,1,Tuition Reimbursement,Graduate Degree,824.53,3,Helps but limited in scope.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9827,3106,25,0,2023-11-23,33,Male,IT,12,Life Insurance,Supplemental Standard,774.91,1,Disappointing service.
9828,2588,28,5,2024-07-23,32,Male,IT,5,Life Insurance,Dependent Coverage,165.54,4,"Solid plan, happy with it."
9829,533,28,2,2024-01-23,56,Female,Finance,34,Life Insurance,Dependent Coverage,165.54,2,Coverage is minimal.
9830,4288,20,4,2024-04-30,49,Female,Finance,13,Tuition Reimbursement,Undergraduate Degree,489.96,2,Complicated reimbursement process.


In [90]:
#Create derived features

# Age bands. pd.cut intervals are right-closed by default, so for whole-number
# ages these are 18-25, 26-41, 42-57 and 58+.
merged['age_group'] = pd.cut(merged['Age'], bins=[17, 25, 41, 57, np.inf], labels=['Gen Z', 'Millennial', 'Gen X', 'Boomer'])

# Tenure bands. Left-closed intervals are used so each bin matches its label for
# whole-year tenure: [0, 5) -> '<5', [5, 16) -> '5-15', [16, inf) -> '>15'.
# With the previous right-closed edges [0, 5, 15] a tenure of exactly 5 was
# labelled '<5' and a tenure of 0 fell outside every bin (NaN).
merged['tenure_group'] = pd.cut(merged['Tenure'], bins=[0, 5, 16, np.inf], right=False, labels=['<5', '5-15', '>15'])

# One-hot encode the benefit sub-type. Any subcat_* columns left from an earlier
# run of this cell are dropped first, so re-executing it does not append a
# second copy of every indicator column.
subcat_dummies = pd.get_dummies(merged['BenefitSubType'], prefix='subcat')
merged = merged.loc[:, ~merged.columns.str.startswith('subcat_')]
merged = pd.concat([merged, subcat_dummies], axis=1)

In [96]:
# List every column name to confirm the derived and one-hot columns were added.
merged.columns

Index(['EmployeeID', 'BenefitID', 'UsageFrequency', 'LastUsedDate', 'Age',
       'Gender', 'Department', 'Tenure', 'BenefitType', 'BenefitSubType',
       'BenefitCost', 'SatisfactionScore', 'Comments', 'age_group',
       'tenure_group', 'subcat_401k Basic Matching',
       'subcat_401k Catch-Up Contributions', 'subcat_401k High Contribution',
       'subcat_401k Investment Fees', 'subcat_401k Maximum Matching',
       'subcat_401k Standard Matching', 'subcat_After-School Care',
       'subcat_Basic Coverage', 'subcat_Conference Attendance',
       'subcat_Dependent Coverage', 'subcat_Family Membership',
       'subcat_Graduate Degree', 'subcat_HDHP Individual', 'subcat_HMO Family',
       'subcat_Healthcare FSA', 'subcat_Individual Courses',
       'subcat_Monthly Communications', 'subcat_Monthly Internet Allowance',
       'subcat_On-Site Infant Care', 'subcat_PPO Family',
       'subcat_PPO Individual', 'subcat_Premium Discount Tier 1',
       'subcat_Professional Certification', 

In [109]:
#Data Validation
# 1. Keep only the first row for each (EmployeeID, BenefitID) pair.
# 2. Keep rows that satisfy all of the range checks:
#      - UsageFrequency is not negative
#      - SatisfactionScore lies in the inclusive range 1..5
#      - Age is at least 18 (adult employees only)
merged = (
    merged
    .drop_duplicates(subset=['EmployeeID', 'BenefitID'], keep='first')
    .loc[lambda frame: frame['UsageFrequency'].ge(0)
         & frame['SatisfactionScore'].between(1, 5)
         & frame['Age'].ge(18)]
)

In [111]:
# Write the validated frame to CSV, leaving out the row index.
merged.to_csv('data/cleaned_data.csv', index=False)

# Re-profile the cleaned data. profile_dataset returns the frame, and because this
# call is the last expression in the cell the frame is rendered after the report.
profile_dataset(merged)

Dataset shape: 7626 rows × 45 columns

--- Data Types ---
EmployeeID                                     int64
BenefitID                                      int64
UsageFrequency                                 int64
LastUsedDate                          datetime64[ns]
Age                                            int64
Gender                                      category
Department                                  category
Tenure                                         int64
BenefitType                                   object
BenefitSubType                                object
BenefitCost                                  float64
SatisfactionScore                              int64
Comments                                      object
age_group                                   category
tenure_group                                category
subcat_401k Basic Matching                      bool
subcat_401k Catch-Up Contributions              bool
subcat_401k High Contribution            

Unnamed: 0,EmployeeID,BenefitID,UsageFrequency,LastUsedDate,Age,Gender,Department,Tenure,BenefitType,BenefitSubType,...,subcat_PPO Individual,subcat_Premium Discount Tier 1,subcat_Professional Certification,subcat_Supplemental High Amount,subcat_Supplemental Standard,subcat_Tier 1 Partners,subcat_Tier 2 Partners,subcat_Tier 3 Partners,subcat_Transit Subsidy,subcat_Undergraduate Degree
0,220,20,4,2024-05-03,64,Male,HR,35,Tuition Reimbursement,Undergraduate Degree,...,False,False,False,False,False,False,False,False,False,True
1,1820,26,1,2024-02-08,53,Male,Finance,2,Gym Membership,Family Membership,...,False,False,False,False,False,False,False,False,False,False
2,285,16,2,2023-10-27,64,Male,Marketing,35,Health Insurance,HDHP Individual,...,False,False,False,False,False,False,False,False,False,False
3,4536,8,8,2024-07-03,32,Female,Sales,10,Wellness Programs,Premium Discount Tier 1,...,False,True,False,False,False,False,False,False,False,False
4,1262,12,3,2024-04-13,42,Male,Finance,1,Tuition Reimbursement,Graduate Degree,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9827,3106,25,0,2023-11-23,33,Male,IT,12,Life Insurance,Supplemental Standard,...,False,False,False,False,True,False,False,False,False,False
9828,2588,28,5,2024-07-23,32,Male,IT,5,Life Insurance,Dependent Coverage,...,False,False,False,False,False,False,False,False,False,False
9829,533,28,2,2024-01-23,56,Female,Finance,34,Life Insurance,Dependent Coverage,...,False,False,False,False,False,False,False,False,False,False
9830,4288,20,4,2024-04-30,49,Female,Finance,13,Tuition Reimbursement,Undergraduate Degree,...,False,False,False,False,False,False,False,False,False,True
