# Phase 1: Data Foundation

## 1. Data Quality Assessment

### 1.1 Load the data

In [17]:
import pandas as pd
import numpy as np

benefits_data = pd.read_csv('data/benefits_data.csv')
employee_data = pd.read_csv('data/employee_data.csv')
feedback_data = pd.read_csv('data/feedback_data.csv')
usage_data = pd.read_csv('data/usage_data.csv')


### 1.2 Profile the data to identify issues

In [18]:
#data profiling function

def profile_dataset(df):

    #shape
    print("\n")
    print("="*50)
    print(f"Dataset shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print("="*50)

    #data type info
    print("\n--- Data Types ---")
    print(df.dtypes)

    #missing value
    print("\n--- Missing Values ---")
    print(df.isnull().sum()[df.isnull().sum() > 0])

    #numerical data summ
    print("\n--- Numeric Columns Summary ---")
    print(df.describe().T)

    #outlier checks
    print("\n--- Outlier Check Using IQR ---")
    num_cols = df.select_dtypes(include=[np.number]).columns
    outliers = {}
    for col in num_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outlier_count = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        if outlier_count > 0:
            print(f"{col}: {outliers} potential outliers")
        else:
            print(f"{col}: No significant outliers detected")

    #categorical data summ
    print("\n--- Categorical Columns Summary ---")
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        print(f"\nColumn: {col}")
        print(df[col].value_counts(dropna=False).head(10))  # top 10 frequent

    #duplicates check
    print("\n--- Duplicate Rows ---")
    dup_count = df.duplicated().sum()
    print(f"Number of duplicate rows: {dup_count}")

    #column cardinality
    print("\n--- Cardinality of Columns ---")
    for col in df.columns:
        print(f"{col}: {df[col].nunique()} unique values")

    return df

benefits_df = profile_dataset(benefits_data)
employee_df = profile_dataset(employee_data)
feedback_df = profile_dataset(feedback_data)
usage_df = profile_dataset(usage_data)

#view output as scrollable element to see all results





Dataset shape: 30 rows × 4 columns

--- Data Types ---
BenefitID           int64
BenefitType        object
BenefitSubType     object
BenefitCost       float64
dtype: object

--- Missing Values ---
Series([], dtype: int64)

--- Numeric Columns Summary ---
             count     mean         std   min     25%      50%       75%  \
BenefitID     30.0   15.500    8.803408   1.0    8.25   15.500   22.7500   
BenefitCost   30.0  523.836  302.652678  65.0  277.33  514.655  779.3125   

                max  
BenefitID     30.00  
BenefitCost  969.28  

--- Outlier Check Using IQR ---
BenefitID: No significant outliers detected
BenefitCost: No significant outliers detected

--- Categorical Columns Summary ---

Column: BenefitType
BenefitType
Retirement Plan              6
Health Insurance             4
Tuition Reimbursement        4
Gym Membership               4
Life Insurance               4
Childcare                    2
Commuter Benefits            1
Technology Stipend           1
Flexibl

### 1.3 Findings and Solutions

For any duplicate values, it would be best to merge the datasets and then drop duplicates to make sure there are only the unique rows based off of both BenefitsID and EmployeeID. 

For any numerical missing values, we can replace them with the median or mean, depending on the value within that column.

For any categorical missing values, we can replace them with the most frequenctly observed value within that column.

### 1.4 Define Validation Rules

SatisfactionScore: integers from 1 - 5

LastUsedDate: 2023 - 2025

UsageFrequency: >= 0

Age: >= 18

** Summary **
This preceding code loads and cleans the dataset by checking for any existing issues like missing values, duplicates, and outliers. If and when found, it removes and adjust the values as needed. It also provides potential solutions for existing issues and defines the validation rules for the cleaned dataset.

## 2. Data Integration and Preparation

### 2.1 Merge Datasets using EmployeeID and BenefitsID

In [19]:
usage_employee = pd.merge(usage_data, employee_data, on='EmployeeID', how='inner')
on_ben = pd.merge(usage_employee, benefits_data, on='BenefitID', how='inner')
merged = pd.merge(on_ben, feedback_data, on=['EmployeeID', 'BenefitID'], how='inner')

### 2.2 Handle Missing Values   !!!!!!!!!!!!!!

In [21]:
#replace missing in UsageFrequency with median
merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)

#drop missing comments
merged.dropna(subset=['Comments'], inplace=True)

#replace null categorical values with the most frequently occuring category
for col in merged.select_dtypes(include=['object', 'category']).columns:
    merged[col].fillna(merged[col].mode()[0] , inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged['UsageFrequency'].fillna(merged['UsageFrequency'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged[col].fillna(merged[col].mode()[0] , inplace=True)


### 2.3 Standardize Data Types

In [22]:
#LastUsedDate --> datetime
#Gender/Department

merged['LastUsedDate'] = pd.to_datetime(merged['LastUsedDate'], errors='coerce')
merged['Gender'] = merged['Gender'].astype('category')
merged['Department'] = merged['Department'].astype('category')
merged


Unnamed: 0,EmployeeID,BenefitID,UsageFrequency,LastUsedDate,Age,Gender,Department,Tenure,BenefitType,BenefitSubType,BenefitCost,SatisfactionScore,Comments
0,220,20,4,2024-05-03,64,Male,HR,35,Tuition Reimbursement,Undergraduate Degree,489.96,1,Not worth the hassle.
1,1820,26,1,2024-02-08,53,Male,Finance,2,Gym Membership,Family Membership,519.66,2,Not many locations available.
2,285,16,2,2023-10-27,64,Male,Marketing,35,Health Insurance,HDHP Individual,84.55,3,"Satisfactory, but could improve."
3,4536,8,8,2024-07-03,32,Female,Sales,10,Wellness Programs,Premium Discount Tier 1,125.00,1,Barely any time off given.
4,1262,12,3,2024-04-13,42,Male,Finance,1,Tuition Reimbursement,Graduate Degree,824.53,3,Helps but limited in scope.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9827,3106,25,0,2023-11-23,33,Male,IT,12,Life Insurance,Supplemental Standard,774.91,1,Disappointing service.
9828,2588,28,5,2024-07-23,32,Male,IT,5,Life Insurance,Dependent Coverage,165.54,4,"Solid plan, happy with it."
9829,533,28,2,2024-01-23,56,Female,Finance,34,Life Insurance,Dependent Coverage,165.54,2,Coverage is minimal.
9830,4288,20,4,2024-04-30,49,Female,Finance,13,Tuition Reimbursement,Undergraduate Degree,489.96,2,Complicated reimbursement process.


### 2.4 Create Derived Fields

In [23]:
merged['age_group'] = pd.cut(merged['Age'], bins=[17, 25, 41, 57, np.inf], labels=['Gen Z', 'Millennial', 'Gen X', 'Boomer'])

merged['tenure_group'] = pd.cut(merged['Tenure'], bins=[0, 5, 15, np.inf], labels=['<5', '5-15', '>15'])

subcat_dummies = pd.get_dummies(merged['BenefitSubType'], prefix='subcat')
merged = pd.concat([merged, subcat_dummies], axis=1)

merged.columns

Index(['EmployeeID', 'BenefitID', 'UsageFrequency', 'LastUsedDate', 'Age',
       'Gender', 'Department', 'Tenure', 'BenefitType', 'BenefitSubType',
       'BenefitCost', 'SatisfactionScore', 'Comments', 'age_group',
       'tenure_group', 'subcat_401k Basic Matching',
       'subcat_401k Catch-Up Contributions', 'subcat_401k High Contribution',
       'subcat_401k Investment Fees', 'subcat_401k Maximum Matching',
       'subcat_401k Standard Matching', 'subcat_After-School Care',
       'subcat_Basic Coverage', 'subcat_Conference Attendance',
       'subcat_Dependent Coverage', 'subcat_Family Membership',
       'subcat_Graduate Degree', 'subcat_HDHP Individual', 'subcat_HMO Family',
       'subcat_Healthcare FSA', 'subcat_Individual Courses',
       'subcat_Monthly Communications', 'subcat_Monthly Internet Allowance',
       'subcat_On-Site Infant Care', 'subcat_PPO Family',
       'subcat_PPO Individual', 'subcat_Premium Discount Tier 1',
       'subcat_Professional Certification', 

### 2.5 Validate the Dataset

In [26]:
#drop duplicates based on employee and benefit id
merged = merged.drop_duplicates(subset=['EmployeeID', 'BenefitID'], keep='first')

#no neg usageFrequency values
merged = merged[merged['UsageFrequency'] >= 0]

#satisfactionScore in between 1 and 5
merged = merged[(merged['SatisfactionScore'] >=1) & (merged['SatisfactionScore'] <=5)]

# of age employees
merged = merged[merged['Age'] >= 18]

merged

Unnamed: 0,EmployeeID,BenefitID,UsageFrequency,LastUsedDate,Age,Gender,Department,Tenure,BenefitType,BenefitSubType,...,subcat_PPO Individual,subcat_Premium Discount Tier 1,subcat_Professional Certification,subcat_Supplemental High Amount,subcat_Supplemental Standard,subcat_Tier 1 Partners,subcat_Tier 2 Partners,subcat_Tier 3 Partners,subcat_Transit Subsidy,subcat_Undergraduate Degree
0,220,20,4,2024-05-03,64,Male,HR,35,Tuition Reimbursement,Undergraduate Degree,...,False,False,False,False,False,False,False,False,False,True
1,1820,26,1,2024-02-08,53,Male,Finance,2,Gym Membership,Family Membership,...,False,False,False,False,False,False,False,False,False,False
2,285,16,2,2023-10-27,64,Male,Marketing,35,Health Insurance,HDHP Individual,...,False,False,False,False,False,False,False,False,False,False
3,4536,8,8,2024-07-03,32,Female,Sales,10,Wellness Programs,Premium Discount Tier 1,...,False,True,False,False,False,False,False,False,False,False
4,1262,12,3,2024-04-13,42,Male,Finance,1,Tuition Reimbursement,Graduate Degree,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9827,3106,25,0,2023-11-23,33,Male,IT,12,Life Insurance,Supplemental Standard,...,False,False,False,False,True,False,False,False,False,False
9828,2588,28,5,2024-07-23,32,Male,IT,5,Life Insurance,Dependent Coverage,...,False,False,False,False,False,False,False,False,False,False
9829,533,28,2,2024-01-23,56,Female,Finance,34,Life Insurance,Dependent Coverage,...,False,False,False,False,False,False,False,False,False,False
9830,4288,20,4,2024-04-30,49,Female,Finance,13,Tuition Reimbursement,Undergraduate Degree,...,False,False,False,False,False,False,False,False,False,True


### 2.6 Export Cleaned Data as CSV

In [None]:
merged.to_csv('data/cleaned_data.csv', index=False)