In [4]:
!pip install faker



Collecting faker
  Downloading Faker-25.8.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-25.8.0


In [6]:
from google.colab import files
uploaded = files.upload()

import pandas as pd
import io

# Load the uploaded file into a DataFrame
employees = pd.read_csv(io.BytesIO(uploaded['employees.csv']))


Saving employees.csv to employees.csv


In [7]:
print(employees.head())

  First Name Last Name                              Email  \
0       Jose     Lopez     joselopez0944@slingacademy.com   
1      Diane    Carter   dianecarter1228@slingacademy.com   
2      Shawn    Foster   shawnfoster2695@slingacademy.com   
3     Brenda    Fisher  brendafisher3185@slingacademy.com   
4       Sean    Hunter    seanhunter4753@slingacademy.com   

                  Phone  Gender  Age                  Job Title  \
0  +1-971-533-4552x1542    male   25            Project Manager   
1          881.633.0107  female   26  Machine Learning Engineer   
2  001-966-861-0065x493    male   37            Project Manager   
3      001-574-564-4648  female   31              Web Developer   
4            5838355842    male   35            Project Manager   

   Years Of Experience  Salary Department  
0                    1    8500    Product  
1                    2    7000    Product  
2                   14   17000    Product  
3                    8   10000    Product  
4         

In [8]:
fake = Faker()

#functions to generate data
def generate_employee_data(num_records):
    data = []
    for employee_id in range(1, num_records + 1):
        gender = random.choice(['Male', 'Female'])
        citizenship = random.choices(
            ['USA', 'India', 'China', 'Canada', 'South Korea', 'Philippines', 'Taiwan', 'Mexico'],
            weights=[0.6, 0.16, 0.11, 0.05, 0.03, 0.025, 0.015, 0.01],
            k=1
        )[0]
        languages = random.sample(['Spanish', 'French', 'Mandarin', 'Hindi', 'Arabic'], k=random.randint(0, 2))
        departments = ['Legal', 'Marketing', 'Administrative', 'Operations', 'Sales', 'Finance', 'I/T', 'Product', 'Human Resource']
        department = random.choices(departments, weights=[0.05, 0.1, 0.1, 0.2, 0.1, 0.05, 0.1, 0.2, 0.1], k=1)[0]
        salary = random.randint(50000, 150000)
        ssn = fake.ssn()
        age = random.randint(22, 65)
        years_experience = max(0, age - 21)
        data.append({
            'EmployeeID': employee_id,
            'Name': fake.name(),
            'Gender': gender,
            'Citizenship': citizenship,
            'Languages': ', '.join(languages) if languages else 'None',
            'Department': department,
            'Salary': salary,
            'SSN': ssn,
            'Age': age,
            'Years of Experience': years_experience
        })
    return pd.DataFrame(data)

# Generate synthetic data
synthetic_data = generate_employee_data(10000)

# Save to CSV
synthetic_data.to_csv('new_employees.csv', index=False)

print("Synthetic dataset created and saved to 'new_employees.csv'")

# Download the generated CSV file
files.download('new_employees.csv')

Synthetic dataset created and saved to 'new_employees.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Part C

In [11]:
import pandas as pd

# Load the synthetic data
synthetic_data = pd.read_csv('new_employees.csv')

# 1. Gender Balance Analysis
# Group by department and gender
department_gender_counts = synthetic_data.groupby(['Department', 'Gender']).size().unstack(fill_value=0)

# Display the counts
print("Gender distribution in each department:")
print(department_gender_counts)

# Calculate the hiring needs
print("\nHiring needs to balance gender ratio:")
for department in department_gender_counts.index:
    males = department_gender_counts.loc[department, 'Male']
    females = department_gender_counts.loc[department, 'Female']
    total = males + females
    needed_males = total // 2 - males
    needed_females = total // 2 - females

    if needed_males > 0:
        print(f"Department: {department}, Need to hire {needed_males} more males to balance")
    elif needed_females > 0:
        print(f"Department: {department}, Need to hire {needed_females} more females to balance")
    else:
        print(f"Department: {department} is balanced")

# 2. Total Yearly Payroll
total_payroll = synthetic_data['Salary'].sum()
print(f"\nTotal Yearly Payroll: ${total_payroll}")

# 3. Growth Strategies
print("\nGrowth Strategies other than hiring from non-US countries:")
print("- Acquisitions: Acquire smaller companies to quickly increase workforce and capabilities.")
print("- Remote Workforce: Hire remote employees to expand talent pool without geographical constraints.")
print("- Internship Programs: Establish strong internship and co-op programs to develop future full-time employees.")
print("- Automation and Technology: Invest in automation and technology to improve efficiency and reduce the need for additional employees.")

# 4. Office Space Requirement
num_employees = len(synthetic_data)
office_space_per_employee = 150  # square feet per employee
total_office_space = num_employees * office_space_per_employee
print(f"\nEstimated Office Space Required: {total_office_space} square feet")

# 5. Privacy Preservation
# Load original data
original_data = pd.read_csv('employees.csv')

# Check for any matches between original and synthetic data
matching_rows = synthetic_data[synthetic_data.isin(original_data)].dropna()
print(f"\nNumber of matching rows with original data: {len(matching_rows)}")

# Visual Inspection: Manually inspect a few rows from both datasets
print("\nFirst few rows of the original dataset:")
print(original_data.head())

print("\nFirst few rows of the synthetic dataset:")
print(synthetic_data.head())


Gender distribution in each department:
Gender          Female  Male
Department                  
Administrative     489   499
Finance            257   235
Human Resource     530   501
I/T                504   479
Legal              242   254
Marketing          544   495
Operations         972  1022
Product            980  1012
Sales              490   495

Hiring needs to balance gender ratio:
Department: Administrative, Need to hire 5 more females to balance
Department: Finance, Need to hire 11 more males to balance
Department: Human Resource, Need to hire 14 more males to balance
Department: I/T, Need to hire 12 more males to balance
Department: Legal, Need to hire 6 more females to balance
Department: Marketing, Need to hire 24 more males to balance
Department: Operations, Need to hire 25 more females to balance
Department: Product, Need to hire 16 more females to balance
Department: Sales, Need to hire 2 more females to balance

Total Yearly Payroll: $1001783743

Growth Strategies