# In [5]:
import pandas as pd
import faker
import random

# Faker instance used as the source of every dummy value below
fake = faker.Faker()

# Number of employee records to generate
num_rows = 5000

# strftime pattern shared by all date columns (year-month-day)
_DATE_FORMAT = '%Y-%m-%d'

# Column name -> zero-argument callable that produces ONE cell value.
# Insertion order of this mapping fixes the column order of the DataFrame.
_column_makers = {
    'Employee ID': fake.uuid4,
    'First Name': fake.first_name,
    'Middle Name': fake.first_name,
    'Last Name': fake.last_name,
    'Date of Birth': lambda: fake.date_of_birth(minimum_age=18, maximum_age=65).strftime(_DATE_FORMAT),
    'Gender': lambda: fake.random_element(elements=('Male', 'Female', 'Other')),
    'Email Address': fake.email,
    'Phone Number': fake.phone_number,
    'Address': fake.address,
    'City': fake.city,
    'State/Province': fake.state,
    'Zip Code/Postal Code': fake.zipcode,
    'Country': fake.country,
    'Job Title': fake.job,
    'Department': lambda: fake.random_element(elements=('IT', 'HR', 'Finance', 'Marketing')),
    'Manager ID': fake.uuid4,
    'Hire Date': lambda: fake.date_this_decade().strftime(_DATE_FORMAT),
    # Only rows where fake.boolean(chance_of_getting_true=20) comes up true
    # get a termination date; all other rows hold None.
    'Termination Date': lambda: (
        fake.date_between(start_date='-1y', end_date='today').strftime(_DATE_FORMAT)
        if fake.boolean(chance_of_getting_true=20)
        else None
    ),
    'Salary': lambda: fake.random_number(digits=5),
    'Hourly Rate': lambda: fake.random_number(digits=2),
    'Contract Type': lambda: fake.random_element(elements=('Full-time', 'Part-time', 'Contract')),
    'Contract Start Date': lambda: fake.date_this_decade().strftime(_DATE_FORMAT),
    'Contract End Date': lambda: fake.date_between(start_date='today', end_date='+2y').strftime(_DATE_FORMAT),
    'Skills': lambda: fake.words(nb=3),
    'Certifications': lambda: fake.random_element(elements=('CCNA', 'AWS', 'Microsoft Certified', 'Cisco Certified')),
    'Education': lambda: fake.random_element(elements=('High School', 'Bachelor', 'Master', 'PhD')),
}

# Fill one column at a time, calling its maker once per row
data = {
    column: [make_value() for _ in range(num_rows)]
    for column, make_value in _column_makers.items()
}

# Build the DataFrame from the generated columns
df = pd.DataFrame(data)

def exchange_emails_phone_numbers(df: pd.DataFrame, num_rows_to_exchange: int) -> None:
    """Swap 'Email Address' and 'Phone Number' in randomly chosen rows, in place.

    Rows are picked by position from the DataFrame that is passed in, so the
    function works for a frame of any length and with any index, rather than
    relying on a module-level row count.

    Args:
        df: DataFrame with 'Email Address' and 'Phone Number' columns. It is
            modified in place.
        num_rows_to_exchange: Number of distinct rows whose two values are
            swapped.

    Raises:
        ValueError: If ``num_rows_to_exchange`` is negative or larger than
            ``len(df)`` (raised by ``random.sample``).
        KeyError: If either column is missing from ``df``.
    """
    # Sample row positions (not labels) so a non-default index is handled too
    positions = random.sample(range(len(df)), num_rows_to_exchange)
    email_col = df.columns.get_loc('Email Address')
    phone_col = df.columns.get_loc('Phone Number')
    for pos in positions:
        # Both right-hand values are read before either cell is written
        df.iat[pos, email_col], df.iat[pos, phone_col] = (
            df.iat[pos, phone_col],
            df.iat[pos, email_col],
        )

def combine_state_zip(df: pd.DataFrame, num_rows_to_combine: int) -> None:
    """Append the zip code to the state in randomly chosen rows, in place.

    For each chosen row, 'State/Province' becomes ``"<state> <zip>"`` and
    'Zip Code/Postal Code' is set to an empty string. Rows are picked by
    position from the DataFrame that is passed in, so the function works for
    a frame of any length and with any index.

    Args:
        df: DataFrame with 'State/Province' and 'Zip Code/Postal Code'
            columns holding strings. It is modified in place.
        num_rows_to_combine: Number of distinct rows to alter.

    Raises:
        ValueError: If ``num_rows_to_combine`` is negative or larger than
            ``len(df)`` (raised by ``random.sample``).
        KeyError: If either column is missing from ``df``.
    """
    # Sample row positions (not labels) so a non-default index is handled too
    positions = random.sample(range(len(df)), num_rows_to_combine)
    state_col = df.columns.get_loc('State/Province')
    zip_col = df.columns.get_loc('Zip Code/Postal Code')
    for pos in positions:
        df.iat[pos, state_col] = df.iat[pos, state_col] + ' ' + df.iat[pos, zip_col]
        df.iat[pos, zip_col] = ''

def combine_middle_name_first_name(df: pd.DataFrame, num_rows_to_combine: int) -> None:
    """Append the middle name to the first name in randomly chosen rows, in place.

    For each chosen row, 'First Name' becomes ``"<first> <middle>"`` and
    'Middle Name' is set to an empty string. Rows are picked by position from
    the DataFrame that is passed in, so the function works for a frame of any
    length and with any index.

    Args:
        df: DataFrame with 'First Name' and 'Middle Name' columns holding
            strings. It is modified in place.
        num_rows_to_combine: Number of distinct rows to alter.

    Raises:
        ValueError: If ``num_rows_to_combine`` is negative or larger than
            ``len(df)`` (raised by ``random.sample``).
        KeyError: If either column is missing from ``df``.
    """
    # Sample row positions (not labels) so a non-default index is handled too
    positions = random.sample(range(len(df)), num_rows_to_combine)
    first_col = df.columns.get_loc('First Name')
    middle_col = df.columns.get_loc('Middle Name')
    for pos in positions:
        # Fold the middle name into the first name, then blank the middle name
        df.iat[pos, first_col] = df.iat[pos, first_col] + ' ' + df.iat[pos, middle_col]
        df.iat[pos, middle_col] = ''

# Deliberately introduce data-quality problems into the generated records.
# Each call below draws its own random sample of rows and edits df in place,
# so a single row may be affected by more than one of these steps.

# Merge the middle name into the first name in 3193 randomly chosen rows
combine_middle_name_first_name(df, 3193)

# Swap email address and phone number in 1785 randomly chosen rows
exchange_emails_phone_numbers(df, 1785)

# Append the zip code to the state (and blank the zip) in 1027 randomly chosen rows
combine_state_zip(df, 1027)

# Save DataFrame to CSV file (index=False: the row index is not written)
df.to_csv('IT_staffing_dataset_updated.csv', index=False)
