# Code: Data Simulation
## Sasha Lawson

### Section 1: Setup

In [1]:
#Import statements needed.
import pandas as pd
import numpy as np
import random

In [2]:
#Create a function to simulate data across various columns. "rows" being the number of rows.
def simulate_data(rows=5000, seed=None):
    """Build a DataFrame of randomly generated employee-satisfaction records.

    Each record gets a 1-based 'Index', one random pick from every
    categorical list defined below, and one random integer score per
    satisfaction category (bounds are inclusive).

    Parameters:
        rows: Number of records to generate. Zero or a negative number gives
            an empty DataFrame that still carries every column.
        seed: Optional seed. When provided, a private random.Random(seed)
            generator is used, so the same seed always reproduces the same
            data and the global "random" state is left untouched. When None
            (the default), values are drawn from the global "random" module.

    Returns:
        A pandas DataFrame with the columns 'Index', 'Company', 'Role',
        'Industry', 'Location', 'Company_Size', 'Overall',
        'Work_Life_Balance', 'Career_Growth', 'Compensation', 'Leadership'
        and 'Colleagues', in that order.
    """

    #Use an isolated generator when a seed is given, otherwise the shared module-level one.
    rng = random if seed is None else random.Random(seed)

    #Define the categorical columns and the values each one may take.
    categorical_options = {
        'Company': ['Company A', 'Company B', 'Company C', 'Company D',
                    'Company E', 'Company F', 'Company G', 'Company H'],
        'Role': ['Software Engineer', 'Project Manager', 'HR Specialist', 'Accountant', 'Marketing Manager',
                 'Customer Support', 'Operations Manager', 'Product Designer', 'Business Analyst',
                 'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Sales Executive', 'Legal Advisor'],
        'Industry': ['Tech', 'Healthcare', 'Finance', 'Education', 'Retail',
                     'Manufacturing', 'Telecommunications', 'Energy'],
        'Location': ['New York', 'San Francisco', 'Austin', 'Remote', 'Seattle', 'Chicago', 'Boston', 'London'],
        'Company_Size': ['1-10', '11-50', '51-250', '251-500', '501+'],
    }

    #Define the inclusive (lowest, highest) score for each satisfaction category.
    score_ranges = {
        'Overall': (50, 100),
        'Work_Life_Balance': (40, 100),
        'Career_Growth': (30, 90),
        'Compensation': (50, 100),
        'Leadership': (40, 90),
        'Colleagues': (60, 100),
    }

    #Fix the column order up front so an empty result still has the full schema.
    columns = ['Index', *categorical_options, *score_ranges]

    #Create the dataset.
    data = []

    #Loop through the number of defined "rows".
    for index in range(1, rows + 1):

        #Draw the scores before the categorical values. This keeps the draw
        #order unchanged, so runs seeded through the global "random" module
        #still produce the same data as before.
        scores = {
            category: rng.randint(lowest, highest)
            for category, (lowest, highest) in score_ranges.items()
        }

        #Assemble the current "row" in the final column order.
        row = {'Index': index}
        row.update({column: rng.choice(options) for column, options in categorical_options.items()})
        row.update(scores)

        #Add the current "row".
        data.append(row)

    return pd.DataFrame(data, columns=columns)

In [3]:
#Create a function to define role descriptions.
def role_descriptions():
    """Return the role lookup table as a two-column DataFrame.

    Returns:
        A pandas DataFrame with one row per role, holding the role name in
        'Role_Title' and a one-sentence summary in 'Role_Description'.
    """

    #List every role title together with its description.
    title_description_pairs = [
        ('Software Engineer', 'Develops, tests, and maintains software applications.'),
        ('Project Manager', 'Oversees projects, ensuring they are completed on time and within scope.'),
        ('HR Specialist', 'Handles recruitment, employee relations, and HR administration.'),
        ('Accountant', 'Manages financial records and ensures compliance with regulations.'),
        ('Marketing Manager', 'Plans and executes marketing strategies to promote products or services.'),
        ('Customer Support', 'Provides assistance to customers, resolves issues, and answers inquiries.'),
        ('Operations Manager', 'Ensures smooth day-to-day operations and process optimization.'),
        ('Product Designer', 'Creates user-centered designs for products, focusing on functionality and aesthetics.'),
        ('Business Analyst', 'Analyzes business processes and provides data-driven insights for decision-making.'),
        ('Data Scientist', 'Analyzes complex data to extract insights and build predictive models.'),
        ('Data Analyst', 'Interprets data to help businesses make informed decisions.'),
        ('Machine Learning Engineer', 'Builds machine learning models and systems that learn from data.'),
        ('Sales Executive', 'Drives sales growth by building relationships and closing deals.'),
        ('Legal Advisor', 'Provides legal advice to ensure compliance with laws and regulations.'),
    ]

    #Turn the pairs into a dataframe with one named column per element.
    column_names = ['Role_Title', 'Role_Description']
    return pd.DataFrame(title_description_pairs, columns=column_names)

In [4]:
#Create a function to simulate benchmark data based on the passed in dataframe.
def simulate_benchmark(df):
    """Compute the per-role mean of every satisfaction score.

    Parameters:
        df: DataFrame containing a 'Role' column plus the six score columns
            'Overall', 'Work_Life_Balance', 'Career_Growth', 'Compensation',
            'Leadership' and 'Colleagues'. Any other columns are ignored.

    Returns:
        A new DataFrame with one row per distinct role. Every column,
        including the role column itself, carries a '_Benchmark' suffix
        (for example 'Role_Benchmark' and 'Overall_Benchmark').
    """

    #Name the score categories that get averaged.
    score_columns = ['Overall', 'Work_Life_Balance', 'Career_Growth', 'Compensation', 'Leadership', 'Colleagues']

    #Keep only the role and its scores, then average the scores within each role.
    role_and_scores = df[['Role', *score_columns]]
    role_means = role_and_scores.groupby(['Role']).mean().reset_index()

    #Map every resulting column to its "[column]_Benchmark" name.
    suffixed_names = {name: f'{name}_Benchmark' for name in role_means.columns}

    return role_means.rename(columns=suffixed_names)

### Section 2: Implementation

In [5]:
#Build the three datasets: simulated records, per-role benchmarks and role descriptions.
simulated_data = simulate_data()
benchmark_data = simulate_benchmark(simulated_data)
role_data = role_descriptions()


#Export each dataset to its own Excel workbook, without the dataframe index.
for dataset, workbook_name in (
    (simulated_data, 'Simulated_Data.xlsx'),   #Simulated data.
    (role_data, 'Role_Data.xlsx'),             #Role description data.
    (benchmark_data, 'Benchmark_Data.xlsx'),   #Benchmark data.
):
    dataset.to_excel(workbook_name, index=False, engine='openpyxl')