In [33]:
import pandas as pd
import os

# Read the raw EDA export into memory; the cells below build their tables from it.
raw_eda_csv = 'raw_glassdoor_data/eda_data.csv'
eda_df = pd.read_csv(raw_eda_csv)



In [34]:
# Columns of eda_df that describe a company rather than an individual posting.
company_cols = [
    'Company Name', 'Headquarters', 'Size', 'Founded',
    'Type of ownership', 'Industry', 'Sector', 'Revenue',
    'Competitors', 'company_txt', 'Rating'
]

# One row per company.
# De-duplicate on 'company_txt' because that column becomes the published
# 'Company Name' below. De-duplicating on the raw 'Company Name' could leave
# several rows sharing one 'company_txt', which makes any name-based lookup
# against this table ambiguous (a left merge on the name would duplicate rows).
companies_df = (
    eda_df[company_cols]
    .drop_duplicates(subset=['company_txt'])
    .reset_index(drop=True)
)

# Surrogate key: 1-based position after de-duplication, placed as first column.
companies_df.insert(0, 'company_id', companies_df.index + 1)

# Drop the raw name and publish 'company_txt' / 'Rating' under their final labels.
companies_df = (
    companies_df
    .drop(columns=['Company Name'])
    .rename(columns={'company_txt': 'Company Name', 'Rating': 'Company Rating'})
)

# Invariants relied on when other tables look companies up by name or id.
assert companies_df['Company Name'].is_unique, "duplicate company names in companies_df"
assert companies_df['company_id'].is_unique, "duplicate company_id in companies_df"


In [35]:
import pandas as pd

# Keep only the text before the first newline in 'Company Name' (this removes
# the appended rating) and trim surrounding whitespace.
# The vectorised .str accessor leaves a missing name as NaN, whereas calling
# x.split(...) row by row raises AttributeError on a non-string value.
eda_df['clean_company_name'] = (
    eda_df['Company Name'].str.split("\n").str[0].str.strip()
)

# Check for variation in the 'Industry' column: count the distinct values per
# company and keep only companies that have more than one.
industry_variation = (
    eda_df.groupby('clean_company_name')['Industry']
    .nunique()
    .reset_index(name='unique_industries')
)
companies_with_multiple_industries = industry_variation[
    industry_variation['unique_industries'] > 1
]
print("Companies with more than one unique Industry:")
print(companies_with_multiple_industries)

# Same check for the 'Sector' column.
sector_variation = (
    eda_df.groupby('clean_company_name')['Sector']
    .nunique()
    .reset_index(name='unique_sectors')
)
companies_with_multiple_sectors = sector_variation[
    sector_variation['unique_sectors'] > 1
]
print("\nCompanies with more than one unique Sector:")
print(companies_with_multiple_sectors)


Companies with more than one unique Industry:
Empty DataFrame
Columns: [clean_company_name, unique_industries]
Index: []

Companies with more than one unique Sector:
Empty DataFrame
Columns: [clean_company_name, unique_sectors]
Index: []


In [36]:
# Columns of eda_df that describe an individual job posting.
job_cols = [
    'Job Title', 'Salary Estimate', 'Job Description',
    'hourly', 'employer_provided',
    'min_salary', 'max_salary', 'avg_salary', 'age',
    'python_yn', 'R_yn', 'spark', 'aws', 'excel',
    'job_simp', 'seniority', 'num_comp'
]

# Start the jobs table from the posting-level columns, plus the company name
# ('company_txt') that serves as the temporary join key to companies_df.
jobs_df = eda_df[job_cols].copy()
jobs_df['Company Name'] = eda_df['company_txt']

# Name -> company_id lookup. It is reduced to one row per name so the left
# merge below can never fan a single posting out into several rows, even if
# companies_df happens to contain a repeated 'Company Name'.
company_lookup = (
    companies_df[['company_id', 'Company Name']]
    .drop_duplicates(subset='Company Name')
)

# Attach company_id. validate='many_to_one' makes pandas raise if the lookup
# side is not unique on the key, and the assert confirms no posting was
# duplicated or lost by the join.
n_jobs_before_merge = len(jobs_df)
jobs_df = jobs_df.merge(
    company_lookup,
    on='Company Name',
    how='left',
    validate='many_to_one',
)
assert len(jobs_df) == n_jobs_before_merge, "merge changed the number of job rows"

# The name was only needed as a join key; company_id now identifies the company.
jobs_df = jobs_df.drop(columns=['Company Name'])

# Surrogate key: 1-based row number, placed as the first column.
jobs_df.insert(0, 'job_id', range(1, len(jobs_df) + 1))

In [37]:

# 'Salary Estimate' is not carried forward; only the numeric salary columns are kept.
jobs_df = jobs_df.drop(columns=['Salary Estimate'])

# Row selectors for the two pay bases. Rows whose 'hourly' value is neither
# 1 nor 0 match neither mask and are left untouched.
hourly_mask = jobs_df['hourly'] == 1
non_hourly_mask = jobs_df['hourly'] == 0

# NOTE(review): the scaling below assumes hourly rows hold dollars per hour and
# all other rows hold thousands of dollars per year -- confirm against the
# upstream cleaning that produced eda_data.csv.
HOURS_PER_YEAR = 40 * 52  # 40 hours/week * 52 weeks = 2,080
THOUSANDS_TO_DOLLARS = 1000

for salary_col in ('min_salary', 'max_salary', 'avg_salary'):
    # Hourly rows: scale by the number of working hours in a year.
    jobs_df.loc[hourly_mask, salary_col] = (
        jobs_df.loc[hourly_mask, salary_col] * HOURS_PER_YEAR
    )
    # Non-hourly rows: scale by 1,000.
    jobs_df.loc[non_hourly_mask, salary_col] = (
        jobs_df.loc[non_hourly_mask, salary_col] * THOUSANDS_TO_DOLLARS
    )

# The pay-basis flag has served its purpose once the columns share one scale.
jobs_df = jobs_df.drop(columns=['hourly'])



In [38]:
# Swap the terse tool column names for capitalised labels.
tool_column_labels = {
    'python_yn': 'Python',
    'R_yn': 'R',
    'spark': 'Spark',
    'aws': 'AWS',
    'excel': 'Excel',
}
jobs_df = jobs_df.rename(columns=tool_column_labels)

# Preview the first rows to confirm the new headers.
print(jobs_df.head())


   job_id                  Job Title  \
0       1             Data Scientist   
1       2  Healthcare Data Scientist   
2       3             Data Scientist   
3       4             Data Scientist   
4       5             Data Scientist   

                                     Job Description  employer_provided  \
0  Data Scientist\nLocation: Albuquerque, NM\nEdu...                  0   
1  What You Will Do:\n\nI. General Summary\n\nThe...                  0   
2  KnowBe4, Inc. is a high growth information sec...                  0   
3  *Organization and Job ID**\nJob ID: 310709\n\n...                  0   
4  Data Scientist\nAffinity Solutions / Marketing...                  0   

   min_salary  max_salary  avg_salary  age  Python  R  Spark  AWS  Excel  \
0       53000       91000     72000.0   47       1  0      0    0      1   
1       63000      112000     87500.0   36       1  0      0    0      0   
2       80000       90000     85000.0   10       1  0      1    0      1   
3   

In [39]:
# Multiplier applied to every salary figure (value supplied by the author's
# own calculation, which is not shown here).
inflation_factor = 1.314

# Scale each salary column by the factor.
# Caution: this overwrites the columns, so running the cell a second time
# without rebuilding jobs_df applies the factor twice.
for salary_field in ('min_salary', 'max_salary', 'avg_salary'):
    jobs_df[salary_field] = jobs_df[salary_field].mul(inflation_factor)


In [40]:
# Final column layout for the jobs table.
# Selecting by this list also discards any column not named here -- for
# example 'job_simp', which is part of job_cols but absent from this list.
desired_order = [
    'job_id', 'Job Title', 'Job Description', 'seniority', 'company_id', 'age',
    'Python', 'R', 'Spark', 'AWS', 'Excel',
    'min_salary', 'max_salary', 'avg_salary',
    'employer_provided', 'num_comp',
]

# Apply the layout; a KeyError here means an expected column is missing.
jobs_df = jobs_df.loc[:, desired_order]

# Preview the first rows in the new column order.
print(jobs_df.head())


   job_id                  Job Title  \
0       1             Data Scientist   
1       2  Healthcare Data Scientist   
2       3             Data Scientist   
3       4             Data Scientist   
4       5             Data Scientist   

                                     Job Description seniority  company_id  \
0  Data Scientist\nLocation: Albuquerque, NM\nEdu...        na           1   
1  What You Will Do:\n\nI. General Summary\n\nThe...        na           2   
2  KnowBe4, Inc. is a high growth information sec...        na           3   
3  *Organization and Job ID**\nJob ID: 310709\n\n...        na           4   
4  Data Scientist\nAffinity Solutions / Marketing...        na           5   

   age  Python  R  Spark  AWS  Excel  min_salary  max_salary  avg_salary  \
0   47       1  0      0    0      1     69642.0    119574.0     94608.0   
1   36       1  0      0    0      0     82782.0    147168.0    114975.0   
2   10       1  0      1    0      1    105120.0    118260.0  

In [41]:
# Write both tables to CSV in the working directory, omitting the index column.
for table, csv_name in (
    (jobs_df, "glassdoor_jobs.csv"),
    (companies_df, "glassdoor_companies.csv"),
):
    table.to_csv(csv_name, index=False)
