In [None]:
import pandas as pd
import os

# Source table for the whole notebook; every later cell derives from `eda`.
EDA_CSV_PATH = 'raw_glassdoor_data/eda_data.csv'
eda = pd.read_csv(EDA_CSV_PATH)



In [None]:
# Columns of `eda` that describe the employer rather than the individual posting.
company_cols = [
    'Company Name', 'Headquarters', 'Size', 'Founded',
    'Type of ownership', 'Industry', 'Sector', 'Revenue',
    'Competitors', 'company_txt', 'Rating'
]

# Build a one-row-per-company dimension table.
# De-duplicate on `company_txt` (the name that is kept and later used as the
# join key), not on the raw 'Company Name': if the same cleaned name appears
# under two different raw names, de-duplicating on the raw column would yield
# two company_ids for one company and duplicate job rows in the later merge.
companies_df = (
    eda[company_cols]
    .drop_duplicates(subset=['company_txt'])
    .reset_index(drop=True)
)

# Surrogate key starting at 1, placed as the first column.
companies_df.insert(0, 'company_id', range(1, len(companies_df) + 1))

# Keep only the cleaned name, exposed under the friendlier 'Company Name'
# label, and make it explicit that the rating belongs to the company.
companies_df = (
    companies_df
    .drop(columns=['Company Name'])
    .rename(columns={'company_txt': 'Company Name', 'Rating': 'Company Rating'})
)

# The jobs table joins on this column, so it must identify exactly one row.
assert companies_df['Company Name'].is_unique, "duplicate company names in companies_df"


In [None]:
import pandas as pd

def _distinct_values_per_company(frame, column, count_name):
    """Count the distinct values of `column` for each clean company name.

    Returns a DataFrame with the columns 'clean_company_name' and
    `count_name`, one row per company present in `frame`.
    """
    return (
        frame.groupby('clean_company_name')[column]
        .nunique()
        .reset_index(name=count_name)
    )


#remove any appended ratings: keep only the text before the first newline.
#the .str accessor leaves missing names as NaN instead of raising on them.
eda['clean_company_name'] = eda['Company Name'].str.split("\n").str[0].str.strip()

#variations in industry column, can one company have multiple industries?
industries = _distinct_values_per_company(eda, 'Industry', 'unique_industries')
companies_with_many_industries = industries[industries['unique_industries'] > 1]
print("Companies with more than one unique Industry:")
print(companies_with_many_industries)

#can one company have multiple sectors?
sectors = _distinct_values_per_company(eda, 'Sector', 'unique_sectors')
companies_with_many_sectors = sectors[sectors['unique_sectors'] > 1]
print("\nCompanies with more than one unique Sector:")
print(companies_with_many_sectors)


Companies with more than one unique Industry:
Empty DataFrame
Columns: [clean_company_name, unique_industries]
Index: []

Companies with more than one unique Sector:
Empty DataFrame
Columns: [clean_company_name, unique_sectors]
Index: []


In [None]:
# Posting-level columns carried over from `eda` into the jobs table.
columns = [
    'Job Title', 'Salary Estimate', 'Job Description',
    'hourly', 'employer_provided',
    'min_salary', 'max_salary', 'avg_salary', 'age',
    'python_yn', 'R_yn', 'spark', 'aws', 'excel',
    'job_simp', 'seniority', 'num_comp'
]

jobs_df = eda[columns].copy()

# Temporary join key: the cleaned company name, which is what companies_df
# stores in its 'Company Name' column.
jobs_df['Company Name'] = eda['company_txt']

# Look up company_id for every job. The lookup is reduced to one row per name
# and the merge is validated as many-to-one, so a repeated name in
# companies_df can never silently multiply job rows.
company_lookup = (
    companies_df[['company_id', 'Company Name']]
    .drop_duplicates(subset=['Company Name'])
)
jobs_df = jobs_df.merge(
    company_lookup,
    on='Company Name',
    how='left',
    validate='many_to_one',
)
assert len(jobs_df) == len(eda), "merge changed the number of job rows"

# The name was only needed for the join; company_id is the foreign key.
jobs_df = jobs_df.drop(columns=['Company Name'])

#create job id starting from 1
jobs_df['job_id'] = range(1, len(jobs_df) + 1)

#rearranging columns to have job_id as the first column
cols = ['job_id'] + [col for col in jobs_df.columns if col != 'job_id']
jobs_df = jobs_df[cols]

In [None]:

# Put every salary figure on the same annual scale, then discard the helper
# columns that are no longer needed.
HOURS_PER_YEAR = 40 * 52   # 2,080 working hours: 40 h/week * 52 weeks
THOUSANDS = 1000           # non-hourly figures are stored in thousands (120 -> 120,000)
salary_columns = ['min_salary', 'max_salary', 'avg_salary']

#the salary estimate column is not used from here on
jobs_df = jobs_df.drop(columns=['Salary Estimate'])

hourly = jobs_df['hourly'] == 1
others = jobs_df['hourly'] == 0

# Hourly rows are scaled from an hourly rate to a yearly amount; all other rows
# are converted from thousands to whole currency units. Rows whose flag is
# neither 0 nor 1 are left untouched.
# NOTE(review): this assumes hourly rows still hold raw hourly rates in the
# input file; if an upstream step already annualised them this would
# double-convert -- confirm against the data.
for row_mask, factor in ((hourly, HOURS_PER_YEAR), (others, THOUSANDS)):
    for salary_col in salary_columns:
        jobs_df.loc[row_mask, salary_col] = jobs_df.loc[row_mask, salary_col] * factor

#the hourly flag has served its purpose once everything is annual
jobs_df = jobs_df.drop(columns=['hourly'])



In [None]:
# Give the skill indicator columns readable names.
skill_labels = {
    'python_yn': 'Python',
    'R_yn': 'R',
    'spark': 'Spark',
    'aws': 'AWS',
    'excel': 'Excel',
}
jobs_df = jobs_df.rename(columns=skill_labels)




   job_id                  Job Title  \
0       1             Data Scientist   
1       2  Healthcare Data Scientist   
2       3             Data Scientist   
3       4             Data Scientist   
4       5             Data Scientist   

                                     Job Description  employer_provided  \
0  Data Scientist\nLocation: Albuquerque, NM\nEdu...                  0   
1  What You Will Do:\n\nI. General Summary\n\nThe...                  0   
2  KnowBe4, Inc. is a high growth information sec...                  0   
3  *Organization and Job ID**\nJob ID: 310709\n\n...                  0   
4  Data Scientist\nAffinity Solutions / Marketing...                  0   

   min_salary  max_salary  avg_salary  age  Python  R  Spark  AWS  Excel  \
0       53000       91000     72000.0   47       1  0      0    0      1   
1       63000      112000     87500.0   36       1  0      0    0      0   
2       80000       90000     85000.0   10       1  0      1    0      1   
3   

In [None]:
# Multiplier used to express the salaries in 2025 terms (2017 -> 2025).
inflation_factor = 1.314

# Scale each of the three salary columns by the same factor.
for salary_col in ('min_salary', 'max_salary', 'avg_salary'):
    jobs_df[salary_col] = jobs_df[salary_col].mul(inflation_factor)


In [None]:

# Final column layout of the jobs table: identifiers and descriptive fields
# first, then skill flags, then salary figures.
# NOTE(review): 'job_simp' was selected when jobs_df was built but is not
# listed here, so it is dropped by this selection -- confirm that is intended.
columns_order = [
    'job_id', 'Job Title', 'Job Description', 'seniority', 'company_id', 'age',
    'Python', 'R', 'Spark', 'AWS', 'Excel',
    'min_salary', 'max_salary', 'avg_salary',
    'employer_provided', 'num_comp',
]

# Keep exactly these columns, in this order (a missing name raises KeyError).
jobs_df = jobs_df.loc[:, columns_order]



   job_id                  Job Title  \
0       1             Data Scientist   
1       2  Healthcare Data Scientist   
2       3             Data Scientist   
3       4             Data Scientist   
4       5             Data Scientist   

                                     Job Description seniority  company_id  \
0  Data Scientist\nLocation: Albuquerque, NM\nEdu...        na           1   
1  What You Will Do:\n\nI. General Summary\n\nThe...        na           2   
2  KnowBe4, Inc. is a high growth information sec...        na           3   
3  *Organization and Job ID**\nJob ID: 310709\n\n...        na           4   
4  Data Scientist\nAffinity Solutions / Marketing...        na           5   

   age  Python  R  Spark  AWS  Excel  min_salary  max_salary  avg_salary  \
0   47       1  0      0    0      1     69642.0    119574.0     94608.0   
1   36       1  0      0    0      0     82782.0    147168.0    114975.0   
2   10       1  0      1    0      1    105120.0    118260.0  

In [None]:
# Write both tables next to the notebook, without the pandas index column.
for frame, out_path in (
    (jobs_df, "glassdoor_jobs.csv"),
    (companies_df, "glassdoor_companies.csv"),
):
    frame.to_csv(out_path, index=False)
