In [None]:
import pandas as pd
import kagglehub
import os


In [None]:
# Raw company-level tables. Paths are relative to the notebook's working directory.
companies = pd.read_csv('raw_linkedin_data/companies/companies.csv')
employee_counts = pd.read_csv('raw_linkedin_data/companies/employee_counts.csv')
industries = pd.read_csv('raw_linkedin_data/companies/company_industries.csv')
specialities = pd.read_csv('raw_linkedin_data/companies/company_specialities.csv')
# Industry lookup table from the mappings folder; a later cell joins it to
# `industries` on the industry name to obtain industry_id.
industries_ids = pd.read_csv('raw_linkedin_data/mappings/industries.csv')

In [None]:
# Quick uniqueness check on company_id in each company-level table.
# A table that reports True holds several rows for at least one company and
# has to be collapsed to one row per company before it can be joined safely.

def _has_duplicate_company_ids(frame):
    """Return True when any company_id value occurs more than once in frame."""
    return frame['company_id'].duplicated().any()


companies_has_duplicates = _has_duplicate_company_ids(companies)
print("Duplicates present in companies.csv:", companies_has_duplicates)

employee_counts_has_duplicates = _has_duplicate_company_ids(employee_counts)
print("Duplicates present in employee_counts.csv:", employee_counts_has_duplicates)

industry_has_duplicates = _has_duplicate_company_ids(industries)
print("Duplicates present in company_industries.csv:", industry_has_duplicates)

specialities_has_duplicates = _has_duplicate_company_ids(specialities)
print("Duplicates present in company_specialities.csv:", specialities_has_duplicates)


In [None]:
# Keep only the most recent employee-count snapshot per company.
# NOTE(review): pd.to_datetime interprets bare numbers as nanoseconds since the
# epoch unless `unit=` is passed. If time_recorded is stored as a numeric Unix
# timestamp the ordering is still right but the dates themselves are not - confirm
# the column format.
employee_counts['time_recorded'] = pd.to_datetime(employee_counts['time_recorded'])
# Sort newest-first within each company so the first row of every company is the latest one.
sorted_employee_counts = employee_counts.sort_values(['company_id', 'time_recorded'], ascending=[True, False])
# keep='first' retains exactly that newest row and discards the older snapshots.
unique_employee_counts = sorted_employee_counts.drop_duplicates(subset='company_id', keep='first')



In [None]:
# Sanity check for the previous step: every row kept in unique_employee_counts
# should carry the latest time_recorded seen for its company.
max_dates = (
    employee_counts
    .groupby('company_id')['time_recorded']
    .max()
    .rename('max_date')
    .reset_index()
)
merged = unique_employee_counts.merge(max_dates, on='company_id')
merged['is_most_recent'] = merged['time_recorded'].eq(merged['max_date'])
all_recent = merged['is_most_recent'].all()
print("All records are the most recent:", all_recent)

In [None]:
# Attach the numeric industry_id to every (company, industry name) row by
# matching the industry name against the lookup table. The left join keeps
# company rows whose industry name has no match; their industry_id is missing.
industries_with_ids = industries.merge(
    industries_ids,
    left_on='industry',
    right_on='industry_name',
    how='left'
)

# Only the id is needed from here on, so drop both name columns.
industries_with_ids = industries_with_ids.drop(columns=['industry', 'industry_name'])

# Store the ids as integers while keeping missing values.
# An element-wise int() conversion does not achieve this: as soon as one NaN is
# present pandas re-infers the result as float64, so ids stay 12.0 instead of 12.
# The nullable 'Int64' dtype holds true integers alongside <NA>.
industries_with_ids['industry_id'] = industries_with_ids['industry_id'].astype('Int64')

print(industries_with_ids)


In [None]:
# Trim stray whitespace from the speciality labels while keeping missing values
# missing. Casting the column with astype(str) alone turns NaN into the literal
# text 'nan', which would then be listed as if it were a real speciality.
raw_speciality = specialities['speciality']
specialities['speciality'] = raw_speciality.astype(str).str.strip().where(raw_speciality.notna())

# One row per company holding the sorted list of its distinct industry ids.
# Missing ids are dropped and the rest are emitted as plain Python ints, so the
# lists read [12, 45] rather than carrying float or NumPy scalar values.
aggregated_industries_list = industries_with_ids.groupby('company_id')['industry_id'].apply(
    lambda ids: sorted(int(i) for i in ids.dropna().unique())
).reset_index()

# One row per company holding the sorted list of its distinct specialities
# (missing entries are skipped).
aggregated_specialities_list = specialities.groupby('company_id')['speciality'].apply(
    lambda names: sorted(names.dropna().unique())
).reset_index()

In [None]:
# Assemble the wide company table: start from `companies` and left-join the
# latest employee counts, the industry-id lists and the speciality lists, so
# companies without a match in a joined table are kept with empty cells.
final_company_data = (
    companies
    .merge(unique_employee_counts, on='company_id', how='left')
    .merge(aggregated_industries_list, on='company_id', how='left')
    .merge(aggregated_specialities_list, on='company_id', how='left')
)

In [None]:
# Number of missing values in each column of the merged company table.
print(final_company_data.isnull().sum())


In [None]:
# Load the job-posting table and the per-job side tables used to clean and enrich it.

postings = pd.read_csv('raw_linkedin_data/postings.csv')
jobs_industries = pd.read_csv('raw_linkedin_data/jobs/job_industries.csv')
job_salaries = pd.read_csv('raw_linkedin_data/jobs/salaries.csv')
job_skills = pd.read_csv('raw_linkedin_data/jobs/job_skills.csv')

benefits = pd.read_csv('raw_linkedin_data/jobs/benefits.csv')

In [None]:
def create_salary_dataset(postings):
    """Return the postings that carry usable salary information.

    A posting is kept when at least one of max_salary, med_salary, min_salary
    or normalized_salary is present and its pay_period is present as well.
    The input frame is left untouched; the result is built from a copy.
    """
    salary_columns = ['max_salary', 'med_salary', 'min_salary', 'normalized_salary']

    working_copy = postings.copy()
    any_salary_present = working_copy[salary_columns].notna().any(axis=1)
    period_present = working_copy['pay_period'].notna()

    return working_copy[any_salary_present & period_present]

# Restrict the postings to those with salary information and report how many remain.
postings_with_salary = create_salary_dataset(postings)
print(f"Original postings: {len(postings)}")
print(f"Postings with salary: {len(postings_with_salary)}")


In [None]:
# Are all salaries quoted in US dollars, and if not, which currencies occur?
currency_column = postings_with_salary['currency']

all_usd = currency_column.eq('USD').all()
print(f"All salaries in USD: {all_usd}")


currency_counts = currency_column.value_counts()
print(currency_counts)

In [None]:
# List the postings whose salary is not quoted in USD, one per line.
non_usd_jobs = postings_with_salary.loc[postings_with_salary['currency'].ne('USD')]
print("\nID\tLocation\tCurrency")
print("_"*50)
for job_id, location, currency in zip(
    non_usd_jobs['job_id'], non_usd_jobs['location'], non_usd_jobs['currency']
):
    print(f"{job_id}\t{location}\t{currency}")


In [None]:
#Since all the jobs that are not in USD are located in United States, we should convert the salaries to USD using these conversion rates.
#1.00 US Dollar = 0.92367131 EUR
#1.00 US Dollar = 1.4352374 CAD
#1.00 US Dollar = 2.00 BBD
#1.00 US Dollar = 1.594355 AUD
#1.00 US Dollar = 0.77381294 GBP


# Conversion factors to USD: multiply an amount in the given currency by its
# factor to obtain US dollars. Each factor is the inverse of a quoted
# "1 USD = x <currency>" rate.
currency_to_usd = {
    'EUR': 1 / 0.92367131,
    'CAD': 1 / 1.4352374,
    'BBD': 1 / 2.00,
    'AUD': 1 / 1.594355,
    'GBP': 1 / 0.77381294,
    'USD': 1.0  # No conversion needed
}

def convert_to_usd(row):
    """Return ``row`` with its salary fields expressed in US dollars.

    Parameters
    ----------
    row : pandas.Series
        One posting. Must contain 'currency'; the salary fields 'max_salary',
        'med_salary', 'min_salary' and 'normalized_salary' are converted when
        they are present in the row and not missing.

    Returns
    -------
    pandas.Series
        * currency is 'USD': the row itself, unchanged.
        * currency is listed in ``currency_to_usd``: a converted copy whose
          currency is set to 'USD'.
        * currency is missing or not listed: the row itself, unchanged. It is
          deliberately NOT relabelled as USD, because no rate is known and the
          amounts would otherwise be passed off as dollars at a 1:1 rate.
    """
    currency = row['currency']
    if currency == 'USD':
        return row

    conversion_rate = currency_to_usd.get(currency)
    if conversion_rate is None:
        # No rate available (unknown code or NaN): leave the posting as it is
        # so a later "all USD" check can surface it.
        return row

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    row = row.copy()

    # Convert every salary field that is present and filled in.
    for field in ['max_salary', 'med_salary', 'min_salary', 'normalized_salary']:
        if field in row and pd.notna(row[field]):
            row[field] = row[field] * conversion_rate

    # The amounts are now in dollars, so update the label to match.
    row['currency'] = 'USD'

    return row

# Run the converter over every posting; axis=1 passes one row at a time.
postings_with_salary = postings_with_salary.apply(convert_to_usd, axis=1)

# Verify that every posting is now labelled USD.
all_usd = (postings_with_salary['currency'] == 'USD').all()
print(f"All salaries converted to USD: {all_usd}")

In [None]:
# Inspect the distinct work types and pay periods present in the data.
unique_work_types = postings_with_salary['formatted_work_type'].unique()
print(unique_work_types)

unique_pay_periods = postings_with_salary['pay_period'].unique()
print(unique_pay_periods)

# How the hourly-paid postings split across work types.
hourly_postings = postings_with_salary.loc[postings_with_salary['pay_period'].eq('HOURLY')]
hourly_by_work_type = hourly_postings['formatted_work_type'].value_counts()

print("\n\nwork type:")
print(hourly_by_work_type)

In [None]:


# On inspection, normalized_salary is not derived consistently: sometimes it is the med_salary when available, sometimes the average of max and min, and sometimes the max_salary when med_salary is missing.
# Because of that inconsistency the column is of little use to us, so it is dropped here.

postings_with_salary = postings_with_salary.drop(columns=['normalized_salary'])


In [None]:
def create_normalized_annual_salaries(df):
    """Add annualized salary columns to ``df`` and return it.

    Three columns are written to ``df`` itself (the frame is modified in
    place and also returned): 'norm_min_annual', 'norm_med_annual' and
    'norm_max_annual'. Each is the matching min/med/max salary multiplied by
    the number of pay periods per year:

    * HOURLY   - 40 hours x 52 weeks, or 20 hours x 52 weeks when
                 work_type is 'PART_TIME'
    * WEEKLY   - 52
    * BIWEEKLY - 26
    * MONTHLY  - 12
    * YEARLY / ANNUAL - 1

    Rows whose pay_period is missing or not listed above are copied through
    unscaled. Missing salaries stay missing. Where the median is missing but
    both minimum and maximum are known, the median is filled with their
    average.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'min_salary', 'med_salary', 'max_salary' and 'pay_period';
        'work_type' is read only when at least one row is HOURLY.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added. The new columns
        are float64.
    """
    # Periods per year for pay periods whose factor is the same for every row.
    fixed_multipliers = {
        'WEEKLY': 52,
        'BIWEEKLY': 26,
        'MONTHLY': 12,
        'YEARLY': 1,
        'ANNUAL': 1,
    }
    weeks_per_year = 52
    full_time_hours = 40
    part_time_hours = 20

    pay_period = df['pay_period']

    # One multiplier per row, computed column-wise instead of looping over
    # rows; periods not in the table come out as NaN for now.
    multiplier = pay_period.map(fixed_multipliers).astype('float64')

    # Hourly rates depend on the weekly hours, which depend on the work type.
    hourly = pay_period == 'HOURLY'
    if hourly.any():
        part_time = df['work_type'] == 'PART_TIME'
        hours_per_week = pd.Series(full_time_hours, index=df.index).mask(part_time, part_time_hours)
        multiplier = multiplier.mask(hourly, hours_per_week * weeks_per_year)

    # Unknown or missing pay periods: leave the amounts as they are.
    multiplier = multiplier.fillna(1.0)

    source_to_target = {
        'min_salary': 'norm_min_annual',
        'med_salary': 'norm_med_annual',
        'max_salary': 'norm_max_annual',
    }
    for source, target in source_to_target.items():
        # Cast first so None / integer inputs behave like ordinary floats;
        # NaN salaries simply stay NaN after the multiplication.
        df[target] = df[source].astype('float64') * multiplier

    # Any missing median salary is replaced by the average of min and max.
    mask = pd.isna(df['norm_med_annual']) & pd.notna(df['norm_min_annual']) & pd.notna(df['norm_max_annual'])
    df.loc[mask, 'norm_med_annual'] = (df.loc[mask, 'norm_min_annual'] + df.loc[mask, 'norm_max_annual']) / 2

    return df

# Add the norm_min_annual / norm_med_annual / norm_max_annual columns to the postings.
postings_with_salary = create_normalized_annual_salaries(postings_with_salary)

In [None]:
import re
# Collect every distinct location that does not end in ", XX" (comma, space,
# two capital letters), i.e. that is not already in "City, ST" form.
original_locations = sorted(
    loc
    for loc in postings_with_salary['location'].dropna().unique()
    if re.search(r', [A-Z]{2}$', str(loc)) is None
)
print(len(original_locations))

# Show the non-standard locations so they can be inspected by eye.
print(f"Locations without standard format ({len(original_locations)} found):")
for nonstandard_location in original_locations:
    print(f"- {nonstandard_location}")

In [None]:
def standardize_locations(df):
    """Rewrite ``df['location']`` into a more uniform "City, ST" form.

    The column is replaced on ``df`` itself and ``df`` is returned. Each
    value is handled as follows, in this order:

    1. missing values are passed through unchanged;
    2. surrounding whitespace is stripped;
    3. a known metro-area label is replaced by its "City, ST" entry;
    4. the exact text "United States" is kept as is;
    5. ", United States" and " County" / " county" are removed;
    6. a bare state name becomes its two-letter abbreviation;
    7. otherwise the first state name (in table order) found after a
       comma is replaced by its abbreviation;
    8. anything else is returned as cleaned so far.
    """
    # Full state name -> two-letter abbreviation (order matters for step 7).
    state_to_abbrev = {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ',
        'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
        'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL',
        'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
        'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
        'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
        'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
        'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
        'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE',
        'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
        'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
        'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
        'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI',
        'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
        'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
        'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
        'Wisconsin': 'WI', 'Wyoming': 'WY', 'District of Columbia': 'DC',
    }

    # Metro-area label -> representative "City, ST".
    metro_to_city_state = {
        # A
        "Albany, New York Metropolitan Area": "Albany, NY",
        "Albuquerque-Santa Fe Metropolitan Area": "Albuquerque, NM",
        "Appleton-Oshkosh-Neenah Area": "Appleton, WI",
        "Atlanta Metropolitan Area": "Atlanta, GA",
        "Austin, Texas Metropolitan Area": "Austin, TX",

        # B
        "Baton Rouge Metropolitan Area": "Baton Rouge, LA",
        "Beaumont-Port Arthur Area": "Beaumont, TX",
        "Bellingham Metropolitan Area": "Bellingham, WA",
        "Blacksburg-Christiansburg-Radford Area": "Blacksburg, VA",
        "Boise Metropolitan Area": "Boise, ID",
        "Buffalo-Niagara Falls Area": "Buffalo, NY",

        # C
        "Cape Coral Metropolitan Area": "Cape Coral, FL",
        "Charleston, South Carolina Metropolitan Area": "Charleston, SC",
        "Charlotte Metro": "Charlotte, NC",
        "Cincinnati Metropolitan Area": "Cincinnati, OH",
        "College Station-Bryan Area": "College Station, TX",
        "Columbia, South Carolina Metropolitan Area": "Columbia, SC",
        "Columbus, Ohio Metropolitan Area": "Columbus, OH",
        "Crestview-Fort Walton Beach-Destin Area": "Fort Walton Beach, FL",

        # D
        "Dallas-Fort Worth Metroplex": "Dallas, TX",
        "Denver Metropolitan Area": "Denver, CO",
        "Des Moines Metropolitan Area": "Des Moines, IA",
        "Detroit Metropolitan Area": "Detroit, MI",

        # E
        "Eau Claire-Menomonie Area": "Eau Claire, WI",
        "Erie-Meadville Area": "Erie, PA",

        # F
        "Fayetteville, North Carolina Metropolitan Area": "Fayetteville, NC",

        # G
        "Grand Rapids Metropolitan Area": "Grand Rapids, MI",
        "Greater Albany, Georgia Area": "Albany, GA",
        "Greater Asheville": "Asheville, NC",
        "Greater Augusta Area": "Augusta, GA",
        "Greater Bend Area": "Bend, OR",
        "Greater Birmingham, Alabama Area": "Birmingham, AL",
        "Greater Bismarck Area": "Bismarck, ND",
        "Greater Bloomington Area": "Bloomington, IN",
        "Greater Boston": "Boston, MA",
        "Greater Burlington Area": "Burlington, VT",
        "Greater Chattanooga": "Chattanooga, TN",
        "Greater Chicago Area": "Chicago, IL",
        "Greater Chico Area": "Chico, CA",
        "Greater Cleveland": "Cleveland, OH",
        "Greater Colorado Springs Area": "Colorado Springs, CO",
        "Greater Corpus Christi Area": "Corpus Christi, TX",
        "Greater Dothan": "Dothan, AL",
        "Greater Enid Area": "Enid, OK",
        "Greater Eugene-Springfield Area": "Eugene, OR",
        "Greater Fayetteville, AR Area": "Fayetteville, AR",
        "Greater Flagstaff Area": "Flagstaff, AZ",
        "Greater Fort Collins Area": "Fort Collins, CO",
        "Greater Fort Wayne": "Fort Wayne, IN",
        "Greater Goldsboro Area": "Goldsboro, NC",
        "Greater Grand Junction Area": "Grand Junction, CO",
        "Greater Hartford": "Hartford, CT",
        "Greater Houston": "Houston, TX",
        "Greater Indianapolis": "Indianapolis, IN",
        "Greater Jackson, MI Area": "Jackson, MI",
        "Greater Lansing": "Lansing, MI",
        "Greater Lexington Area": "Lexington, KY",
        "Greater Macon": "Macon, GA",
        "Greater Madison Area": "Madison, WI",
        "Greater McAllen Area": "McAllen, TX",
        "Greater Milwaukee": "Milwaukee, WI",
        "Greater Minneapolis-St. Paul Area": "Minneapolis, MN",
        "Greater Morgantown Area": "Morgantown, WV",
        "Greater New Orleans Region": "New Orleans, LA",
        "Greater Orlando": "Orlando, FL",
        "Greater Philadelphia": "Philadelphia, PA",
        "Greater Phoenix Area": "Phoenix, AZ",
        "Greater Pittsburgh Region": "Pittsburgh, PA",
        "Greater Reno Area": "Reno, NV",
        "Greater Richmond Region": "Richmond, VA",
        "Greater Sacramento": "Sacramento, CA",
        "Greater San Luis Obispo Area": "San Luis Obispo, CA",
        "Greater Savannah Area": "Savannah, GA",
        "Greater Scranton Area": "Scranton, PA",
        "Greater Seattle Area": "Seattle, WA",
        "Greater Sioux Falls Area": "Sioux Falls, SD",
        "Greater St. Louis": "St. Louis, MO",
        "Greater Syracuse-Auburn Area": "Syracuse, NY",
        "Greater Tampa Bay Area": "Tampa, FL",
        "Greater Tucson Area": "Tucson, AZ",
        "Greater Wilmington Area": "Wilmington, DE",
        "Green Bay, Wisconsin Metropolitan Area": "Green Bay, WI",
        "Greensboro--Winston-Salem--High Point Area": "Greensboro, NC",
        "Greenville-Spartanburg-Anderson, South Carolina Area": "Greenville, SC",

        # H
        "Hampton Roads, Virginia Metropolitan Area": "Norfolk, VA",
        "Hilton Head Island, South Carolina Area": "Hilton Head Island, SC",
        "Honolulu Metropolitan Area": "Honolulu, HI",

        # J
        "Johnson City-Kingsport-Bristol Area": "Johnson City, TN",

        # K
        "Kansas City Metropolitan Area": "Kansas City, MO",
        "Knoxville Metropolitan Area": "Knoxville, TN",

        # L
        "La Crosse-Onalaska Area": "La Crosse, WI",
        "Lafayette, Indiana Metropolitan Area": "Lafayette, IN",
        "Lafayette, Louisiana Metropolitan Area": "Lafayette, LA",
        "Las Vegas Metropolitan Area": "Las Vegas, NV",
        "Lawton Area": "Lawton, OK",
        "Lincoln, Nebraska Metropolitan Area": "Lincoln, NE",
        "Little Rock Metropolitan Area": "Little Rock, AR",
        "Los Angeles Metropolitan Area": "Los Angeles, CA",
        "Louisville Metropolitan Area": "Louisville, KY",
        "Lubbock-Levelland Area": "Lubbock, TX",

        # M
        "Maui": "Lahaina, HI",
        "Memphis Metropolitan Area": "Memphis, TN",
        "Metro Jacksonville": "Jacksonville, FL",
        "Metropolitan Fresno": "Fresno, CA",
        "Miami-Fort Lauderdale Area": "Miami, FL",
        "Mobile Metropolitan Area": "Mobile, AL",
        "Modesto-Merced Area": "Modesto, CA",

        # N
        "Nashville Metropolitan Area": "Nashville, TN",
        "New Bern-Morehead City Area": "New Bern, NC",
        "New York City Metropolitan Area": "New York, NY",

        # O
        "Oklahoma City Metropolitan Area": "Oklahoma City, OK",
        "Omaha Metropolitan Area": "Omaha, NE",

        # P
        "Pensacola Metropolitan Area": "Pensacola, FL",
        "Peoria Metropolitan Area": "Peoria, IL",
        "Portland, Maine Metropolitan Area": "Portland, ME",
        "Portland, Oregon Metropolitan Area": "Portland, OR",
        "Pueblo-Cañon City Area": "Pueblo, CO",

        # R
        "Raleigh-Durham-Chapel Hill Area": "Raleigh, NC",
        "Rochester, New York Metropolitan Area": "Rochester, NY",
        "Rocky Mount-Wilson Area": "Rocky Mount, NC",

        # S
        "Salt Lake City Metropolitan Area": "Salt Lake City, UT",
        "San Antonio, Texas Metropolitan Area": "San Antonio, TX",
        "San Diego Metropolitan Area": "San Diego, CA",
        "San Francisco Bay Area": "San Francisco, CA",
        "South Bend-Mishawaka Region": "South Bend, IN",
        "Springfield, Illinois Metropolitan Area": "Springfield, IL",
        "Springfield, Massachusetts Metropolitan Area": "Springfield, MA",

        # T
        "Tallahassee Metropolitan Area": "Tallahassee, FL",
        "Toledo, Ohio Metropolitan Area": "Toledo, OH",
        "Topeka Metropolitan Area": "Topeka, KS",
        "Tulsa Metropolitan Area": "Tulsa, OK",

        # U
        "Utica-Rome Area": "Utica, NY",

        # W
        "Washington DC-Baltimore Area": "Washington, DC",
        "Waterloo-Cedar Falls Area": "Waterloo, IA",
        "Wichita, Kansas Metropolitan Area": "Wichita, KS",
        "Walla Walla Area": "Walla Walla, WA",

        # Y
        "Youngstown-Warren area": "Youngstown, OH",
    }

    def _standardize_one(raw_location):
        """Apply the steps listed in the outer docstring to a single value."""
        # Missing values pass through untouched.
        if pd.isna(raw_location):
            return raw_location

        text = str(raw_location).strip()

        # Known metro-area labels map straight to their "City, ST" entry.
        metro_match = metro_to_city_state.get(text)
        if metro_match is not None:
            return metro_match

        # The country on its own carries no city or state to normalise.
        if text == "United States":
            return text

        # Remove the country suffix and any " County" / " county" wording.
        text = text.replace(", United States", "")
        text = re.sub(r' [Cc]ounty', '', text)

        # A bare state name collapses to its abbreviation.
        bare_state = state_to_abbrev.get(text)
        if bare_state is not None:
            return bare_state

        # Otherwise abbreviate the first state name that follows a comma.
        for state_name, abbrev in state_to_abbrev.items():
            spelled_out = f", {state_name}"
            if spelled_out in text:
                return text.replace(spelled_out, f", {abbrev}")

        return text

    df['location'] = df['location'].apply(_standardize_one)

    return df

# Normalise the location column of the salary postings.
postings_with_salary = standardize_locations(postings_with_salary)

In [None]:
import re
# Same check as before standardization: which distinct locations still do
# not end in ", XX" (comma, space, two capital letters)?
changed_locations = sorted(
    loc
    for loc in postings_with_salary['location'].dropna().unique()
    if re.search(r', [A-Z]{2}$', str(loc)) is None
)
print(len(changed_locations))

# Print the remaining non-standard locations in alphabetical order.
print(f"Locations without standard format ({len(changed_locations)} found):")
for remaining_location in changed_locations:
    print(f"- {remaining_location}")

In [None]:
import pandas as pd

# Use string job ids in every table so the joins below compare like with like.
postings_with_salary['job_id'] = postings_with_salary['job_id'].astype(str)
jobs_industries['job_id'] = jobs_industries['job_id'].astype(str)
job_skills['job_id'] = job_skills['job_id'].astype(str)


def _join_as_csv(values):
    """Render the values of one job as a single comma-separated string."""
    return ','.join(map(str, values))


# Collapse the one-row-per-(job, industry) and one-row-per-(job, skill)
# tables to one row per job.
grouped_industry_jobs = (
    jobs_industries.groupby('job_id')['industry_id'].apply(_join_as_csv).reset_index()
)
grouped_skills_jobs = (
    job_skills.groupby('job_id')['skill_abr'].apply(_join_as_csv).reset_index()
)

# Left-join industries first, then skills, so every posting is kept.
# NOTE: this cell is meant to run once per session. Running it again joins
# onto columns that already exist, and pandas then suffixes them with _x / _y.
postings_with_salary = (
    postings_with_salary
    .merge(grouped_industry_jobs, on='job_id', how='left')
    .merge(grouped_skills_jobs, on='job_id', how='left')
)

# Postings without any industry or skill rows get an empty string, not NaN.
for joined_column in ('industry_id', 'skill_abr'):
    postings_with_salary[joined_column] = postings_with_salary[joined_column].fillna('')

print(f"Postings shape: {postings_with_salary.shape}")

In [None]:
# String job ids on both sides so the membership test and the lookups line up.
postings_with_salary['job_id'] = postings_with_salary['job_id'].astype(str)
benefits['job_id'] = benefits['job_id'].astype(str)

# The benefits table flags each row through its `inferred` column, giving two
# kinds of benefit: inferred (inferred == 1) and non-inferred (anything else).
# Only benefits that belong to a posting we kept are relevant.
job_benefits = benefits[benefits['job_id'].isin(postings_with_salary['job_id'])]
is_inferred = job_benefits['inferred'] == 1

# Start every posting with an empty list so postings without benefit rows
# still have an entry in both dictionaries.
posting_job_ids = postings_with_salary['job_id'].unique()
inferred_benefits_dict = {job_id: [] for job_id in posting_job_ids}
non_inferred_benefits_dict = {job_id: [] for job_id in posting_job_ids}

# Gather the benefit types per job with a grouped aggregation instead of a
# Python-level loop over every row; the order of types within a job follows
# the row order of the benefits table.
inferred_benefits_dict.update(
    job_benefits[is_inferred].groupby('job_id', sort=False)['type'].agg(list).to_dict()
)
non_inferred_benefits_dict.update(
    job_benefits[~is_inferred].groupby('job_id', sort=False)['type'].agg(list).to_dict()
)

def list_to_comma_str(lst):
    """Join the entries of ``lst`` into one comma-separated string.

    Returns "" for an empty list or None. Missing entries (None / NaN) are
    skipped and every remaining entry is converted with str(), so a
    non-string value no longer makes the join raise TypeError.
    """
    if not lst:
        return ""
    return ",".join(str(item) for item in lst if pd.notna(item))

# Create the inferred and non-inferred benefit columns: one comma-separated
# string per posting, looked up by job_id. An id that is missing from a
# dictionary falls back to an empty list, which renders as "".
posting_ids = postings_with_salary['job_id']

postings_with_salary['inferred_benefits'] = [
    list_to_comma_str(inferred_benefits_dict.get(job_id, []))
    for job_id in posting_ids
]
postings_with_salary['non_inferred_benefits'] = [
    list_to_comma_str(non_inferred_benefits_dict.get(job_id, []))
    for job_id in posting_ids
]

print(f"{postings_with_salary.shape}")

In [None]:
# Write both processed tables to the working directory; index=False leaves the
# DataFrame index out of the files.
final_company_data.to_csv('processed_linkedin_companies.csv', index=False)
postings_with_salary.to_csv('processed_linkedin_postings_salary.csv', index=False)