In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/job_industries.csv")



In [8]:
import pandas as pd

# Define file paths (update if needed)
base_path = "/content/drive/MyDrive/"
files = {
    "job_skills": "job_skills.csv",
    "job_industries": "job_industries.csv",
    "popular_companies": "popular_companies.csv",
    "work_type": "work_type.csv"
}

def load_clean_csv(filepath, required_columns=None):
    try:
        df = pd.read_csv(filepath, encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(filepath, encoding="ISO-8859-1")

    # Clean column names
    df.columns = df.columns.str.strip().str.lower()

    # Drop rows with missing critical values if specified
    if required_columns:
        df = df.dropna(subset=required_columns)

    return df

job_skills = load_clean_csv(base_path + files["job_skills"], required_columns=["state_code", "skill_desc"])
job_industries = load_clean_csv(base_path + files["job_industries"], required_columns=["state_code", "industry_name"])
popular_companies = load_clean_csv(base_path + files["popular_companies"], required_columns=["state_code", "company_name"])
work_type = load_clean_csv(base_path + files["work_type"], required_columns=["state_code", "formatted_work_type"])

print(" job_skills shape:", job_skills.shape)
print(" job_industries shape:", job_industries.shape)
print(" popular_companies shape:", popular_companies.shape)
print(" work_type shape:", work_type.shape)




 job_skills shape: (205770, 11)
 job_industries shape: (158955, 10)
 popular_companies shape: (23636, 13)
 work_type shape: (114780, 7)


In [None]:
print(job_skills.columns)
print(job_industries.columns)
print(popular_companies.columns)
print(work_type.columns)


Index(['job_id', 'state_code', 'skill_code', 'skill_desc', 'industry_id',
       'industry_name', 'company_id', 'company_name', 'title', 'location',
       'formatted_work_type'],
      dtype='object')
Index(['state_code', 'job_id', 'company_id', 'industry_id', 'industry_name',
       'company_id.1', 'company_name', 'title', 'location',
       'formatted_work_type'],
      dtype='object')
Index(['state_code', 'company_id', 'employee_count', 'follower_count',
       'company_name', 'unnamed: 5', 'unnamed: 6', 'unnamed: 7', 'unnamed: 8',
       'unnamed: 9', 'unnamed: 10', 'unnamed: 11', 'unnamed: 12'],
      dtype='object')
Index(['job_id', 'company_id', 'company_name', 'title', 'location',
       'state_code', 'formatted_work_type'],
      dtype='object')


In [None]:
job_skills.head()


Unnamed: 0,job_id,state_code,skill_code,skill_desc,industry_id,industry_name,company_id,company_name,title,location,formatted_work_type
0,3884428798,NY,MRKT,Marketing,82.0,Book and Periodical Publishing,391906.0,ASSOULINE,Public Relations Intern,New York City Metropolitan Area,Internship
1,3884428798,NY,PR,Public Relations,82.0,Book and Periodical Publishing,391906.0,ASSOULINE,Public Relations Intern,New York City Metropolitan Area,Internship
2,3884428798,NY,WRT,Writing/Editing,82.0,Book and Periodical Publishing,391906.0,ASSOULINE,Public Relations Intern,New York City Metropolitan Area,Internship
3,3887473071,GA,SALE,Sales,48.0,Construction,22292832.0,Renewal by Andersen Metro & Midwest,Outside Sales Consultant,Atlanta Metropolitan Area,Full-time
4,3887465684,MD,FIN,Finance,41.0,Banking,20300.0,Sandy Spring Bank,Seasonal Teller Trainee - Montgomery County( H...,"Olney, MD",Temporary


In [None]:
print(job_skills['state_code'].isnull().sum())
print(job_industries['industry_name'].isnull().sum())


0
0


In [None]:
print("job_skills shape:", job_skills.shape)
print("job_industries shape:", job_industries.shape)
print("popular_companies shape:", popular_companies.shape)
print("work_type shape:", work_type.shape)


job_skills shape: (205770, 11)
job_industries shape: (158955, 10)
popular_companies shape: (23636, 13)
work_type shape: (114780, 7)


In [None]:
print(job_skills['state_code'].unique())
print(work_type['formatted_work_type'].unique())


['NY' 'GA' 'MD' 'MA' '0' 'AR' 'TX' 'NE' 'SC' 'IL' 'NV' 'AZ' 'NJ' 'DC' 'PA'
 'IA' 'MI' 'FL' 'WI' 'CA' 'VA' 'MO' 'OH' 'HI' 'CT' 'VT' 'MN' 'WA' 'IN'
 'LA' 'NC' 'UT' 'NM' 'CO' 'OR' 'AL' 'MS' 'OK' 'KS' 'TN' 'KY' 'NH' 'ID'
 'RI' 'MT' 'DE' 'SD' 'ND' 'WY' 'WV' 'ME' 'ON' 'AK' 'MI ' 'QC']
['Full-time' 'Internship' 'Part-time' 'Contract' 'Temporary' 'Other'
 'Volunteer']


In [None]:

job_skills.to_csv(base_path + "cleaned_job_skills.csv", index=False)
job_industries.to_csv(base_path + "cleaned_job_industries.csv", index=False)
popular_companies.to_csv(base_path + "cleaned_popular_companies.csv", index=False)
work_type.to_csv(base_path + "cleaned_work_type.csv", index=False)
