This notebook cleans the job-ads dataset (`data/ads-50k.csv`) and performs some initial exploratory analysis.

In [1]:
# Import necessary libraries.
import ast
import pandas as pd
import re

from bs4 import BeautifulSoup

In [2]:
# Load the raw job-ads export from disk.
original_df = pd.read_csv(filepath_or_buffer="data/ads-50k.csv")

In [3]:
def clean_html_text(raw_html: str) -> str:
    """Convert an HTML fragment into normalised plain text.

    The markup is parsed with BeautifulSoup and reduced to its text, bullet
    glyphs and leftover entity references are replaced, and runs of
    whitespace are collapsed.

    Args:
        raw_html: HTML markup to clean. Any non-string value (for example
            ``None`` or a float ``NaN``) is treated as missing.

    Returns:
        The cleaned text, or an empty string when ``raw_html`` is not a str.
    """
    if not isinstance(raw_html, str):
        return ""

    # Parse HTML and extract the text, using a newline as the separator.
    soup = BeautifulSoup(raw_html, "html.parser")
    text = soup.get_text(separator="\n")

    # Non-breaking spaces become ordinary spaces.
    text = text.replace('\xa0', ' ')

    # Fix bullet characters. The literal "&bull;" reference has to be handled
    # BEFORE the generic entity strip below; otherwise that strip replaces it
    # with a space first and this substitution can never match.
    text = re.sub(r'&bull;', '-', text)
    text = re.sub(r'[\u2022\u2023\u25E6\u2043\u2219]', '-', text)

    # Any other lowercase entity reference still present as literal text
    # (e.g. "&nbsp;") is blanked out.
    text = re.sub(r'&[a-z]+;', ' ', text)

    # Normalize spacing and line breaks
    text = re.sub(r'\n+', '\n', text)
    # \s also matches newlines, so a newline adjacent to other whitespace
    # collapses into a single space here.
    text = re.sub(r'\s{2,}', ' ', text)
    # NOTE: pads every hyphen with spaces, including hyphens inside words.
    text = re.sub(r'\s*-\s*', ' - ', text)
    text = text.strip()

    # # Optional: remove contact info
    # text = re.sub(r'\b\d{8,}\b', '[PHONE]', text)
    # text = re.sub(r'\S+@\S+', '[EMAIL]', text)

    return text

In [4]:
# Build the working frame as a new object so `original_df` stays as loaded,
# adding the plain-text version of the HTML `content` column.
df = original_df.assign(
    job_description_clean=original_df['content'].apply(clean_html_text)
)

In [5]:
# Parse the stringified metadata into Python objects. ast.literal_eval only
# evaluates Python literals, so no wrapping lambda is needed.
df['job_metadata'] = df['metadata'].apply(ast.literal_eval)


def _metadata_name(meta: dict, field: str):
    """Return the 'name' of ``meta[field]``, or None if the entry is missing or null."""
    # `or {}` also covers an entry that is present but explicitly None,
    # which a plain .get(field, {}) would pass through and then fail on.
    return (meta.get(field) or {}).get('name')


# One flat column per nested metadata entry, created in this order.
for _field in ('classification', 'subClassification', 'area',
               'location', 'suburb', 'workType'):
    df[_field] = df['job_metadata'].apply(_metadata_name, field=_field)

In [6]:
# Optional: export a random 50-row sample for manual inspection.
# df_random_rows = df.sample(n=50)
# df_random_rows.to_csv("data/df_random_rows.csv", index=False)

# List every column currently present on the working frame.
all_columns = df.columns
print("Full Columns: \n", all_columns)

Full Columns: 
 Index(['id', 'title', 'abstract', 'content', 'metadata',
       'job_description_clean', 'job_metadata', 'classification',
       'subClassification', 'area', 'location', 'suburb', 'workType'],
      dtype='object')


In [7]:
# Drop the raw HTML / metadata columns now that their cleaned and flattened
# counterparts exist. Rebinding `df` (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution no longer
# raises KeyError for columns that were already removed.
df = df.drop(columns=["content", "metadata", "job_metadata"], errors="ignore")
df.head()

Unnamed: 0,id,title,abstract,job_description_clean,classification,subClassification,area,location,suburb,workType
0,38915469,Recruitment Consultant,We are looking for someone to focus purely on ...,Are you looking to join a thriving business th...,Education & Training,Other,,Sydney,,Full Time
1,38934839,Computers Salesperson - Coburg,Passionate about exceptional customer service?...,· Casual hours as required (transition to Part...,Retail & Consumer Products,Retail Assistants,Northern Suburbs,Melbourne,Coburg,Casual/Vacation
2,38946054,Senior Developer | SA,Readifarians are known for discovering the lat...,Readify helps organizations innovate with tech...,Information & Communication Technology,Consultants,,Adelaide,,Full Time
3,38833950,Senior Commercial Property Manager | Leading T...,~ Rare opportunity for a Senior PM to step int...,WayPoint Recruitment have partnered up with a ...,Real Estate & Property,"Commercial Sales, Leasing & Property Mgmt",CBD & Inner Suburbs,Melbourne,Melbourne,Full Time
4,38856271,Technology Manager | Travel Industry,Rare opportunity for an experienced Technology...,This is a key role within a market leading Tra...,Information & Communication Technology,Management,,Auckland,,Full Time


In [8]:
"""
location and work type are really useful and has no None.
there may be some duplications.
"""
df.describe(include='all')

Unnamed: 0,id,title,abstract,job_description_clean,classification,subClassification,area,location,suburb,workType
count,50000.0,50000,50000,50000,50000,50000,32844,50000,37002,50000
unique,,36207,46052,47485,30,337,69,87,2546,4
top,,Project Manager,.,"At ALDI, our people are the key to our success...",Information & Communication Technology,Other,CBD & Inner Suburbs,Sydney,Sydney,Full Time
freq,,194,27,25,5235,2688,7092,13215,5373,34858
mean,38912010.0,,,,,,,,,
std,74104.46,,,,,,,,,
min,34651090.0,,,,,,,,,
25%,38870540.0,,,,,,,,,
50%,38918270.0,,,,,,,,,
75%,38961170.0,,,,,,,,,


In [9]:
# Compare rows on every column except `id`, so rows that differ only by
# their id count as duplicates and are collapsed to one.
dedup_columns = [
    "title",
    "abstract",
    "job_description_clean",
    "classification",
    "subClassification",
    "area",
    "location",
    "suburb",
    "workType",
]
df_no_duplicates = df.loc[:, dedup_columns].drop_duplicates()

In [10]:
# Summary statistics for the de-duplicated frame, covering all column types.
df_no_duplicates.describe(include="all")

Unnamed: 0,title,abstract,job_description_clean,classification,subClassification,area,location,suburb,workType
count,49375,49375,49375,49375,49375,32373,49375,36556,49375
unique,36207,46052,47485,30,337,69,87,2546,4
top,Project Manager,.,"At ALDI, our people are the key to our success...",Information & Communication Technology,Other,CBD & Inner Suburbs,Sydney,Sydney,Full Time
freq,186,27,24,5167,2660,6989,12942,5236,34430


In [11]:
# Compute the distinct classification labels once, report how many there
# are, then display the labels themselves as the cell output.
classification_values = df_no_duplicates["classification"].unique()
print("Number of unique classifications:\n", len(classification_values))
classification_values

Number of unique classifications:
 30


array(['Education & Training', 'Retail & Consumer Products',
       'Information & Communication Technology', 'Real Estate & Property',
       'Mining, Resources & Energy', 'Accounting',
       'Community Services & Development',
       'Manufacturing, Transport & Logistics', 'Hospitality & Tourism',
       'Engineering', 'Sales', 'Marketing & Communications',
       'Healthcare & Medical', 'Banking & Financial Services',
       'Human Resources & Recruitment', 'Trades & Services',
       'Construction', 'Advertising, Arts & Media',
       'Administration & Office Support', 'Sport & Recreation',
       'Call Centre & Customer Service', 'Legal', 'Consulting & Strategy',
       'Government & Defence', 'Science & Technology',
       'Design & Architecture', 'Farming, Animals & Conservation',
       'CEO & General Management', 'Insurance & Superannuation',
       'Self Employment'], dtype=object)

In [12]:
# df_no_duplicates.to_csv("data/df_no_duplicates_.csv", index=False)