In [1]:
import pandas as pd
import json
import plotly.express as px
import re

# 1.Load & Preprocess Data

In [2]:
# Read the JSON file and convert it into a pandas DataFrame
with open('data/ads-50k.json', 'r') as f:
    records = [json.loads(line) for line in f]

job_df = pd.DataFrame(records)

# Flatten the metadata column
metadata_df = pd.json_normalize(job_df['metadata']).add_prefix('metadata.')
job_df = pd.concat([job_df, metadata_df], axis=1)

In [3]:
# Function to remove HTML tags
def clean_html(text):
    if pd.isna(text):
        return text
    # Remove HTML tags
    clean_text = re.sub(r'<.*?>', ' ', str(text))
    # Replace multiple spaces with a single space
    clean_text = re.sub(r'\s+', ' ', clean_text)
    # Strip leading and trailing spaces
    clean_text = clean_text.strip()
    return clean_text

# Apply the cleaning function to the content column
job_df['cleaned_content'] = job_df['content'].apply(clean_html)
    

# 2. Exploratory Data Analysis (EDA)

In [4]:
# Function to generate null percentage dataframe
def generate_null_percentage(df: pd.DataFrame) -> pd.DataFrame:
    # Calculate null count and percentage for each column
    null_counts = df.isnull().sum()
    total_rows = len(df)
    null_percentages = (null_counts / total_rows * 100).round(2)
    
    # Create result dataframe
    null_df = pd.DataFrame({
        'Column': null_counts.index,
        'Null Count': null_counts.values,
        'Null Percentage': null_percentages.values
    })
    
    # Sort by null percentage in descending order
    null_df = null_df.sort_values('Null Percentage', ascending=False)
    
    return null_df

# Function to plot distribution of a specified column
def plot_distribution(df: pd.DataFrame, 
                      column_name: str,
                      title: str | None = None,
                      top_n: int | None = 10)-> None:
    # Calculate and sort percentage distribution in descending order
    dist = df[column_name].value_counts(normalize=True).sort_values(ascending=False).head(top_n) * 100
    
    # Sort for better visualization
    dist = dist.sort_values(ascending=True) 
    
    # Set default title if none provided
    if title is None:
        title = f'Distribution of {column_name} (%)'
    
    # Create horizontal bar chart
    fig = px.bar(
        dist,
        x=dist.values,
        y=dist.index,
        orientation='h',
        labels={'x': 'Percentage', 'y': ''},  # Empty y label
        title=title,
        text=dist.values.round(1)  # Show values on bars
    )
    
    # Format the text to show percentage with 1 decimal place
    fig.update_traces(texttemplate='%{text:.1f}%', textposition='inside')
    
    # Hide the y-axis title
    fig.update_layout(yaxis_title=None)
    
    fig.show()

def plot_word_count_distribution(df: pd.DataFrame, 
                                column_name: str, 
                                title: str | None = None,
                                bins: int = 20) -> None:

    # Calculate word counts for the specified column
    # Filter out missing values first
    word_counts = df[column_name].dropna().apply(lambda x: len(str(x).split()))
    
    # Set default title if none provided
    if title is None:
        title = f'Distribution of Word Count in {column_name}'
    
    # Create histogram
    fig = px.histogram(
        word_counts, 
        x=word_counts,
        nbins=bins,
        labels={'x': 'Number of Words', 'y': 'Count'},
        title=title,
        text_auto=True  # Show count values on bars
    )
    
    # Add average line
    mean_value = word_counts.mean()
    fig.add_vline(x=mean_value, line_dash="dash", line_color="red",
                  annotation_text=f"Mean: {mean_value:.1f} words",
                  annotation_position="top right")
    
    # Format layout
    fig.update_layout(bargap=0.1)
    
    fig.show()

In [14]:
sorted(job_df['metadata.workType.name'].unique().tolist())

['Casual/Vacation', 'Contract/Temp', 'Full Time', 'Part Time']

In [5]:
job_df.head()

Unnamed: 0,id,title,abstract,content,metadata,metadata.additionalSalaryText,metadata.standout.bullet1,metadata.standout.bullet2,metadata.standout.bullet3,metadata.classification.name,metadata.subClassification.name,metadata.location.name,metadata.workType.name,metadata.area.name,metadata.suburb.name,cleaned_content
0,38915469,Recruitment Consultant,We are looking for someone to focus purely on ...,<HTML><p>Are you looking to join a thriving bu...,{'standout': {'bullet1': 'Join a Sector that i...,commission,Join a Sector that is considered Recession Pro...,Excellent opportunity for Career Progression ...,Make a Diference whilst earning Money and havi...,Education & Training,Other,Sydney,Full Time,,,Are you looking to join a thriving business th...
1,38934839,Computers Salesperson - Coburg,Passionate about exceptional customer service?...,<HTML><p>&middot;&nbsp;&nbsp;Casual hours as r...,{'additionalSalaryText': 'Attractive Commissio...,Attractive Commission - Uncapped Earning Poten...,,,,Retail & Consumer Products,Retail Assistants,Melbourne,Casual/Vacation,Northern Suburbs,Coburg,&middot;&nbsp;&nbsp;Casual hours as required (...
2,38946054,Senior Developer | SA,Readifarians are known for discovering the lat...,<HTML><p>Readify helps organizations innovate ...,"{'standout': {'bullet1': 'Design, develop, tes...",,"Design, develop, test and deliver custom softw...",Keep your skills current with 20 x paid profes...,Flexible & inclusive work environment,Information & Communication Technology,Consultants,Adelaide,Full Time,,,Readify helps organizations innovate with tech...
3,38833950,Senior Commercial Property Manager | Leading T...,~ Rare opportunity for a Senior PM to step int...,<HTML><p><strong>WayPoint Recruitment&nbsp;</s...,{'additionalSalaryText': '$140k + Car Park - C...,$140k + Car Park - Call James Calleja 0430 058...,,,,Real Estate & Property,"Commercial Sales, Leasing & Property Mgmt",Melbourne,Full Time,CBD & Inner Suburbs,Melbourne,WayPoint Recruitment&nbsp; have partnered up w...
4,38856271,Technology Manager | Travel Industry,Rare opportunity for an experienced Technology...,<HTML>This is a key role within a market leadi...,{'standout': {'bullet1': 'Lead overarching str...,$110k - $120k p.a. + Numerous Perks!,Lead overarching strategy around Technology wi...,You will be responsible for all Technology and...,Competitive Salary package of $110K - $120K + ...,Information & Communication Technology,Management,Auckland,Full Time,,,This is a key role within a market leading Tra...


In [6]:
# Generate null percentage dataframe
null_df = generate_null_percentage(job_df)
null_df

Unnamed: 0,Column,Null Count,Null Percentage
5,metadata.additionalSalaryText,33651,67.3
6,metadata.standout.bullet1,23315,46.63
7,metadata.standout.bullet2,23315,46.63
8,metadata.standout.bullet3,23315,46.63
13,metadata.area.name,17156,34.31
14,metadata.suburb.name,12998,26.0
0,id,0,0.0
1,title,0,0.0
2,abstract,0,0.0
3,content,0,0.0


In [7]:
columns_to_plot_distribution = [
    'metadata.classification.name',
    'metadata.subClassification.name',
    'metadata.location.name',
    'metadata.area.name',
    'metadata.suburb.name',
    'metadata.workType.name'
]

for col in columns_to_plot_distribution:
    plot_distribution(job_df, col)
# plot_distribution()

In [8]:
columns_to_plot_word_count = [
    'abstract',
    'cleaned_content'
]

for col in columns_to_plot_word_count:
    plot_word_count_distribution(job_df, col)