In [None]:
import altair as alt
import pandas as pd
import numpy as np

# Turn off Altair's row limit (max_rows=None) so larger datasets can be charted
alt.data_transformers.enable("default", max_rows=None)


In [None]:
# Raw job postings, read from a CSV in the notebook's working directory
df = pd.read_csv(filepath_or_buffer="postings.csv")
def prepare_data(df):
    """Prepare and clean the job postings dataset.

    Works on a copy of ``df``; the input frame is not modified.

    Steps:
      1. Build ``salary`` from ``normalized_salary`` when that column exists,
         otherwise from the row-wise mean of ``min_salary``, ``max_salary``
         and ``med_salary``.
      2. Drop rows missing ``salary`` or ``formatted_experience_level`` and
         keep only salaries within [20000, 500000].
      3. Derive ``experience_level``, ``remote``, ``engagement_rate``,
         ``work_type`` and ``salary_bin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw postings. Columns read: ``formatted_experience_level``,
        ``remote_allowed``, ``applies``, ``views``, ``formatted_work_type``
        and either ``normalized_salary`` or the three
        ``min_salary``/``max_salary``/``med_salary`` columns.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with the derived columns added. Original index labels
        of the surviving rows are kept.
    """
    data = df.copy()

    # Clean salary data
    if 'normalized_salary' in data.columns:
        data['salary'] = data['normalized_salary']
    else:
        data['salary'] = data[['min_salary', 'max_salary', 'med_salary']].mean(axis=1)

    # Remove outliers and missing values
    data = data.dropna(subset=['salary', 'formatted_experience_level'])
    # The explicit .copy() detaches the filtered rows from the pre-filter
    # frame; without it the column assignments below raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    data = data[data['salary'].between(20000, 500000)].copy()

    # Clean experience level
    # NOTE: rows with a missing level were already dropped above, so this
    # fillna currently changes nothing; it is kept as a safeguard.
    data['experience_level'] = data['formatted_experience_level'].fillna('Not Specified')

    # Remote flag: missing values count as "not remote"
    data['remote'] = data['remote_allowed'].fillna(0).astype(bool)

    # Engagement rate as a percentage, clipped to [0, 100]. The +1 keeps the
    # denominator non-zero when views is 0. Rows with missing applies or
    # views end up with a NaN engagement_rate.
    data['engagement_rate'] = (data['applies'] / (data['views'] + 1)) * 100
    data['engagement_rate'] = data['engagement_rate'].clip(0, 100)

    # Work type
    data['work_type'] = data['formatted_work_type'].fillna('Not Specified')

    # Salary bins (pd.cut default: intervals closed on the right)
    data['salary_bin'] = pd.cut(
        data['salary'],
        bins=[0, 50000, 75000, 100000, 150000, 500000],
        labels=['<50K', '50-75K', '75-100K', '100-150K', '>150K'],
    )

    return data

# Run the cleaning pipeline on the raw postings
data = prepare_data(df)

# Size and schema of the cleaned frame
print(f"Data loaded: {len(data)} job postings")
print(f"Columns available: {list(data.columns)}")

# First rows of the columns the charts rely on
print("\nData preview:")
print(data[['experience_level', 'salary', 'work_type']].head(10))

# Category frequencies
print(f"\nExperience levels: {data['experience_level'].value_counts()}")
print(f"\nWork types: {data['work_type'].value_counts()}")

# Salary spread
print(f"\nSalary range: ${data['salary'].min():,.0f} - ${data['salary'].max():,.0f}")
print("\nSalary statistics:")
print(data['salary'].describe())

# Missing-value counts for the key columns, one line per column
print("\nChecking for NaN values:")
print("\n".join(
    f"NaN in {column}: {data[column].isna().sum()}"
    for column in ('salary', 'experience_level', 'work_type')
))


In [None]:
# Plot at most 10,000 rows; the fixed random_state keeps the draw repeatable
viz_data = data.sample(n=min(len(data), 10000), random_state=42).copy()

# One-shot summary of the sample that feeds the charts
print(
    f"Using {len(viz_data)} jobs for visualization",
    f"Salary range: ${viz_data['salary'].min():,.0f} to ${viz_data['salary'].max():,.0f}",
    f"Engagement range: {viz_data['engagement_rate'].min():.1f}% to {viz_data['engagement_rate'].max():.1f}%",
    sep="\n",
)


In [None]:
# VISUALIZATION 1: Scatter Plot with Brush Selection
# Interaction: Brush selection (manipulating view) + Filtering

# Interval (click-and-drag) selection that drives the point colouring below
brush = alt.selection_interval(name="brush")

# Salary vs. engagement scatter: points inside the brush are coloured by
# experience level, points outside it are drawn light gray
scatter = (
    alt.Chart(
        viz_data,
        width=800,
        height=400,
        title="Salary vs Engagement Rate - Brush to Select and Filter Below",
    )
    .mark_circle(size=80, opacity=0.7)
    .encode(
        x=alt.X("salary:Q")
        .scale(domain=[20000, 500000])
        .title("Annual Salary ($)")
        .axis(format="$,.0f"),
        y=alt.Y("engagement_rate:Q")
        .scale(domain=[0, 100])
        .title("Engagement Rate (%)"),
        color=alt.condition(
            brush,
            alt.Color("experience_level:N")
            .scale(scheme="tableau10")
            .title("Experience Level"),
            alt.value("lightgray"),
        ),
        tooltip=[
            alt.Tooltip("title:N", title="Job Title"),
            alt.Tooltip("salary:Q", title="Salary", format="$,.0f"),
            alt.Tooltip("engagement_rate:Q", title="Engagement %", format=".1f"),
            alt.Tooltip("experience_level:N", title="Experience"),
            alt.Tooltip("work_type:N", title="Work Type"),
            alt.Tooltip("remote:N", title="Remote"),
        ],
    )
    .add_params(brush)
)

# Job counts per work type, restricted to the rows inside the brush selection
bar_chart = (
    alt.Chart(
        viz_data,
        width=800,
        height=200,
        title="Work Type Distribution (Filtered by Brush Selection Above)",
    )
    .mark_bar()
    .encode(
        x=alt.X("count():Q").title("Number of Jobs"),
        y=alt.Y("work_type:N").sort("-x").title("Work Type"),
        color=alt.Color("work_type:N").scale(scheme="category20").legend(None),
        tooltip=[
            alt.Tooltip("work_type:N", title="Work Type"),
            alt.Tooltip("count():Q", title="Count"),
            alt.Tooltip("mean(salary):Q", title="Avg Salary", format="$,.0f"),
        ],
    )
    .transform_filter(brush)
)

# Combine visualization 1
# Vega-Lite shares non-positional scales (such as color) across concatenated
# views by default. Left shared, the scatter's experience-level colours and
# the bar chart's work-type colours would be merged into a single scale and
# legend with conflicting schemes. Resolving color independently lets each
# chart keep its own scale.
viz1 = alt.vconcat(scatter, bar_chart).properties(
    title={
        "text": "VISUALIZATION 1: Brush Selection",
        "subtitle": "Interaction: Click and drag on scatter plot to filter bar chart below",
        "fontSize": 16
    }
).resolve_scale(color='independent')
viz1

In [None]:
# VISUALIZATION 2: Heatmap with Click Selection + Navigation
# Interaction: Click selection (manipulating data) + Aggregation + Navigation

# Point selection keyed on experience_level: clicking a cell selects every
# cell that shares its experience level
click = alt.selection_point(fields=["experience_level"], name="click")

# Mean salary per (work type, experience level) cell; cells outside the
# selection are faded and lose their outline
heatmap = (
    alt.Chart(
        viz_data,
        width=600,
        height=300,
        title="Average Salary by Experience Level and Work Type - Click to See Details",
    )
    .mark_rect()
    .encode(
        x=alt.X("work_type:N").title("Work Type").axis(labelAngle=-45),
        y=alt.Y("experience_level:N").title("Experience Level"),
        color=alt.Color("mean(salary):Q")
        .title("Avg Salary")
        .scale(scheme="blues")
        .legend(format="$,.0f"),
        opacity=alt.condition(click, alt.value(1.0), alt.value(0.4)),
        stroke=alt.condition(click, alt.value("black"), alt.value(None)),
        strokeWidth=alt.condition(click, alt.value(3), alt.value(0)),
        tooltip=[
            alt.Tooltip("experience_level:N", title="Experience"),
            alt.Tooltip("work_type:N", title="Work Type"),
            alt.Tooltip("mean(salary):Q", title="Avg Salary", format="$,.0f"),
            alt.Tooltip("count():Q", title="Count"),
        ],
    )
    .add_params(click)
)

# Detail view: binned salary histogram, filtered by the click selection
detail_histogram = (
    alt.Chart(
        viz_data,
        width=600,
        height=200,
        title="Detailed Salary Distribution for Selected Experience Level",
    )
    .mark_bar()
    .encode(
        x=alt.X("salary:Q")
        .bin(maxbins=20)
        .title("Salary")
        .axis(format="$,.0f"),
        y=alt.Y("count():Q").title("Number of Jobs"),
        color=alt.value("steelblue"),
        tooltip=[
            alt.Tooltip(
                "salary:Q",
                bin=alt.Bin(maxbins=20),
                title="Salary Range",
                format="$,.0f",
            ),
            alt.Tooltip("count():Q", title="Count"),
        ],
    )
    .transform_filter(click)
)

# Stack the heatmap above its detail histogram under one overall title
viz2 = alt.vconcat(
    heatmap,
    detail_histogram,
    title=alt.TitleParams(
        text="VISUALIZATION 2: Click Selection with Detail View",
        subtitle="Interaction: Click on heatmap cell to see detailed salary distribution below",
        fontSize=16,
    ),
)
viz2