In [None]:
import pandas as pd
import numpy as np
import altair as alt
from vega_datasets import data as vega_data
# Turn off Altair's row-count guard for the active data transformer so charts
# can be built directly from the full postings table.
alt.data_transformers.disable_max_rows()  # allow large datasets


In [None]:
DATA_PATH = 'postings.csv'
# Read every column as text first; types are assigned explicitly below so that
# malformed cells become NaN/NaT instead of silently changing a column's dtype.
df = pd.read_csv(DATA_PATH, dtype=str)

# Convert numeric columns (only those actually present in the file)
num_cols = ['job_id','max_salary','med_salary','min_salary','views','applies','normalized_salary','fips','company_id']
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Convert timestamps, interpreted as epoch milliseconds.
# The columns are still strings at this point (dtype=str above). Newer pandas
# releases deprecate parsing strings together with `unit=` and will treat them
# as datetime strings instead, so cast to numbers before converting.
for tcol in ['listed_time','original_listed_time','expiry','closed_time']:
    if tcol in df.columns:
        epoch_ms = pd.to_numeric(df[tcol], errors='coerce')
        df[tcol] = pd.to_datetime(epoch_ms, unit='ms', errors='coerce')


In [None]:
def extract_state(loc):
    """Return the text after the last comma of a location value.

    Missing values and locations that contain no comma yield ``None``.
    The returned segment has surrounding whitespace removed.
    """
    if pd.isna(loc):
        return None
    _, comma, tail = str(loc).rpartition(',')
    if not comma:
        # No comma at all: there is no trailing segment to report.
        return None
    return tail.strip()

# Derive 'state' from 'location' when that column exists; otherwise fill with None.
if 'location' in df.columns:
    df['state'] = df['location'].apply(extract_state)
else:
    df['state'] = None


In [None]:
def compute_salary(row):
    """Pick the best available salary figure for one posting.

    Preference order: ``normalized_salary``, then ``med_salary``, then the
    midpoint of ``min_salary`` and ``max_salary`` when both are present.
    Returns ``np.nan`` when none of these can be used.

    ``row`` only needs to support ``.get(key)`` (e.g. a pandas Series or a dict).
    """
    for key in ('normalized_salary', 'med_salary'):
        value = row.get(key)
        if pd.notna(value):
            return value

    low = row.get('min_salary')
    high = row.get('max_salary')
    if pd.isna(low) or pd.isna(high):
        return np.nan
    return (low + high) / 2

# Row-wise salary using the fallback order implemented in compute_salary.
df['salary'] = df.apply(compute_salary, axis=1)
# Indicator column: 1 where a salary value was obtained, 0 where it is missing.
df['salary_disclosed'] = (~df['salary'].isna()).astype(int)

In [None]:
if 'remote_allowed' in df.columns:
    def to_remote(val):
        """Map one raw ``remote_allowed`` cell to 1 (remote) or 0 (otherwise).

        Missing values map to 0. Text is compared case-insensitively after
        trimming whitespace. Numeric text counts as remote when it equals 1,
        so both '1' and '1.0' are accepted.
        """
        if pd.isna(val):
            return 0
        text = str(val).strip().lower()
        if text in ('true', 'yes', 'remote', 'remote_allowed'):
            return 1
        # The frame is loaded as text, so a numeric flag can arrive as '1.0'
        # rather than '1'; compare by value instead of by exact spelling.
        try:
            return 1 if float(text) == 1.0 else 0
        except ValueError:
            return 0
    df['is_remote'] = df['remote_allowed'].apply(to_remote).astype(int)
else:
    # No source column: treat every posting as not remote.
    df['is_remote'] = 0

# Month bucket for each posting: listed_time collapsed to a monthly period and
# converted back to a timestamp. Falls back to NaT when listed_time is absent.
month_start = pd.NaT
if 'listed_time' in df.columns:
    month_start = df['listed_time'].dt.to_period('M').dt.to_timestamp()
df['month'] = month_start

# Coerce the engagement counters to numbers; unparseable cells become NaN.
for c in ('applies', 'views'):
    if c not in df.columns:
        continue
    df[c] = pd.to_numeric(df[c], errors='coerce')


In [None]:
import pandas as pd
import altair as alt

# Select Altair's 'default' renderer for chart display.
alt.renderers.enable('default')

# Load your real data
# NOTE(review): this rebinds `df` to a fresh read of the CSV, this time letting
# pandas infer dtypes (no dtype=str). Columns that earlier cells added to the
# previous `df` (state, salary, salary_disclosed, is_remote, month) and their
# type conversions are therefore not present on this frame.
df = pd.read_csv('postings.csv')   # or whatever your filename is
# List the column names available for the cells that follow.
print(df.columns)

In [None]:
# How many distinct location strings are there, and which experience labels occur?
n_locations = df['location'].nunique()
print(f"{n_locations} unique locations")
experience_levels = df['formatted_experience_level'].unique()
print(experience_levels)

# Preview the two columns side by side.
df.loc[:, ['location', 'formatted_experience_level']].head()

In [None]:
# Capture a two-letter uppercase code that follows a comma at the very end of
# the location string; rows without such a suffix are labelled 'Unknown'.
state_codes = df['location'].str.extract(r',\s*([A-Z]{2})$', expand=False)
df['state'] = state_codes.fillna('Unknown')

# Most frequent state labels (including 'Unknown').
print(df['state'].value_counts().head())

In [None]:
import altair as alt
import pandas as pd

def _count_remote(flags):
    """Count remote postings in one state's ``remote_allowed`` values.

    Object-dtype values are counted where they equal the string 'True';
    any other dtype is summed directly.
    """
    if flags.dtype == object:
        return (flags == 'True').sum()
    return flags.sum()


# One row per state: number of postings and number flagged as remote.
state_groups = df.groupby('state', dropna=False)
df_remote = state_groups.agg(
    Total=('job_id', 'count'),
    Remote=('remote_allowed', _count_remote),
).reset_index()

# Remote share of postings, as a percentage rounded to one decimal.
remote_share = df_remote['Remote'] / df_remote['Total'] * 100
df_remote['Remote_%'] = remote_share.round(1)

# Keep only rows that carry a real state label.
has_state = df_remote['state'].notna() & (df_remote['state'] != 'Unknown')
df_remote = df_remote[has_state]


In [None]:
import altair as alt
import pandas as pd

alt.renderers.enable('default')

# --- Per-state totals and remote counts feeding the bubble chart ---
remote_by_state = df.groupby('state', dropna=False).agg(
    Total=('job_id', 'count'),
    Remote=(
        'remote_allowed',
        # Numeric/boolean flags are summed; object-dtype flags are matched against 'True'.
        lambda col: col.sum() if col.dtype != object else (col == 'True').sum(),
    ),
)
df_remote = remote_by_state.reset_index()

# Percentage of each state's postings that are remote, one decimal place.
df_remote['Remote_%'] = df_remote['Remote'].div(df_remote['Total']).mul(100).round(1)

# Exclude rows whose state is missing or labelled 'Unknown'.
known_state = df_remote['state'].notna() & df_remote['state'].ne('Unknown')
df_remote = df_remote[known_state]

# --- Create the bubble chart ---
# One circle per state on a single horizontal band: circle size encodes the
# number of postings, colour encodes the remote percentage.
#
# The axis/view configuration is part of the assigned chart so that the object
# stored in `bubble_remote` (and anything later exported from it) is the same
# chart that this cell displays.
bubble_remote = (
    alt.Chart(df_remote)
    .mark_circle(opacity=0.7)
    .encode(
        # NOTE(review): sort='-y' orders states by the y channel, but y is the
        # constant `ypos`, so this is unlikely to produce a meaningful order.
        # Confirm whether sorting by Total or Remote_% was intended.
        x=alt.X('state:N', title='State', sort='-y'),
        y=alt.Y('ypos:Q', axis=None, scale=alt.Scale(domain=[0, 2])),  # fixed y position instead of alt.value()
        size=alt.Size('Total:Q', title='Total Postings', scale=alt.Scale(range=[50, 2000])),
        color=alt.Color('Remote_%:Q', title='Remote Jobs (%)', scale=alt.Scale(scheme='reds')),
        tooltip=[
            alt.Tooltip('state:N', title='State'),
            alt.Tooltip('Total:Q', title='Total Postings'),
            alt.Tooltip('Remote:Q', title='Remote Postings'),
            alt.Tooltip('Remote_%:Q', title='Remote %')
        ]
    )
    .transform_calculate(ypos='1')  # assign constant y value safely
    .properties(
        title='Remote Work Distribution by State (Bubble Size = Total Postings, Color = Remote %)',
        width=700,
        height=250
    )
    # Tidy up visuals: no grid, axis line, ticks or labels, and no view border.
    # NOTE(review): configure_axis is chart-wide, so labels=False also hides
    # the state names on the x-axis; confirm that is intended.
    .configure_axis(
        grid=False,
        domain=False,
        ticks=False,
        labels=False
    )
    .configure_view(
        strokeWidth=0
    )
)

bubble_remote


In [None]:
# Tabulate how often each pay_period value occurs (displayed as the cell output).
df['pay_period'].value_counts()


In [None]:
import numpy as np
import pandas as pd

# make numeric
salary_cols = ['min_salary', 'med_salary', 'max_salary']
for col in salary_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# conversion factors to annualized salary
factors = {
    'HOURLY': 40 * 52,    # 40 hours * 52 weeks
    'WEEKLY': 52,
    'BIWEEKLY': 26,
    'MONTHLY': 12,
    'YEARLY': 1
}

# Per-row multiplier, looked up once for the whole frame. pay_period is
# upper-cased before the lookup; missing or unrecognised periods map to NaN,
# which makes the annualized value NaN as well.
annual_factor = df['pay_period'].astype(str).str.upper().map(factors)

# create new annualized columns (vectorized: NaN salary or NaN factor -> NaN)
for col in salary_cols:
    df[f'{col}_annual'] = df[col] * annual_factor


In [None]:
# Side-by-side preview of the raw salary fields and their annualized versions.
raw_salary_cols = ['min_salary', 'med_salary', 'max_salary']
annual_salary_cols = [f'{name}_annual' for name in raw_salary_cols]
df[['pay_period', *raw_salary_cols, *annual_salary_cols]].head(10)


In [None]:
# Summary statistics of the annualized median salary (displayed as the cell output).
df['med_salary_annual'].describe()


In [None]:
# Trim the upper tail so a handful of very large values do not flatten the boxes.
# The percentile is defined once and reused in the chart title so the label
# always matches the cut-off actually applied.
trim_percentile = 99
upper_limit = df['med_salary_annual'].quantile(trim_percentile / 100)

# Rows usable for the plot: both the salary and the experience level are present.
df_box = df.dropna(subset=['med_salary_annual', 'formatted_experience_level'])

# Hand Altair only the two plotted columns; the chart embeds its data, so
# passing the full postings frame would serialize every unused column too.
box_data = df_box.loc[
    df_box['med_salary_annual'] <= upper_limit,
    ['formatted_experience_level', 'med_salary_annual'],
]

box_annual = (
    alt.Chart(box_data)
    .mark_boxplot(size=40)
    .encode(
        x=alt.X('formatted_experience_level:N',
                title='Experience Level',
                sort=['Internship', 'Entry level', 'Associate',
                      'Mid-Senior level', 'Director', 'Executive']),
        y=alt.Y('med_salary_annual:Q',
                title='Annualized Median Salary ($)',
                scale=alt.Scale(domain=[0, upper_limit])),
        color=alt.Color('formatted_experience_level:N',
                        legend=None,
                        scale=alt.Scale(scheme='oranges'))
    )
    .properties(
        title=f'Annualized Salary Distribution by Experience Level (Trimmed at {trim_percentile}th Percentile)',
        width=700,
        height=400
    )
)
box_annual


In [None]:
# Keep postings with a usable state and a recorded experience level.
# Missing values are excluded explicitly: a bare `!= 'Unknown'` comparison is
# True for NaN, so rows without an experience level would otherwise pass.
has_state = df['state'].notna() & (df['state'] != 'Unknown')
has_level = (
    df['formatted_experience_level'].notna()
    & (df['formatted_experience_level'] != 'Unknown')
)
df_heat = df[has_state & has_level].copy()

# Aggregate and plot: the chart counts rows per (state, experience level) cell.
# Only the two encoded columns are passed to Altair, because the chart embeds
# its data and the remaining posting columns are not used here.
heat = (
    alt.Chart(df_heat[['state', 'formatted_experience_level']])
    .mark_rect()
    .encode(
        # NOTE(review): sort='-y' orders states by the y channel, which here is
        # a nominal experience label rather than a count. Confirm whether
        # ordering states by number of postings was intended.
        x=alt.X('state:N', title='State', sort='-y'),
        y=alt.Y('formatted_experience_level:N', title='Experience Level'),
        color=alt.Color('count():Q', title='Number of Postings', scale=alt.Scale(scheme='blues')),
        tooltip=[
            alt.Tooltip('state:N', title='State'),
            alt.Tooltip('formatted_experience_level:N', title='Experience Level'),
            alt.Tooltip('count():Q', title='Number of Postings')
        ]
    )
    .properties(
        title='Job Postings by State and Experience Level',
        width=700,
        height=400
    )
)

heat

In [None]:
import os

# Make sure the output folder exists; an existing folder is not an error.
os.makedirs("charts", exist_ok=True)

# Save charts: each chart object is written to its PNG path in turn.
for chart, target in (
    (bubble_remote, 'charts/bubble_remote.png'),
    (box_annual, 'charts/box.png'),
    (heat, 'charts/chart_area.png'),
):
    chart.save(target)