# Data Analyst Job Market: Geographic Salary Analysis

This notebook analyzes salary distributions across US regions and states using interactive visualizations.

## Setup and Imports

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn whitegrid theme and a wide default matplotlib canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# pandas display: never truncate columns, show floats with two digits
pd.options.display.max_columns = None
pd.options.display.precision = 2

## 1. Load and Prepare Data

**Note:** Adjust the file path to match your data location

In [None]:
# Load your job data
# df = pd.read_csv('path/to/your/job_data.csv')

# NOTE: no sample data is created in this cell. Uncomment the read_csv line
# above (or replace it with your own loading code) so `df` exists for later cells

# Expected columns:
# - job_title
# - location (city, state format or state abbreviation)
# - salary_min
# - salary_max
# - salary_avg (or calculate as (min + max) / 2)
# - experience_level
# - remote_type (remote/hybrid/onsite)

# Display first few rows
# df.head()

## 2. Data Cleaning and Feature Engineering

In [None]:
# Example data cleaning steps - adjust based on your data

def clean_salary_data(df):
    """
    Clean and standardize salary data without modifying the input frame.

    Parameters:
    -----------
    df : DataFrame with 'salary_avg', or with both 'salary_min' and
         'salary_max' (from which 'salary_avg' is derived as their midpoint).
         If 'state' is absent and 'location' is present, 'state' is derived
         from 'location'.

    Returns:
    --------
    New DataFrame restricted to rows whose 'salary_avg' lies in
    [40000, 200000] (rows with a missing salary are dropped as well).

    Raises:
    -------
    KeyError if 'salary_avg' is absent and cannot be derived.
    """
    # Calculate average salary if not present; assign() returns a new frame,
    # so the caller's DataFrame is left untouched
    if 'salary_avg' not in df.columns and {'salary_min', 'salary_max'}.issubset(df.columns):
        df = df.assign(salary_avg=(df['salary_min'] + df['salary_max']) / 2)

    if 'salary_avg' not in df.columns:
        raise KeyError(
            "clean_salary_data needs 'salary_avg' or both 'salary_min' and 'salary_max'"
        )

    # Remove outliers (salaries outside the reasonable range, bounds inclusive)
    in_range = df['salary_avg'].between(40000, 200000)
    df = df.loc[in_range].copy()

    # Extract state from location if needed
    if 'state' not in df.columns and 'location' in df.columns:
        # Accepts "City, ST", "City ST" or just "ST", with optional trailing
        # whitespace. The word boundary keeps longer all-caps tokens such as
        # "USA" from being read as the state "SA".
        df['state'] = df['location'].str.extract(r'\b([A-Z]{2})\s*$')[0]

    return df

# Apply cleaning
# df = clean_salary_data(df)

# Check data types and missing values
# df.info()
# df.describe()

## 3. Define US Regions

Group states into standard Census regions for analysis

In [None]:
# US Census Bureau regional definitions: the 50 states plus the District of
# Columbia, which the Census Bureau places in the South region
US_REGIONS = {
    'Northeast': ['CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'NY', 'PA'],
    'Midwest': ['IL', 'IN', 'MI', 'OH', 'WI', 'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'],
    'South': ['DE', 'DC', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV', 'AL', 'KY', 'MS', 'TN',
              'AR', 'LA', 'OK', 'TX'],
    'West': ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'AK', 'CA', 'HI', 'OR', 'WA']
}

# Reverse mapping (state abbreviation -> region name)
STATE_TO_REGION = {
    state: region
    for region, states in US_REGIONS.items()
    for state in states
}

# Add region column to dataframe
def add_region(df):
    """
    Attach a 'region' column looked up from each row's 'state'.

    The column is written onto `df` itself and that same frame is returned.
    States that are not keys of STATE_TO_REGION get a missing value.
    """
    region_per_row = df['state'].map(STATE_TO_REGION)
    df['region'] = region_per_row
    return df

# df = add_region(df)

## 4. Calculate Regional Statistics

In [None]:
# Aggregate by region
def calculate_regional_stats(df):
    """
    Aggregate salary statistics per region.

    Parameters:
    -----------
    df : DataFrame with columns ['region', 'salary_avg']

    Returns:
    --------
    DataFrame with one row per region and columns ['region', 'mean_salary',
    'median_salary', 'std_salary', 'min_salary', 'max_salary', 'job_count'],
    values rounded to whole numbers.
    """
    # job_count counts rows that carry a salary, the same definition
    # calculate_state_stats uses, so a posting with a missing job_title is
    # still counted and no job_title column is required.
    regional_stats = (
        df.groupby('region')
        .agg(
            mean_salary=('salary_avg', 'mean'),
            median_salary=('salary_avg', 'median'),
            std_salary=('salary_avg', 'std'),
            min_salary=('salary_avg', 'min'),
            max_salary=('salary_avg', 'max'),
            job_count=('salary_avg', 'count'),
        )
        .round(0)
    )

    return regional_stats.reset_index()

# Calculate state-level statistics
def calculate_state_stats(df):
    """
    Aggregate salary statistics per state.

    Parameters:
    -----------
    df : DataFrame with columns ['state', 'salary_avg', 'region']

    Returns:
    --------
    DataFrame with one row per state and columns ['state', 'mean_salary',
    'median_salary', 'job_count', 'region']; numeric values are rounded to
    whole numbers and 'region' is the first region seen for the state.
    """
    per_state = df.groupby('state').agg(
        mean_salary=('salary_avg', 'mean'),
        median_salary=('salary_avg', 'median'),
        job_count=('salary_avg', 'count'),
        region=('region', 'first'),
    )
    per_state = per_state.round(0)

    return per_state.reset_index()

# regional_stats = calculate_regional_stats(df)
# state_stats = calculate_state_stats(df)

# Display results
# print("Regional Salary Statistics:")
# regional_stats.sort_values('mean_salary', ascending=False)

## 5. Interactive US Choropleth Map (Plotly)

This creates an interactive map showing salary by state

In [None]:
def create_salary_choropleth(state_stats, metric='mean_salary'):
    """
    Create interactive US map colored by salary

    Parameters:
    -----------
    state_stats : DataFrame with columns ['state', metric, 'job_count', 'region'];
                  'state' holds two-letter abbreviations (locationmode 'USA-states')
    metric : str, which salary metric to visualize ('mean_salary', 'median_salary', etc.)

    Returns:
    --------
    The plotly figure. Its title and colorbar label name the chosen metric.
    """
    # Wording for the known metrics; the default metric keeps the original
    # "Average" / "Avg Salary ($)" text.
    known_wording = {
        'mean_salary': ('Average Salary', 'Avg Salary ($)'),
        'median_salary': ('Median Salary', 'Median Salary ($)'),
    }
    if metric in known_wording:
        title_phrase, colorbar_label = known_wording[metric]
    else:
        # Any other column: derive readable wording from the column name
        title_phrase = metric.replace('_', ' ').title()
        colorbar_label = f'{title_phrase} ($)'

    fig = px.choropleth(
        state_stats,
        locations='state',
        locationmode='USA-states',
        color=metric,
        hover_name='state',
        hover_data={
            'state': False,       # already shown as the hover title
            metric: ':$,.0f',
            'job_count': ':,',
            'region': True
        },
        color_continuous_scale='RdYlGn',
        scope='usa',
        title=f'Data Analyst {title_phrase} by State',
        labels={metric: colorbar_label}
    )

    fig.update_layout(
        geo=dict(
            showlakes=True,
            lakecolor='rgb(255, 255, 255)'
        ),
        height=600,
        title_font_size=20
    )

    return fig

# Create and display map
# fig = create_salary_choropleth(state_stats)
# fig.show()

# Save to HTML for your portfolio
# fig.write_html('salary_map_by_state.html')

## 6. Regional Comparison Visualizations

In [None]:
def create_regional_comparison(regional_stats):
    """
    Build a grouped bar chart with mean and median salary side by side
    for every region.

    Parameters:
    -----------
    regional_stats : DataFrame with columns ['region', 'mean_salary', 'median_salary']
    """
    def as_dollars(amount):
        return f'${amount:,.0f}'

    # (column, legend entry, bar color) for each series, in drawing order
    series_specs = (
        ('mean_salary', 'Mean Salary', 'lightblue'),
        ('median_salary', 'Median Salary', 'coral'),
    )

    fig = go.Figure()
    for column, legend_name, fill in series_specs:
        values = regional_stats[column]
        fig.add_trace(go.Bar(
            x=regional_stats['region'],
            y=values,
            name=legend_name,
            marker_color=fill,
            text=values.apply(as_dollars),
            textposition='outside'
        ))

    layout_options = {
        'title': 'Average Data Analyst Salary by US Region',
        'xaxis_title': 'Region',
        'yaxis_title': 'Salary ($)',
        'barmode': 'group',
        'height': 500,
        'showlegend': True,
    }
    fig.update_layout(**layout_options)

    return fig

# fig = create_regional_comparison(regional_stats)
# fig.show()

## 7. Box Plot: Salary Distribution by Region

In [None]:
def create_salary_boxplot(df):
    """
    Draw one salary box per region, with outlier points shown.

    Parameters:
    -----------
    df : DataFrame with columns ['region', 'salary_avg']
    """
    axis_labels = {'salary_avg': 'Salary ($)', 'region': 'Region'}

    fig = px.box(
        df,
        x='region',
        y='salary_avg',
        color='region',
        points='outliers',
        labels=axis_labels,
        title='Salary Distribution by US Region'
    )

    # The x axis already names each region, so the legend is hidden
    fig.update_layout(showlegend=False, height=500)

    return fig

# fig = create_salary_boxplot(df)
# fig.show()

## 8. Top 10 States by Average Salary

In [None]:
def create_top_states_chart(state_stats, top_n=10):
    """
    Horizontal bar chart of the `top_n` states with the highest mean salary.

    Parameters:
    -----------
    state_stats : DataFrame with columns ['state', 'mean_salary', 'job_count']
    top_n : int, number of states to show
    """
    ranked = state_stats.nlargest(top_n, 'mean_salary')
    salaries = ranked['mean_salary']

    def as_dollars(amount):
        return f'${amount:,.0f}'

    bars = go.Bar(
        orientation='h',
        x=salaries,
        y=ranked['state'],
        marker={
            'color': salaries,
            'colorscale': 'Viridis',
            'showscale': True,
        },
        text=salaries.apply(as_dollars),
        textposition='outside',
        # customdata supplies the job count referenced by the hover template
        customdata=ranked['job_count'],
        hovertemplate='<b>%{y}</b><br>Avg Salary: $%{x:,.0f}<br>Jobs: %{customdata}'
    )
    fig = go.Figure(bars)

    fig.update_layout(
        height=500,
        title=f'Top {top_n} States by Average Data Analyst Salary',
        xaxis_title='Average Salary ($)',
        yaxis_title='State',
        yaxis={'categoryorder': 'total ascending'}
    )

    return fig

# fig = create_top_states_chart(state_stats)
# fig.show()

## 9. Scatter Plot: Job Volume vs Average Salary by State

In [None]:
def create_volume_vs_salary_scatter(state_stats):
    """
    Scatter each state by job count (x, also marker size) against mean
    salary (y), colored by region.
    Helps identify high-opportunity states

    Parameters:
    -----------
    state_stats : DataFrame with columns ['state', 'job_count', 'mean_salary', 'region']
    """
    hover_formats = {'job_count': ':,', 'mean_salary': ':$,.0f'}
    axis_labels = {'job_count': 'Number of Jobs', 'mean_salary': 'Average Salary ($)'}

    fig = px.scatter(
        state_stats,
        x='job_count',
        y='mean_salary',
        color='region',
        size='job_count',
        size_max=40,
        hover_name='state',
        hover_data=hover_formats,
        labels=axis_labels,
        title='Job Volume vs Average Salary by State'
    )

    fig.update_layout(showlegend=True, height=600)

    return fig

# fig = create_volume_vs_salary_scatter(state_stats)
# fig.show()

## 10. Statistical Summary Table

In [None]:
def create_summary_table(regional_stats):
    """
    Create formatted summary table

    Parameters:
    -----------
    regional_stats : DataFrame with columns ['mean_salary', 'median_salary',
                     'min_salary', 'max_salary', 'job_count'] and, optionally,
                     'std_salary'

    Returns:
    --------
    Copy of `regional_stats` whose salary columns are currency strings
    (e.g. '$85,000', or 'N/A' for a missing value) and whose 'job_count'
    is a thousands-separated string. The input frame is not modified.
    """
    def as_currency(value):
        # A single-posting region has no standard deviation; show 'N/A'
        # instead of the literal '$nan'
        return 'N/A' if pd.isna(value) else f'${value:,.0f}'

    summary = regional_stats.copy()

    # Format salary columns as currency; std_salary is also a dollar amount,
    # so it is formatted too whenever the table carries it
    salary_cols = ['mean_salary', 'median_salary', 'min_salary', 'max_salary']
    if 'std_salary' in summary.columns:
        salary_cols.append('std_salary')
    for col in salary_cols:
        summary[col] = summary[col].apply(as_currency)

    summary['job_count'] = summary['job_count'].apply(lambda x: f'{x:,}')

    return summary

# summary_table = create_summary_table(regional_stats)
# summary_table

## 11. Experience Level Analysis by Region

If your data includes experience levels, analyze how they vary by region

In [None]:
def analyze_experience_by_region(df):
    """
    Compare salaries across experience levels by region

    Parameters:
    -----------
    df : DataFrame with columns ['region', 'experience_level', 'salary_avg']

    Returns:
    --------
    (fig, exp_analysis) : the grouped bar chart and the underlying table with
    columns ['region', 'experience_level', 'mean_salary', 'job_count'],
    values rounded to whole numbers.
    """
    # Group by region and experience level. job_count counts rows that carry
    # a salary, so a posting with a missing job_title is still counted and
    # no job_title column is required.
    exp_analysis = (
        df.groupby(['region', 'experience_level'])
        .agg(
            mean_salary=('salary_avg', 'mean'),
            job_count=('salary_avg', 'count'),
        )
        .round(0)
        .reset_index()
    )

    # Create grouped bar chart
    fig = px.bar(
        exp_analysis,
        x='region',
        y='mean_salary',
        color='experience_level',
        barmode='group',
        title='Average Salary by Experience Level and Region',
        labels={'mean_salary': 'Average Salary ($)', 'region': 'Region'},
        text='mean_salary'
    )

    # Show each bar's value as a dollar amount above the bar
    fig.update_traces(texttemplate='$%{text:,.0f}', textposition='outside')
    fig.update_layout(height=500)

    return fig, exp_analysis

# If you have experience_level column:
# fig, exp_data = analyze_experience_by_region(df)
# fig.show()

## 12. Key Insights and Recommendations

Based on the visualizations above, document your findings:

### Geographic Findings:

1. **Highest Paying Regions:**
   - [List top regions with specific numbers]
   
2. **Job Volume Leaders:**
   - [States/regions with most opportunities]
   
3. **Best Value Markets:**
   - [High salary + high job volume = best opportunity]
   
4. **Experience Level Variations:**
   - [How entry vs senior salaries differ by region]
   
5. **Remote Work Impact:**
   - [If you have remote data, compare remote vs onsite by region]

## 13. Export Results

Save your findings for your portfolio

In [None]:
# Save summary statistics to CSV (written to the current working directory)
# regional_stats.to_csv('regional_salary_summary.csv', index=False)
# state_stats.to_csv('state_salary_summary.csv', index=False)

# Save key visualizations as HTML (interactive)
# NOTE: the cells above assign every figure to `fig`. Keep named references
# before exporting, e.g.
#   salary_map = create_salary_choropleth(state_stats)
#   regional_comparison = create_regional_comparison(regional_stats)
# and make sure the outputs/ directory exists.
# salary_map.write_html('outputs/salary_choropleth_map.html')
# regional_comparison.write_html('outputs/regional_comparison.html')

# Or save as static images (PNG)
# NOTE(review): static export needs an image-export backend installed
# (e.g. kaleido) - confirm for your plotly version
# salary_map.write_image('outputs/salary_map.png', width=1200, height=800)

# Nothing is written while the export lines above stay commented out, so the
# message no longer claims that files were saved
print("Analysis complete! Any exports enabled above have been written.")

---

## Next Steps:

1. **Skills Analysis**: Analyze which skills command highest salaries by region
2. **Remote vs Onsite**: Compare salary differences for remote work
3. **Time Trends**: If you have posting dates, show how the market changed over time
4. **Company Size Analysis**: Startup vs Enterprise salary differences

---

*Created for the Data Analyst Job Market Analysis project*