In [17]:
import pandas as pd

eda = pd.read_csv('./data/eda_data.csv')
eda.head()

Unnamed: 0,COMPANY,LOCATION,POSTED,MIN_EDULEVELS_NAME,MAX_EDULEVELS_NAME,MIN_YEARS_EXPERIENCE,MAX_YEARS_EXPERIENCE,TITLE,SKILLS,SPECIALIZED_SKILLS,...,COMMON_SKILLS,SOFTWARE_SKILLS,SOC_2021_4_NAME,NAICS_2022_6,NAICS2_NAME,REMOTE_TYPE_NAME,SALARY,TITLE_NAME,SKILLS_NAME,SPECIALIZED_SKILLS_NAME
0,894731,"{\n ""lat"": 33.20763,\n ""lon"": -92.6662674\n}",2024-06-02,Bachelor's degree,Master's degree,2.0,2.0,ET29C073C03D1F86B4,"[\n ""KS126DB6T061MHD7RTGQ"",\n ""KS126706DPFD3...","[\n ""KS126DB6T061MHD7RTGQ"",\n ""KS128006L3V0H...",...,"[\n ""KS126706DPFD3354M7YK"",\n ""KS1280B68GD79...","[\n ""KS440W865GC4VRBW6LJP"",\n ""KS13USA80NE38...",Data Scientists,441330,Retail Trade,[None],116348.5,Enterprise Analysts,"[\n ""Merchandising"",\n ""Mathematics"",\n ""Pr...","[\n ""Merchandising"",\n ""Predictive Modeling""..."
1,133098,"{\n ""lat"": 44.3106241,\n ""lon"": -69.7794897\n}",2024-06-02,No Education Listed,Master's degree,3.0,3.0,ET21DDA63780A7DC09,"[\n ""KS122626T550SLQ7QZ1C"",\n ""KS123YJ6KVWC9...","[\n ""KS122626T550SLQ7QZ1C"",\n ""KS123YJ6KVWC9...",...,[],"[\n ""BGSBF3F508F7F46312E3"",\n ""ESEA839CED378...",Data Scientists,561320,Administrative and Support and Waste Managemen...,Remote,116348.5,Oracle Consultants,"[\n ""Procurement"",\n ""Financial Statements"",...","[\n ""Procurement"",\n ""Financial Statements"",..."
2,39063746,"{\n ""lat"": 32.7766642,\n ""lon"": -96.7969879\n}",2024-06-02,Bachelor's degree,Master's degree,5.0,3.773903,ET3037E0C947A02404,"[\n ""KS1218W78FGVPVP2KXPX"",\n ""ESF3939CE1F80...","[\n ""ESF3939CE1F80C10C327"",\n ""KS120GV6C72JM...",...,"[\n ""KS1218W78FGVPVP2KXPX"",\n ""BGS1ADAA36DB6...","[\n ""KS126HY6YLTB9R7XJC4Z""\n]",Data Scientists,524291,Finance and Insurance,[None],116348.5,Data Analysts,"[\n ""Management"",\n ""Exception Reporting"",\n...","[\n ""Exception Reporting"",\n ""Data Analysis""..."
3,37615159,"{\n ""lat"": 33.4483771,\n ""lon"": -112.0740373\n}",2024-06-02,No Education Listed,Master's degree,3.0,3.773903,ET2114E0404BA30075,"[\n ""KS123QX62QYTC4JF38H8"",\n ""KS7G6NP6R6L1H...","[\n ""KS123QX62QYTC4JF38H8"",\n ""KS441PQ64HT13...",...,"[\n ""KS7G6NP6R6L1H1SKFTSY"",\n ""KS1218W78FGVP...","[\n ""KS4409D76NW1S5LNCL18"",\n ""ESC7869CF7378...",Data Scientists,522110,Finance and Insurance,[None],116348.5,Management Analysts,"[\n ""Exit Strategies"",\n ""Reliability"",\n ""...","[\n ""Exit Strategies"",\n ""User Story"",\n ""H..."
4,0,"{\n ""lat"": 37.6392595,\n ""lon"": -120.9970014\n}",2024-06-02,No Education Listed,Master's degree,5.486539,3.773903,ET0000000000000000,[],[],...,[],[],Data Scientists,999999,Unclassified Industry,[None],92500.0,Unclassified,[],[]


In [18]:
# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
            'Data Science', 'Data Analysis','Data Analytics',  'Market Research Analyst' 
            'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
            'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']

match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)

eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
             | match('SKILLS_NAME') \
             | match('SPECIALIZED_SKILLS_NAME') 
eda['DATA_ANALYST_JOB'].value_counts()

DATA_ANALYST_JOB
False    37052
True     32148
Name: count, dtype: int64

In [25]:
import plotly.express as px
import plotly.graph_objects as go

# Group data (same as before)
df_grouped = eda.groupby(['DATA_ANALYST_JOB', 'NAICS2_NAME']).size().reset_index(name='Job_Count')

# Shorten the industry names (NAICS2_NAME) for better readability
# Assuming NAICS2_NAME has long names, we'll map them to shorter versions
# Example: Replace long names with abbreviations or shorter terms
industry_short_names = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services',
    # Add more mappings as needed based on your dataset
}

# Apply the shortened names to the dataframe
df_grouped['NAICS2_NAME_SHORT'] = df_grouped['NAICS2_NAME'].map(industry_short_names).fillna(df_grouped['NAICS2_NAME'])

# Define a vibrant and modern color palette
color_map = {
    False: "#FF6B6B",  # Coral red for False
    True: "#4ECDC4"    # Teal for True
}

# Create the bar plot with Plotly Express
fig = px.bar(df_grouped, 
             x='NAICS2_NAME_SHORT',  # Use shortened names
             y='Job_Count',
             color='DATA_ANALYST_JOB',
             title="Data & Business Analytics Job Trends",
             labels={'NAICS2_NAME_SHORT': 'Industry', 'Job_Count': 'Number of Jobs'},
             barmode='group',
             color_discrete_map=color_map
            )

# Beautify the layout with a modern, clean design
fig.update_layout(
    # Sizing and margins
    autosize=True,
    width=1100,  # Slightly narrower for better focus
    height=650,  # Adjusted height
    margin=dict(l=50, r=50, t=90, b=120),  # Adjusted margins for cleaner look

    # Background and plot styling
    plot_bgcolor='rgba(240, 240, 245, 1)',  # Softer gray background
    paper_bgcolor='rgba(255, 255, 255, 1)',  # White paper background
    font=dict(family="Helvetica, sans-serif", size=14, color="#2D3748"),  # Modern font

    # Title styling
    title=dict(
        text="Data & Business Analytics Job Trends",
        font=dict(size=26, color="#2D3748", family="Helvetica, sans-serif"),
        x=0.5,  # Center the title
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),

    # X-axis styling
    xaxis=dict(
        title="Industry",
        title_font=dict(size=18, color="#2D3748"),
        tickfont=dict(size=13, color="#4A5568"),
        tickangle=-30,  # Slightly less aggressive rotation
        gridcolor="rgba(200, 200, 200, 0.2)",  # Very light gridlines
        linecolor="#2D3748",
        linewidth=2,
        showline=True
    ),

    # Y-axis styling
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=18, color="#2D3748"),
        tickfont=dict(size=13, color="#4A5568"),
        range=[0, df_grouped['Job_Count'].max() * 1.15],  # Slightly tighter range
        gridcolor="rgba(200, 200, 200, 0.2)",
        linecolor="#2D3748",
        linewidth=2,
        showline=True
    ),

    # Legend styling
    legend=dict(
        title="Job Type",
        font=dict(size=13, color="#2D3748"),
        bgcolor="rgba(255, 255, 255, 0.95)",
        bordercolor="#2D3748",
        borderwidth=1,
        x=1.02,  # Position outside
        y=0.5,
        xanchor="left",
        yanchor="middle"
    ),

    # Hover and interactivity
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="rgba(255, 255, 255, 0.9)",
        font_size=13,
        font_family="Helvetica, sans-serif",
        font_color="#2D3748",
        bordercolor="#2D3748"
    ),

    # Add a subtle shadow effect for depth
    bargap=0.2,  # Add space between bars for clarity
)

# Customize the bars
fig.update_traces(
    marker=dict(
        line=dict(width=1.2, color="#2D3748"),  # Thinner border for elegance
        # Add a gradient effect to the bars
        coloraxis=None,
    ),
    opacity=0.85,  # Slight transparency for softness
    text=df_grouped['Job_Count'],  # Add labels on bars
    textposition='outside',  # Place labels outside for clarity
    textfont=dict(size=12, color="#2D3748", family="Helvetica, sans-serif"),
    texttemplate='%{text}',  # Show just the number
)

# Add a subtle annotation for the highest job count
max_job = df_grouped.loc[df_grouped['Job_Count'].idxmax()]
fig.add_annotation(
    x=max_job['NAICS2_NAME_SHORT'],
    y=max_job['Job_Count'] * 1.1,
    text=f"Top: {max_job['NAICS2_NAME_SHORT']}<br>{max_job['Job_Count']} Jobs",
    showarrow=True,
    arrowhead=1,
    ax=20,
    ay=-40,
    font=dict(size=13, color="#2D3748", family="Helvetica, sans-serif"),
    bgcolor="rgba(255, 255, 255, 0.85)",
    bordercolor="#2D3748",
    borderwidth=1,
    borderpad=4
)

# Show the plot
fig.show()

# Save as an HTML file for Quarto embedding
fig.write_html("figures/plot1_jobtrends_super_pretty.html")