In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import statsmodels.api as sm

# Set default theme for better looking plots
pio.templates.default = 'plotly_white'

# Custom color palette
custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

In [7]:
df = pd.read_csv(r'C:\Users\LENOVO\Downloads\Linkedin-analysis-project\graphs\jobs.csv')
df

Unnamed: 0,Employment type,Industries,Job function,Seniority level,company,company_id,context,date,description,education,location,months_experience,post_id,post_url,sal_high,sal_low,salary,title
0,Full-time,Broadcast Media,Information Technology,Mid-Senior level,CyberCoders,21836.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",,Job Title: Senior Data Engineer Location: Alex...,bachelor degree,"Alexandria, VA",60.0,2632814552,https://www.linkedin.com/jobs/view/senior-data...,,,,Senior Data Engineer
1,Full-time,"Hospital & Health Care, Medical Devices, and P...",Engineering and Information Technology,Not Applicable,Johnson & Johnson,1207.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",,"Ethicon, part of Johnson & Johnson Medical Dev...",bachelor degree,"Santa Clara, CA",96.0,2632810866,https://www.linkedin.com/jobs/view/principal-f...,,,,Principal Full Stack Software Engineer.
2,Full-time,"Computer Hardware, Computer Software, and Info...",Engineering and Information Technology,Not Applicable,Microsoft,1035.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",,Microsoft’s WCB health team is looking for a S...,bachelor degree,"Washington, DC",84.0,2632812746,https://www.linkedin.com/jobs/view/senior-soft...,189000.0,120000.0,"$120,000.00/yr - $189,000.00/yr",Senior Software Engineer
3,Full-time,"Computer Hardware, Computer Software, and Info...",Engineering and Information Technology,Not Applicable,Microsoft,1035.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",,Microsoft’s WCB health team is looking for a S...,bachelor degree,"Reston, VA",84.0,2632817427,https://www.linkedin.com/jobs/view/senior-soft...,189000.0,120000.0,"$120,000.00/yr - $189,000.00/yr",Senior Software Engineer
4,Full-time,"Computer Hardware, Computer Software, and Info...",Engineering and Information Technology,Not Applicable,Microsoft,1035.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",,Microsoft’s WCB health team is looking for a S...,bachelor degree,"Irving, TX",84.0,2632817426,https://www.linkedin.com/jobs/view/senior-soft...,175000.0,108000.0,"$108,000.00/yr - $175,000.00/yr",Senior Software Engineer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8256,Contract,,,,Sky Solutions,18682633.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",2021-10-17 23:50:58.294116,Primary Skills: 1. Good hands on experience on...,bachelor degree,"Malvern, PA",,2750535238,https://www.linkedin.com/jobs/view/site-reliab...,,,,Site Reliability Engineer
8257,Full-time,"Marketing and Advertising, Computer Software, ...",Information Technology,Associate,Zillow,13990.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",2021-10-17 23:51:01.950194,About The Team The SRE team at Zillow Group em...,bachelor degree,United States,60.0,2743905746,https://www.linkedin.com/jobs/view/senior-site...,,,,Senior Site Reliability Engineer
8258,Full-time,"Computer Software, Consumer Services, and Ente...",Engineering and Information Technology,Not Applicable,PlayStation,1254.0,"{""@context"": ""http://schema.org"", ""@type"": ""Jo...",2021-10-17 23:51:05.271522,PlayStation isn’t just the Best Place to Play ...,bachelor degree,"San Diego, CA",84.0,2740102323,https://www.linkedin.com/jobs/view/sr-site-rel...,162000.0,129000.0,"$129,000.00/yr - $162,000.00/yr",Sr. Site Reliability Engineer
8259,,,,,Flowspace,,,2021-10-17 23:51:08.758745,,,"Los Angeles, CA",,2754349591,https://www.linkedin.com/jobs/view/site-reliab...,,,,Site Reliability / DevOps Engineer (Remote)


In [9]:
# Read and prepare the data
df = pd.read_csv(r'C:\Users\LENOVO\Downloads\Linkedin-analysis-project\graphs\jobs.csv')

# Clean salary data
df['sal_low'] = pd.to_numeric(df['sal_low'], errors='coerce')
df['sal_high'] = pd.to_numeric(df['sal_high'], errors='coerce')
df['avg_salary'] = (df['sal_low'] + df['sal_high']) / 2

# Format salary for better display
df['formatted_salary'] = df['avg_salary'].apply(lambda x: f'${x:,.0f}')

In [15]:
# Get top 10 job functions by average salary
top_10_jobs = df.groupby('Job function')['avg_salary'].mean().sort_values(ascending=False).head(10)

# Create bar chart for top 10 job functions
fig1 = px.bar(
    x=top_10_jobs.index,
    y=top_10_jobs.values,
    title='Top 10 Highest Paying Job Functions',
    template='plotly_white',
    color=top_10_jobs.values,
    color_continuous_scale='RdYlBu_r'  # Changed to a more professional color scale
)

# Update layout for better readability and aesthetics
fig1.update_layout(
    title={
        'text': 'Top 10 Highest Paying Job Functions',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=28, color='#1f1f1f', family='Arial Black')
    },
    xaxis=dict(
        title='Job Function',
        tickangle=-45,
        tickfont=dict(size=12, family='Arial'),
        title_font=dict(size=16, color='#1f1f1f', family='Arial'),
        gridcolor='#f0f0f0',
        zerolinecolor='#f0f0f0'
    ),
    yaxis=dict(
        title='Average Salary (USD)',
        title_font=dict(size=16, color='#1f1f1f', family='Arial'),
        tickfont=dict(size=12, family='Arial'),
        gridcolor='#f0f0f0',
        zerolinecolor='#f0f0f0',
        tickformat=',.0f'  # Format y-axis ticks with commas
    ),
    showlegend=False,
    margin=dict(t=120, b=100, l=100, r=50),
    plot_bgcolor='white',
    paper_bgcolor='white',
    height=600,  # Increased height for better visibility
    width=1000   # Increased width for better spacing
)

# Update traces for better appearance
fig1.update_traces(
    marker=dict(
        line=dict(width=1.5, color='#ffffff'),  # White border around bars
        opacity=0.85  # Slightly transparent for softer look
    ),
    text=top_10_jobs.values.round(0),
    texttemplate='$%{text:,.0f}',
    textposition='outside',
    textfont=dict(size=12, family='Arial', color='#1f1f1f')
)

# Add a subtle grid
fig1.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
fig1.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')

fig1.show()

In [12]:
# Enhanced Violin Plot – Salary Distribution by Seniority Level
fig2 = px.violin(df, x='Seniority level', y='avg_salary',
                box=True,
                points='outliers',
                title='Salary Distribution by Seniority Level',
                color='Seniority level',
                color_discrete_sequence=custom_colors)

fig2.update_layout(
    title_x=0.5,
    title_font_size=20,
    showlegend=False,
    xaxis_title='Seniority Level',
    yaxis_title='Salary Distribution (USD)',
    xaxis_tickangle=-45,
    height=600,
    yaxis_tickformat='$,.0f',
    plot_bgcolor='white',
    hoverlabel=dict(bgcolor='white'),
    margin=dict(t=100)
)

fig2.show()

In [18]:
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

# Clean the data by removing inf and NaN values
clean_salary = df['avg_salary'].replace([np.inf, -np.inf], np.nan).dropna()

# Enhanced Histogram – Salary Distribution
fig3 = px.histogram(clean_salary, x='avg_salary',
                   nbins=50,
                   title='Salary Distribution Across All Jobs',
                   color_discrete_sequence=['#2c3e50'],
                   opacity=0.8)

# Update layout for better readability and aesthetics
fig3.update_layout(
    title={
        'text': 'Salary Distribution Across All Jobs',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=28, color='#1f1f1f', family='Arial Black')
    },
    xaxis=dict(
        title='Salary (USD)',
        title_font=dict(size=16, color='#1f1f1f', family='Arial'),
        tickfont=dict(size=12, family='Arial'),
        tickformat=',.0f',
        gridcolor='#f0f0f0',
        zerolinecolor='#f0f0f0'
    ),
    yaxis=dict(
        title='Number of Jobs',
        title_font=dict(size=16, color='#1f1f1f', family='Arial'),
        tickfont=dict(size=12, family='Arial'),
        gridcolor='#f0f0f0',
        zerolinecolor='#f0f0f0'
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    bargap=0.1,
    showlegend=False,
    margin=dict(t=120, b=80, l=80, r=50),
    height=600,
    width=1000
)

# Add a kernel density estimate curve with improved styling
fig3.add_trace(
    go.Scatter(
        x=clean_salary.sort_values(),
        y=stats.gaussian_kde(clean_salary)(clean_salary.sort_values()) * len(clean_salary) * (clean_salary.max() - clean_salary.min()) / 50,
        mode='lines',
        name='Density Curve',
        line=dict(
            color='#e74c3c',
            width=3,
            dash='solid'
        ),
        fill='tozeroy',
        fillcolor='rgba(231, 76, 60, 0.1)'
    )
)

# Add a legend for the density curve
fig3.update_layout(
    showlegend=True,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='#f0f0f0',
        borderwidth=1,
        font=dict(size=12, family='Arial')
    )
)

# Add a subtle grid
fig3.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
fig3.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')

# Add hover template for better interactivity
fig3.update_traces(
    hovertemplate="<b>Salary Range:</b> $%{x:,.0f}<br>" +
                  "<b>Number of Jobs:</b> %{y}<br>" +
                  "<extra></extra>"
)

fig3.show()

In [23]:
# Enhanced Scatter Plot – Experience vs. Salary
fig4 = px.scatter(df, x='months_experience', y='avg_salary',
                 title='Experience vs. Salary Correlation',
                 color='Job function',
                 size='avg_salary',
                 size_max=20,  # Increased max size for better visibility
                 trendline='ols',
                 color_discrete_sequence=px.colors.qualitative.Set3,  # Pleasant color palette
                 hover_data=['title', 'formatted_salary'])

# Update layout for better readability and aesthetics
fig4.update_layout(
    title={
        'text': 'Experience vs. Salary Correlation',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=28, color='#1f1f1f', family='Arial Black')
    },
    xaxis=dict(
        title='Experience (Months)',
        title_font=dict(size=16, color='#1f1f1f', family='Arial'),
        tickfont=dict(size=12, family='Arial'),
        gridcolor='#f0f0f0',
        zerolinecolor='#f0f0f0',
        showgrid=True
    ),
    yaxis=dict(
        title='Salary (USD)',
        title_font=dict(size=16, color='#1f1f1f', family='Arial'),
        tickfont=dict(size=12, family='Arial'),
        tickformat=',.0f',
        gridcolor='#f0f0f0',
        zerolinecolor='#f0f0f0',
        showgrid=True
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    height=700,  # Increased height
    width=1000,  # Increased width
    margin=dict(t=120, b=80, l=80, r=150),  # Adjusted margins
    hoverlabel=dict(
        bgcolor='white',
        font_size=12,
        font_family='Arial'
    )
)

# Update legend
fig4.update_layout(
    showlegend=True,
    legend=dict(
        title='Job Function',
        title_font=dict(size=14, family='Arial', color='#1f1f1f'),
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.02,
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='#f0f0f0',
        borderwidth=1,
        font=dict(size=12, family='Arial')
    )
)

# Update traces for better appearance
fig4.update_traces(
    marker=dict(
        line=dict(width=1, color='#ffffff'),
        opacity=0.7
    ),
    selector=dict(mode='markers')
)

# Update trendline appearance
fig4.update_traces(
    line=dict(width=3, color='#2c3e50'),
    selector=dict(mode='lines')
)

# Add a subtle grid
fig4.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
fig4.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')

# Customize hover template
fig4.update_traces(
    hovertemplate="<b>Experience:</b> %{x} months<br>" +
                  "<b>Salary:</b> $%{y:,.0f}<br>" +
                  "<b>Job Title:</b> %{customdata[0]}<br>" +
                  "<b>Formatted Salary:</b> %{customdata[1]}<br>" +
                  "<extra></extra>"
)

fig4.show()

ValueError: 
    Invalid element(s) received for the 'size' property of scattergl.marker
        Invalid elements include: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]

    The 'size' property is a number and may be specified as:
      - An int or float in the interval [0, inf]
      - A tuple, list, or one-dimensional numpy array of the above

In [None]:
# Enhanced Bar Chart – Top 15 Highest Paying Jobs
title_salary = df.groupby('title')['avg_salary'].mean().nlargest(15).reset_index()

fig5 = px.bar(title_salary,
             x='avg_salary',
             y='title',
             orientation='h',
             title='Top 15 Highest Paying Job Titles',
             color='avg_salary',
             color_continuous_scale='viridis',
             text=title_salary['avg_salary'].apply(lambda x: f'${x:,.0f}'))

fig5.update_layout(
    title_x=0.5,
    title_font_size=20,
    xaxis_title='Average Salary (USD)',
    yaxis_title='Job Title',
    height=700,
    xaxis_tickformat='$,.0f',
    plot_bgcolor='white',
    hoverlabel=dict(bgcolor='white'),
    margin=dict(l=200, t=100, r=100)
    showlegend=False
)

fig5.update_traces(
    textposition='auto',
    hovertemplate='<b>%{y}</b><br>Salary: %{text}<extra></extra>'
)

fig5.show()