<a href="https://colab.research.google.com/github/savitha14june/ai-impact-2030/blob/main/ai_impact_on_jobs_DA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

AI Impact on the Job Market by 2030

In [2]:
import pandas as pd

#Raw file URL from Github
raw_file_url = 'https://raw.githubusercontent.com/savitha14june/ai-impact-2030/refs/heads/main/AI_Impact_on_Jobs_2030.csv'

#Read csv file (fetched over the network from the raw GitHub URL above)
df = pd.read_csv(raw_file_url)

#DataFrame.info() prints its report itself and returns None, so it is called
#directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Job_Title                    3000 non-null   object 
 1   Average_Salary               3000 non-null   int64  
 2   Years_Experience             3000 non-null   int64  
 3   Education_Level              3000 non-null   object 
 4   AI_Exposure_Index            3000 non-null   float64
 5   Tech_Growth_Factor           3000 non-null   float64
 6   Automation_Probability_2030  3000 non-null   float64
 7   Risk_Category                3000 non-null   object 
 8   Skill_1                      3000 non-null   float64
 9   Skill_2                      3000 non-null   float64
 10  Skill_3                      3000 non-null   float64
 11  Skill_4                      3000 non-null   float64
 12  Skill_5                      3000 non-null   float64
 13  Skill_6           

Data cleanup

In [3]:
#Remove duplicates
#Rebinding df (instead of inplace=True) keeps the step explicit and chainable.
df = df.drop_duplicates()

#DataFrame.info() prints its report itself and returns None, so it is called
#directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Job_Title                    3000 non-null   object 
 1   Average_Salary               3000 non-null   int64  
 2   Years_Experience             3000 non-null   int64  
 3   Education_Level              3000 non-null   object 
 4   AI_Exposure_Index            3000 non-null   float64
 5   Tech_Growth_Factor           3000 non-null   float64
 6   Automation_Probability_2030  3000 non-null   float64
 7   Risk_Category                3000 non-null   object 
 8   Skill_1                      3000 non-null   float64
 9   Skill_2                      3000 non-null   float64
 10  Skill_3                      3000 non-null   float64
 11  Skill_4                      3000 non-null   float64
 12  Skill_5                      3000 non-null   float64
 13  Skill_6           

In [4]:
#Check unique values of each categorical column, one array per column
for categorical_column in ('Job_Title', 'Education_Level', 'Risk_Category'):
    print(df[categorical_column].unique())

['Security Guard' 'Research Scientist' 'Construction Worker'
 'Software Engineer' 'Financial Analyst' 'AI Engineer' 'Mechanic'
 'Teacher' 'HR Specialist' 'Customer Support' 'UX Researcher' 'Lawyer'
 'Data Scientist' 'Graphic Designer' 'Retail Worker' 'Doctor'
 'Truck Driver' 'Chef' 'Nurse' 'Marketing Manager']
["Master's" 'PhD' 'High School' "Bachelor's"]
['High' 'Low' 'Medium']


In [5]:
#Write df to cleanup file (row index is not written to the CSV)
cleanup_csv_path = 'ai_impact_data_cleanup.csv'
df.to_csv(cleanup_csv_path, index=False)

Graph on Automation Probability Distribution by Risk Category

In [6]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

#Automation Probability Distribution by Risk Category

#Min/max automation probability per risk category, computed with named
#aggregation so the result columns get their final names in a single step.
automation_range_by_risk = (
    df.groupby('Risk_Category')['Automation_Probability_2030']
    .agg(
        Min_Automation_Probability_2030='min',
        Max_Automation_Probability_2030='max',
    )
    .reset_index()
)
display(automation_range_by_risk)

#Box chart on Automation Probability Distribution by Risk Category
#Plotly Express keys `labels` by data-frame column name (not by 'x'/'y'),
#so the readable axis/legend titles must be mapped from the column names.
figure = px.box(
    df,
    x='Risk_Category',
    y='Automation_Probability_2030',
    color='Risk_Category',
    title='Automation Probability Distribution by Risk Category',
    labels={
        'Risk_Category': 'Risk Category',
        'Automation_Probability_2030': 'Automation Probability 2030',
    }
)
figure.show()

Unnamed: 0,Risk_Category,Min_Automation_Probability_2030,Max_Automation_Probability_2030
0,High,0.71,0.95
1,Low,0.05,0.3
2,Medium,0.31,0.7


Graph of minimum and maximum automation probability for each job title

In [11]:

#Min and max automation probability per Job_Title, named directly via
#named aggregation (one row per job title, columns: Job_Title, Min_..., Max_...)
automation_probability_stats = (
    df.groupby('Job_Title')['Automation_Probability_2030']
    .agg(
        Min_Automation_Probability_2030='min',
        Max_Automation_Probability_2030='max',
    )
    .reset_index()
)

#Reshape wide -> long so each (job title, min/max) pair becomes one bar
stat_columns = ['Min_Automation_Probability_2030', 'Max_Automation_Probability_2030']
automation_stats_melted = automation_probability_stats.melt(
    id_vars=['Job_Title'],
    value_vars=stat_columns,
    var_name='Automation_Type',
    value_name='Automation_Probability'
)

#Grouped bar chart: min and max bars side by side for every job title
axis_labels = {'Job_Title': 'Job Title', 'Automation_Probability': 'Automation Probability 2030'}
figure = px.bar(
    automation_stats_melted,
    x='Job_Title',
    y='Automation_Probability',
    color='Automation_Type',
    barmode='group',
    title='Min and Max Automation Probability for Each Job Title',
    labels=axis_labels
)
#Tilt the job-title tick labels so they do not overlap
figure.update_layout(xaxis_tickangle=-45)
figure.show()

Graph on Automation Probability vs. Tech Growth Factor by Risk Category

In [9]:
#Correlation by risk category
#Per-category 2x2 correlation matrices, flattened so that each
#(column, column) pair becomes one column of the unstacked frame.
pairwise_corr_by_risk = (
    df.groupby('Risk_Category')[['Tech_Growth_Factor', 'Automation_Probability_2030']]
    .corr()
    .unstack()
)
#Pick the Tech_Growth_Factor vs Automation_Probability_2030 pair explicitly
correlation_by_risk_category = pairwise_corr_by_risk[('Tech_Growth_Factor', 'Automation_Probability_2030')]
print("Correlation between 'Tech_Growth_Factor' and 'Automation_Probability_2030' per Risk Category:")
print(correlation_by_risk_category)

Correlation between 'Tech_Growth_Factor' and 'Automation_Probability_2030' per Risk Category:
Risk_Category
High      0.018702
Low      -0.008785
Medium    0.017718
Name: (Tech_Growth_Factor, Automation_Probability_2030), dtype: float64


In [14]:
#Display order of the risk categories. Passing it to px.scatter pins the trace
#order; without it traces follow the order of first appearance in df, which
#does not match a hand-written Low/Medium/High list.
risk_category_order = ['Low', 'Medium', 'High']

#Scatter plot on Automation Probability and Tech Growth Factor
fig = px.scatter(
    df,
    x='Tech_Growth_Factor',
    y='Automation_Probability_2030',
    color='Risk_Category',
    category_orders={'Risk_Category': risk_category_order},
    title='Interactive Plot on Automation Probability and Tech Growth Factor',
    hover_data=['Tech_Growth_Factor', 'Automation_Probability_2030', 'Risk_Category']
)

#Range Slider to the X-Axis
#NOTE: the former rangeselector buttons (step="month") were removed: range
#selector buttons step through date axes, while this x-axis is numeric.
fig.update_xaxes(rangeslider_visible=True)

#Dropdown for Risk_Category
#Entries are derived from the traces actually present in the figure, so every
#dropdown label always toggles the trace it names.
trace_names = [trace.name for trace in fig.data]
unique_categories = [f'{name} Risk' for name in trace_names]
num_traces = len(trace_names)

dropdown_risks = [dict(
    method='restyle',
    args=[{'visible': [True] * num_traces}],
    label='All Categories'
)]

for i, category in enumerate(unique_categories):
    #Show only the i-th trace, hide every other one
    visibility = [trace_index == i for trace_index in range(num_traces)]

    dropdown_risks.append(dict(
        method='restyle',
        args=[{'visible': visibility}],
        label=category
    ))


#Dropdown and Custom Button
fig.update_layout(
    xaxis_title='Tech Growth Factor',
    yaxis_title='Automation Probability 2030',
    #Extra top margin leaves room for the menus placed above the plot area
    margin=dict(l=20, r=20, t=100, b=20),

    #Updated menu with Dropdown and Buttons
    updatemenus=[

        dict(
            type='dropdown',
            direction='down',
            x=0.15,
            y=1.15,
            showactive=True,
            buttons=dropdown_risks
        ),
        #Button to reset axes
        dict(
            type='buttons',
            direction='right',
            x=0.4,
            y=1.15,
            showactive=True,
            buttons=[
                dict(
                    args=[{'xaxis.autorange': True, 'yaxis.autorange': True}],
                    label='Reset Axes',
                    method='relayout'
                )
            ]
        )
    ]
)

fig.show()