In [29]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [30]:
df = pd.read_csv('ai_impact_jobs_2010_2025(2).csv')

In [31]:
df

Unnamed: 0,posting_year,city,company_size,industry,job_title,seniority_level,ai_intensity_score,salary_usd,automation_risk_score,ai_job_displacement_risk,industry_ai_adoption_stage
0,2018,London,Small,Education,Policy Analyst,Lead,0.81,61586,0.11,Low,Growing
1,2015,Singapore,Medium,Energy,Data Scientist,Executive,0.04,62045,0.71,High,Emerging
2,2016,Sydney,Startup,Finance,Product Manager,Junior,0.15,27035,0.86,High,Emerging
3,2015,Nairobi,Large,Government,Data Scientist,Mid,0.19,72894,0.70,Low,Emerging
4,2014,Sydney,Small,Manufacturing,ML Engineer,Lead,0.11,57215,0.87,High,Emerging
...,...,...,...,...,...,...,...,...,...,...,...
4995,2022,Bangalore,Enterprise,Education,ML Engineer,Intern,0.12,41317,0.88,Low,Growing
4996,2018,Sydney,Large,Healthcare,Policy Analyst,Senior,0.07,37089,0.79,Low,Growing
4997,2019,Bangalore,Large,Education,ML Engineer,Mid,0.05,81651,0.64,Low,Growing
4998,2016,Sydney,Enterprise,Energy,AI Researcher,Junior,0.87,107897,0.33,Low,Emerging


# Target Variables - 
>1. automation_risk_score
>2. ai_job_displacement_risk
>3. industry_ai_adoption_stage

# 1. industry_ai_adoption_stage

In [32]:
leakage_cols = [
    "industry_ai_adoption_stage",
    "industry_ai_adoption_score",
    "automation_risk_score",
    "ai_job_displacement_risk",
    "ai_exposure_score"
]

X_base = df.drop(columns=leakage_cols, errors="ignore")
y_adoption = df["industry_ai_adoption_stage"]

cat_features = X_base.select_dtypes(include=["object"]).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X_base,
    y_adoption,
    test_size=0.2,
    random_state=42,
    stratify=y_adoption
)

model_adoption = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    random_seed=42,
    verbose=False
)

model_adoption.fit(X_train, y_train, cat_features=cat_features)

y_pred = model_adoption.predict(X_test).ravel()

print("\n=== MODEL 1: INDUSTRY AI ADOPTION STAGE ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



=== MODEL 1: INDUSTRY AI ADOPTION STAGE ===
Accuracy: 1.0
              precision    recall  f1-score   support

    Emerging       1.00      1.00      1.00       439
     Growing       1.00      1.00      1.00       501
      Mature       1.00      1.00      1.00        60

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

Confusion Matrix:
 [[439   0   0]
 [  0 501   0]
 [  0   0  60]]


# 2. automation_risk_score

In [33]:
X_auto = X_base.copy()
X_auto["industry_ai_adoption_stage"] = model_adoption.predict(X_base).ravel()

y_auto = df["automation_risk_score"]

X_train, X_test, y_train, y_test = train_test_split(
    X_auto,
    y_auto,
    test_size=0.2,
    random_state=42
)

model_automation = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function="RMSE",
    random_seed=42,
    verbose=False
)

model_automation.fit(
    X_train,
    y_train,
    cat_features=cat_features + ["industry_ai_adoption_stage"]
)

y_pred = model_automation.predict(X_test)

print("\n=== MODEL 2: AUTOMATION RISK SCORE ===")
print("R2:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))



=== MODEL 2: AUTOMATION RISK SCORE ===
R2: 0.8848952586627008
MAE: 0.07518706330322689
RMSE: 0.08761116008969894


# 3. ai_job_displacement_risk

In [34]:
X_disp = X_auto.copy()
X_disp["automation_risk_score"] = model_automation.predict(X_auto)

y_disp = df["ai_job_displacement_risk"]

X_train, X_test, y_train, y_test = train_test_split(
    X_disp,
    y_disp,
    test_size=0.2,
    random_state=42,
    stratify=y_disp
)

model_displacement = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    random_seed=42,
    verbose=False
)

model_displacement.fit(
    X_train,
    y_train,
    cat_features=cat_features + ["industry_ai_adoption_stage"]
)

y_pred = model_displacement.predict(X_test).ravel()

print("\n=== MODEL 3: AI JOB DISPLACEMENT RISK ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



=== MODEL 3: AI JOB DISPLACEMENT RISK ===
Accuracy: 0.327
              precision    recall  f1-score   support

        High       0.32      0.27      0.29       325
         Low       0.34      0.35      0.34       344
      Medium       0.33      0.36      0.34       331

    accuracy                           0.33      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.33      0.33      0.33      1000

Confusion Matrix:
 [[ 88 124 113]
 [ 90 119 135]
 [ 99 112 120]]


# Prediction Pipeline

In [35]:
def predict_all_outputs(user_input):
    input_df = pd.DataFrame(
        [{col: user_input.get(col, None) for col in X_base.columns}]
    )

    adoption = model_adoption.predict(input_df).ravel()
    input_df["industry_ai_adoption_stage"] = adoption

    automation = model_automation.predict(input_df)
    input_df["automation_risk_score"] = automation

    displacement = model_displacement.predict(input_df).ravel()

    return {
        "industry_ai_adoption_stage": adoption[0],
        "automation_risk_score": float(automation[0]),
        "ai_job_displacement_risk": displacement[0]
    }


# Inputs

In [36]:
user_input = {
    "posting_year": 2025,
    "city": "Bangalore",
    "company_size": "Large",
    "industry": "Tech",
    "job_title": "ML Engineer",
    "seniority_level": "Mid",
    "ai_intensity_score": 0.50,
    "salary_usd": 30000
}


# Outputs

In [37]:
outputs = predict_all_outputs(user_input)
print(outputs)


{'industry_ai_adoption_stage': 'Mature', 'automation_risk_score': 0.23331677835692277, 'ai_job_displacement_risk': 'Medium'}
