## Feature engineering

In [4]:
data = pd.read_csv("AI_Impact_on_Jobs_2030.csv")
data.head()


<IPython.core.display.Javascript object>

Unnamed: 0,Job_Title,Average_Salary,Years_Experience,Education_Level,AI_Exposure_Index,Tech_Growth_Factor,Automation_Probability_2030,Risk_Category,Skill_1,Skill_2,Skill_3,Skill_4,Skill_5,Skill_6,Skill_7,Skill_8,Skill_9,Skill_10
0,Security Guard,45795,28,Master's,0.18,1.28,0.85,High,0.45,0.1,0.46,0.33,0.14,0.65,0.06,0.72,0.94,0.0
1,Research Scientist,133355,20,PhD,0.62,1.11,0.05,Low,0.02,0.52,0.4,0.05,0.97,0.23,0.09,0.62,0.38,0.98
2,Construction Worker,146216,2,High School,0.86,1.18,0.81,High,0.01,0.94,0.56,0.39,0.02,0.23,0.24,0.68,0.61,0.83
3,Software Engineer,136530,13,PhD,0.39,0.68,0.6,Medium,0.43,0.21,0.57,0.03,0.84,0.45,0.4,0.93,0.73,0.33
4,Financial Analyst,70397,22,High School,0.52,1.46,0.64,Medium,0.75,0.54,0.59,0.97,0.61,0.28,0.3,0.17,0.02,0.42


In [5]:
## step 1: correcting incorrect data types 

data['Years_Experience'] = data['Years_Experience'].astype(int)
data['Education_Level'] = data['Education_Level'].astype('category')


In [6]:
## step 2: remove outliers
## As we see in EDA, there are no significant highs and lows, so no need to remove outlier values. Data is normalised

In [7]:
## step 3: ENCODING ORDINAL FEATURES (where rank matters )

import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder

le = LabelEncoder()
data['Education_Level'] = le.fit_transform(data['Education_Level'])

<IPython.core.display.Javascript object>

### Encode:
1. High School = 0 
2. Bachelor's = 1
3. Master's = 2
4. PHD = 3

In [9]:
## step 4: ONE HOT-ENCODE FOR NON_ORDINAL FEATURES(where ranks doesn't matter)

data = pd.get_dummies(data, columns=["Job_Title"], drop_first = True)

<IPython.core.display.Javascript object>

In [10]:
## step 5: Normalisation or scaling of numeric features
numeric_features = ["Average_Salary","AI_Exposure_Index","Tech_Growth_Factor","Years_Experience"]

scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

In [11]:
## step 6: creation of new features 

In [12]:
## Calculation of Total_skill_score

skills = [f"Skill_{i}" for i in range(1,11)]
data["total_skill_score"] = data[skills].sum(axis=1)

In [13]:
data

Unnamed: 0,Average_Salary,Years_Experience,Education_Level,AI_Exposure_Index,Tech_Growth_Factor,Automation_Probability_2030,Risk_Category,Skill_1,Skill_2,Skill_3,...,Job_Title_Mechanic,Job_Title_Nurse,Job_Title_Research Scientist,Job_Title_Retail Worker,Job_Title_Security Guard,Job_Title_Software Engineer,Job_Title_Teacher,Job_Title_Truck Driver,Job_Title_UX Researcher,total_skill_score
0,-1.259374,1.524586,2,-1.131450,0.989695,0.85,High,0.45,0.10,0.46,...,False,False,False,False,True,False,False,False,False,3.85
1,1.271092,0.609079,3,0.418080,0.398639,0.05,Low,0.02,0.52,0.40,...,False,False,True,False,False,False,False,False,False,4.26
2,1.642772,-1.450811,1,1.263278,0.642015,0.81,High,0.01,0.94,0.56,...,False,False,False,False,False,False,False,False,False,4.51
3,1.362849,-0.191989,3,-0.391902,-1.096386,0.60,Medium,0.43,0.21,0.57,...,False,False,False,False,False,True,False,False,False,4.92
4,-0.548382,0.837956,1,0.065914,1.615519,0.64,Medium,0.75,0.54,0.59,...,False,False,False,False,False,False,False,False,False,4.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.634256,-0.993058,0,-0.920151,0.642015,0.20,Low,0.73,0.37,0.99,...,False,False,False,False,False,False,False,False,False,5.35
2996,-1.300759,1.639024,3,0.523729,-0.887778,0.35,Medium,0.23,0.48,0.05,...,False,False,False,False,False,False,False,False,True,4.79
2997,-0.810561,0.952394,2,0.488513,-0.192418,0.39,Medium,0.28,0.62,0.73,...,False,False,False,False,False,False,False,False,False,4.85
2998,0.604691,-0.878619,3,1.580227,0.815855,0.46,Medium,0.21,0.18,0.14,...,False,False,False,False,False,False,False,False,False,3.88


In [14]:
## AI Risk Score 
## (it will depend on AI exposure on job sectors and automation probability score, high ai exposure high risk of job unstability)

data["Ai_risk_score"] = (data['AI_Exposure_Index'] * 0.6 + 
    data['Automation_Probability_2030'] *0.4)
    

### Explanation:
1. Why do we take more weightage of ai_exposure_index as compared to a ? As ai_exposure_index measures how exposed a job is to AI , so that means higher weight - more important 
2. On the other hand, automation_probability_2030 estimating risk of automation in the near future so slight lower weight taken for the score.


In [16]:
## Job Stability Score
## High Tech Growth = More Stable Job

data["job_Stablity"] = data['Tech_Growth_Factor'] - data['AI_Exposure_Index']

### Explanation: 
1. Tech_Growth_Factor → represents how much the job benefits from technological progress.
2. AI_Exposure_Index → represents how vulnerable the job is to AI automation.
3. Job Stability = Benefits from tech − Risk from AI: Positive score → Stable job, Around 0 → Neutral stability, Negative score → Unstable job

In [18]:
## Experience-Salary-Relation
## More experience, Direct impact on salary and job growth.

data["exp_salary"] = data['Years_Experience']*data['Average_Salary']

### Explanation:
1. High Value means High experience and high salary, usually senior professionals.
2. Low values mean: Low experience or low salary

In [20]:
## Skill_Efficiency 
## Relation between experience to get effcienct in skills

data["skill_eff"] = data['total_skill_score']/(data['Years_Experience']+1)

### Explanation:
1. Total_Skill_Score: Represents how skilled a person is (sum of all skill ratings)
2. Years_Experience: Represents how long they’ve been in the field
3. +1: Prevents division by zero for people with 0 years of experience, and smooths the ratio.
4. Skill Efficiency, This metric answers: “How much skill per year of experience does someone have?”

In [22]:
## encoding numerical features for Model training
risk_map = {"Low": 0, "Medium": 1, "High": 2}
data["Risk_Category"] = data["Risk_Category"].map(risk_map)




In [23]:
## encoding numerical features for Model training
bool_cols = data.select_dtypes(include='bool').columns
data[bool_cols] = data[bool_cols].astype(int)

In [24]:
import os

# Create ML folder if it does not exist
os.makedirs("ML", exist_ok=True)

# Save the feature-engineered dataset
data.to_csv("feature_engineered_data.csv", index=False)

print("Feature engineered data saved to ML/feature_engineered_data.csv")


Feature engineered data saved to ML/feature_engineered_data.csv
