In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [3]:
# Load the raw employee-skills dataset; the relative path is resolved
# against the notebook's working directory.
df = pd.read_csv('EmployeeSkills_NonNormalized_Extended_Tanjeem.csv')

# Preview the first five rows to sanity-check headers and value formats.
df.head(n=5)

Unnamed: 0,experience_ears,education_level,num_certifications,tech_score,communication_score,leadership_score,teamwork_score,prblm_solvng,projects_led,client_feedback,annual_salary
0,19,2.0,3.0,59.21232364,61.272205,20.884042,69.107361,26.269763,12,3.816695,28444.02447
1,32,2.0,9.0,81.95179187,22.557994,19.679013,52.109618,4.471027,6,0.1107,36040.61839
2,35,2.0,7.0,82.53015623,90.465606,64.257992,97.693142,26.800898,8,2.632624,35508.44398
3,6,,9.0,31.11206808,85.314532,2.413278,49.123878,66.172067,14,4.418793,10243.62441
4,27,,4.0,65.87861177,30.659908,73.067201,74.999043,71.729068,0,4.111955,34315.16008


In [4]:
# Data cleaning: normalise headers, convert text columns to numbers,
# drop incomplete rows, then remove IQR outliers.
print("Original data count:", df.shape)

# Normalise column headers to snake_case: trim, lower-case, turn spaces into
# underscores, then drop every character outside [0-9a-z_].
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace(' ', '_', regex=False)
              .str.replace(r'[^0-9a-z_]', '', regex=True))

# Repair the two misspelled headers present in the raw file.
df = df.rename(columns={
    'experience_ears': 'experience_years',
    'prblm_solvng': 'problem_solving'
})

# Spelled-out numerals that may appear in place of digits.
text_to_num = {
    'zero':0,'one':1,'two':2,'three':3,'four':4,'five':5,
    'six':6,'seven':7,'eight':8,'nine':9,'ten':10
}


def _word_to_number(value):
    """Translate a spelled-out numeral into its integer value.

    The lookup ignores case and surrounding whitespace, so 'Two' and
    ' three ' are recognised. Anything that is not a known numeral word
    (including non-string values such as NaN) is returned unchanged.
    """
    if isinstance(value, str):
        return text_to_num.get(value.strip().lower(), value)
    return value


# Convert every text column to numeric. Values that are neither numeral words
# nor parseable numbers become NaN and are removed by the dropna below.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = pd.to_numeric(df[col].map(_word_to_number), errors='coerce')

# Discard every row that still has at least one missing value.
df = df.dropna()

print("Count after cleaning data:", df.shape)

# IQR outlier filter: keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (inclusive)
# for each numeric column, the target 'annual_salary' included.
# NOTE: columns are filtered one after another, so the quartiles of later
# columns are computed on rows that survived earlier ones (the result depends
# on column order). Re-running this cell on an already-filtered frame trims it
# further; reload `df` first to reproduce the same counts.
for col in df.select_dtypes(include=[np.number]).columns:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    within_fences = df[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    df = df[within_fences]

print("Count after removal of outlier:", df.shape)

Original data count: (415, 11)
Count after cleaning data: (404, 11)
Count after removal of outlier: (390, 11)


In [5]:
# Features are every column except the salary target.
x = df.drop(columns='annual_salary')
y = df.loc[:, 'annual_salary']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Fit ordinary least squares on the training portion (fit() returns the estimator).
model = LinearRegression().fit(x_train, y_train)
print(f"Regression model built using {model} on cleaned dataset with 80/20 split in training and testing.")

Regression model built using LinearRegression() on cleaned dataset with 80/20 split in training and testing.


In [6]:
# Score the fitted model on the held-out rows only.
y_pred = model.predict(x_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report R² both as a raw coefficient and as a percentage of variance.
print(
    f"Test R² score: {r2:.4f}",
    f"– This means the model explains {r2*100:.2f}% of the variance in the annual salaries",
    sep="\n",
)

Test R² score: 0.8297
– This means the model explains 82.97% of the variance in the annual salaries
