# HR Salary Predictor
This notebook trains a simple linear regression model to predict salaries based on candidate **experience**, **written test score**, and **personal interview score**.

It performs minimal preprocessing (parsing textual experience values and imputing missing values) and then fits a `LinearRegression` model.


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Location of the hiring dataset. This is the only place the path is defined;
# change it (e.g. to 'hiring.csv') when running outside the Kaggle environment.
csv_path = '/kaggle/input/hr-salary-predictor/hiring.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),salary($)
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


# Parse textual/varied experience into numeric years

In [3]:
import pandas as pd
import numpy as np

# Spelled-out numbers that parse_experience understands (keys are lower-case).
word_to_num = {
    'zero':0,'one':1,'two':2,'three':3,'four':4,'five':5,
    'six':6,'seven':7,'eight':8,'nine':9,'ten':10,'eleven':11,'twelve':12
}

# Substrings stripped from an experience string before it is interpreted.
# Order matters: a longer token must come before any token it contains
# ('yrs' before 'yr', 'years' before 'year', 'experience' before 'exp'),
# otherwise a stray suffix such as 's' would be left behind.
_EXPERIENCE_NOISE = ('yrs', 'yr', 'years', 'year', 'experience', 'exp', ' ')


def _finite_or_nan(value):
    """Return ``value`` unchanged when it is finite, otherwise NaN."""
    return value if np.isfinite(value) else np.nan


def parse_experience(x):
    """Convert a free-form experience entry into a float number of years.

    Accepted inputs:
      * missing values (``None``/NaN)            -> NaN
      * ints and floats                          -> the same value as float
      * numeric strings, optionally followed by
        unit words ("2 yrs", "3.5 years exp")    -> the number
      * number words listed in ``word_to_num``
        ("five", "Seven years")                  -> the mapped number

    Anything that cannot be interpreted, and any non-finite result
    (e.g. the strings "inf" / "infinity", which ``float()`` accepts),
    is returned as NaN.

    Returns:
        float: years of experience, or NaN when unknown.
    """
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return _finite_or_nan(float(x))

    s = str(x).strip().lower()

    # First attempt: the text is already a plain number.
    try:
        return _finite_or_nan(float(s))
    except ValueError:
        pass

    # Second attempt: drop unit/filler words and all spaces, then retry.
    for token in _EXPERIENCE_NOISE:
        s = s.replace(token, '')
    s = s.strip()
    try:
        return _finite_or_nan(float(s))
    except ValueError:
        pass

    # Last resort: look the remainder up as a spelled-out number.
    return float(word_to_num.get(s, np.nan))


# Auto-detect common column names, then build and fit the model

In [4]:
# Map each logical field to the DataFrame column whose name contains the
# field's keyword. Matching is case-insensitive substring matching.
# The table order is the match priority for a single column name (a name
# containing both 'exp' and 'salary' is treated as experience); if several
# columns match the same field, the last one in df.columns wins.
column_keywords = (
    ('experience', 'exp'),          # 'exp' also matches 'experience'
    ('test_score', 'test'),
    ('interview_score', 'interview'),
    ('salary', 'salary'),
)

colmap = {}
for c in df.columns:
    # str() keeps this working when a column label is not a string.
    cl = str(c).lower().strip()
    for field, keyword in column_keywords:
        if keyword in cl:
            colmap[field] = c
            break

# Fail early, with the partial mapping shown, if any needed column is absent.
required = ['experience','test_score','interview_score','salary']
missing = [r for r in required if r not in colmap]
if missing:
    raise ValueError(f'Missing required columns: {missing}. Found mapping: {colmap}')

# Split the raw frame into predictors (X_raw) and target (y). Both are copies,
# so the assignment below does not write into `df`.
experience_col = colmap['experience']
X_raw = df[[experience_col, colmap['test_score'], colmap['interview_score']]].copy()
y = df[colmap['salary']].copy()

# Turn free-form experience entries into float years; entries that
# parse_experience cannot interpret become NaN.
X_raw[experience_col] = X_raw[experience_col].apply(parse_experience)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# All three predictors are numeric and share one preprocessing step:
# fill missing values with the column mean.
numeric_features = [
    colmap[field] for field in ('experience', 'test_score', 'interview_score')
]
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ]
)
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ]
)

# Full pipeline: impute, then ordinary least-squares regression.
model = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('regressor', LinearRegression()),
    ]
)
model.fit(X_raw, y)
model

# Predict for new candidates

In [5]:
# Two example candidates, using the same column names the model was fitted on.
candidate_exp_col = colmap['experience']
candidates = pd.DataFrame(
    {
        candidate_exp_col: [2, 12],
        colmap['test_score']: [9, 10],
        colmap['interview_score']: [6, 10],
    }
)

# Run experience through the same parser used for the training data.
candidates[candidate_exp_col] = candidates[candidate_exp_col].apply(parse_experience)

preds = model.predict(candidates)
candidates.assign(predicted_salary=preds)

Unnamed: 0,experience,test_score(out of 10),interview_score(out of 10),predicted_salary
0,2.0,9,6,47738.891043
1,12.0,10,10,86424.667959


## Notes
- The model is a simple Ordinary Least Squares linear regression.
- We impute missing numeric values (including experience entries that could not be parsed) using the mean of each column.