In [1]:
import pandas
import numpy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,OneHotEncoder,RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib

In [2]:
# Read the raw salary records from the CSV next to the notebook and preview
# the first five rows.
data = pandas.read_csv(filepath_or_buffer="salary.csv")
data.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [3]:
# Column dtypes are a property of the frame, not of a row slice, so they are
# read directly from `data`.
print(data.dtypes)


Age                    float64
Gender                  object
Education Level         object
Job Title               object
Years of Experience    float64
Salary                 float64
dtype: object


In [4]:
# Feature preprocessing for the salary data.
#   * numeric columns     -> median imputation, then standardization
#   * categorical columns -> most-frequent imputation, then one-hot encoding
numerical = ["Age","Years of Experience"]
categorical = ["Education Level" , "Job Title" , "Gender"]

# Step names (including the "numberical" spelling and the "scaler" label on
# the encoder) are kept unchanged: scikit-learn uses them as parameter keys
# and as `named_steps` entries.
num_pipeline = Pipeline(steps = [
    ("numberical",SimpleImputer(strategy="median")),
    ("scaler" , StandardScaler())
])
Cat_pipeline = Pipeline(steps = [
    ("Categorical" , SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": a category not seen during fit is encoded as
    # all zeros at transform time instead of raising an error.
    ("scaler" , OneHotEncoder(handle_unknown="ignore"))
])

# remainder="drop" emits only the columns declared above. At this point
# `data` still contains the "Salary" target; with remainder="passthrough"
# the fit_transform below would copy that target into the feature matrix.
processing = ColumnTransformer(transformers=[
    ("numerical" , num_pipeline , numerical),
    ("categorical" , Cat_pipeline , categorical)
],remainder="drop")

# Transformed feature matrix for the full frame (features only, no target).
final_processing = processing.fit_transform(data)

In [5]:
# Target vector. The explicit copy detaches `y` from `data`, so later edits
# to `y` cannot write through to the frame on pandas versions that do not
# use Copy-on-Write.
y = data["Salary"].copy()

In [6]:
# Replace missing salaries with the mean of the observed salaries.
# Reassigning instead of using inplace=True keeps the change confined to `y`
# and avoids in-place mutation of a Series selected from a DataFrame.
# NOTE(review): mean-imputing the target creates artificial labels, and those
# rows can also land in the evaluation split; dropping rows with a missing
# Salary may be the better choice — confirm before relying on the metrics.
y = y.fillna(y.mean())

In [7]:
# Remove the target column so `data` holds features only.
# errors="ignore" lets this cell be re-run after the column is already gone
# (the in-place version raised KeyError on a second run); reassignment avoids
# mutating the frame in place.
data = data.drop(columns=["Salary"], errors="ignore")

In [8]:
# Chain preprocessing and linear regression into a single estimator so the
# transformers are fitted together with the model on the training rows.
model_pipeline = Pipeline(
    steps=[
        ("transform", processing),
        ("model", LinearRegression()),
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the pipeline itself, so `model` and `model_pipeline` refer to
# the same fitted object.
model = model_pipeline.fit(X_train, y_train)

In [9]:
# Show the predicted salaries for the held-out rows.
model.predict(X=X_test)

array([141782.97700548,  98816.61455587, 121327.78530281,  99999.86585394,
       132725.28078689, 180309.26328634, 144380.618892  , 119027.53759512,
        45686.57274596,  97114.89595455, 134351.13469679, 141212.58042091,
        46894.33356224,  86598.67114915,  56704.38749067, 153501.33786473,
        75421.73209572,  61888.94340919,  96178.58390847,  91787.35353617,
       118182.53340096, 110986.49123899, 135988.56290554,  52831.57024068,
       101640.79317151,  58692.58935972, 194203.04462551, 127922.59951498,
       151853.16562486, 155650.8001641 ,  62339.88560138, 114365.49949556,
        67467.81404848, 144265.35806131,  94551.61553047,  30283.22805188,
       169067.12508825,  65650.35897215,  57012.34318299,  52841.3659418 ,
        46410.69371654,  90404.48665466, 150641.90467351, 174286.43806805,
        61331.35107757, 141235.36699015, 107465.75199828,  62797.67094735,
        48182.46174717,  60522.43052505, 100517.90864918,  98255.7799512 ,
        86069.00098425,  

In [10]:
# Score the fitted pipeline on the held-out split.
y_pred = model_pipeline.predict(X_test)

# Error metrics are in salary units; R2 is unitless.
mse = mean_squared_error(y_test, y_pred)
rmse = numpy.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"R2 Score : {r2:.4f}")
print(f"RMSE     : {rmse:.4f}")
print(f"MAE      : {mae:.4f}")


R2 Score : 0.8622
RMSE     : 18267.9471
MAE      : 11830.8964


In [11]:
# 5-fold cross-validation of the whole pipeline on all rows; each fold refits
# the preprocessing together with the regressor.
scores = cross_val_score(
    estimator=model_pipeline,
    X=data,
    y=y,
    cv=5,
)
print("Mean CV Score:", scores.mean())


Mean CV Score: 0.861616668901806


In [12]:
# Persist the fitted pipeline (preprocessing + model) to disk with joblib so
# it can be reloaded later without refitting.
joblib.dump(
    value=model_pipeline,
    filename="salary_model_pipeline.joblib",
)
print("Model saved successfully!")


Model saved successfully!
