In [1]:

# Employee Salary Prediction Project - Jupyter Notebook

# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib



In [2]:
# Step 2: Load and Explore Data
# The workbook is resolved relative to the kernel's working directory; if it is
# missing, read_excel raises FileNotFoundError.
df = pd.read_excel("employee_data.xlsx")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())

FileNotFoundError: [Errno 2] No such file or directory: 'employee_data.xlsx'

In [None]:
# Step 3: Handle Missing Values
# Discard, in place, every row that has at least one missing entry.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Step 4: Explore Outliers
# One box per numeric column, drawn on shared axes for visual inspection.
# Nothing is removed from df in this cell.
outlier_columns = ['Years', 'Job Rate', 'Sick Leaves', 'Unpaid Leaves', 'Annual Salary']
sns.boxplot(data=df[outlier_columns])
plt.xticks(rotation=45)
plt.title("Outlier Detection")
plt.show()

In [None]:
# Step 5: Separate Target and Features
# y is the column to predict; X keeps every remaining column of df.
y = df["Annual Salary"]
X = df.drop(columns=["Annual Salary"])

In [None]:
# Step 6: Label Encoding for Department
# Learn the Department classes first, then replace the column with their
# integer codes. The fitted encoder is kept in `le` for later reuse.
le = LabelEncoder()
le.fit(X['Department'])
X['Department'] = le.transform(X['Department'])


In [None]:
# Step 7: Split Data
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Train Model (SVM)
# An RBF-kernel SVR works on distances between samples, so features on larger
# numeric scales would dominate the kernel. Standardising inside the pipeline
# fixes that, and the scaler is fitted on the training rows only and is saved
# together with the model.
# NOTE(review): SVR's default C and epsilon are expressed in the units of y;
# they may need tuning for salary-sized targets — confirm against the data.
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVR(kernel='rbf'))
])
svm_pipeline.fit(X_train, y_train)

In [None]:
# Step 9: Evaluate Model
# Score the held-out rows: coefficient of determination and mean squared error.
y_pred = svm_pipeline.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print("R2 Score:", test_r2)
print("MSE:", test_mse)

In [None]:
# Step 10: Save Model and Label Encoder
# Persist both fitted objects so that inference code can reload them together.
for artifact, target_file in (
    (svm_pipeline, "svm_salary_model.joblib"),
    (le, "label_encoder.joblib"),
):
    joblib.dump(artifact, target_file)

# Visualizations
# Pairwise scatter plots of df's columns, coloured by department.
sns.pairplot(df, hue="Department")
plt.show()

# Correlation is only defined for numeric data. df still holds the original
# (un-encoded) Department column, and on pandas >= 2.0 DataFrame.corr() raises
# when it meets a non-numeric column, so restrict the frame to numeric columns.
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()