In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the HR employee attrition dataset; the CSV is expected in the working directory
data = pd.read_csv('HR-Employee-Attrition.csv')

# Separate features (X) and target variable (y): every column except
# 'DailyRate' is used as a predictor of 'DailyRate'
X = data.drop('DailyRate', axis=1)
y = data['DailyRate']

# Integer-encode every non-numeric column so the regressors can consume it.
# Testing for "not numeric" instead of dtype == 'object' also catches columns
# stored with pandas' dedicated string dtype or as categoricals, which would
# otherwise be passed to model.fit() as raw text.
# NOTE: the single encoder is refit for each column, so after the loop it only
# holds the mapping of the last column it encoded. The integer codes follow
# the sorted order of the labels, which is an arbitrary ordering for nominal
# categories.
label_encoder = LabelEncoder()
X_encoded = X.copy()
for col in X_encoded.columns:
    if not pd.api.types.is_numeric_dtype(X_encoded[col]):
        X_encoded[col] = label_encoder.fit_transform(X_encoded[col])

# Split the dataset into training and testing sets (80% / 20%); the fixed
# seed keeps the split identical between runs
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize models (excluding SVR).
# The tree-based estimators are randomized, so they are seeded with the same
# value as the train/test split; otherwise the reported metrics (and possibly
# the model chosen as "best") change from one run to the next.
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
}

# Fit each model on the training split, score it on the held-out split, and
# store its metrics together with its test-set DailyRate predictions, keyed
# by model name
prediction_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # Take the square root of the MSE explicitly: the `squared=False` shortcut
    # was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
    # raises a TypeError.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    prediction_results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2, "Predictions": y_pred}

    print(f"Metrics for {name}:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R2): {r2}")
    print("=" * 50)

# Rank the evaluated models by their test-set R-squared and keep the name of
# the top scorer (ties resolve to the model that was evaluated first)
r2_by_model = {name: result['R2'] for name, result in prediction_results.items()}
best_model = max(r2_by_model, key=r2_by_model.get)
print(f"Best performing model based on R-squared score: {best_model}")

# Show the DailyRate predictions the best model produced for the test rows
best_predictions = prediction_results[best_model]["Predictions"]
print(f"\nPredictions using the best model ({best_model}):")
print(best_predictions)


Metrics for Decision Tree:
Mean Absolute Error (MAE): 462.8537414965986
Root Mean Squared Error (RMSE): 568.2768589652179
R-squared (R2): -1.077351773166725
Metrics for Random Forest:
Mean Absolute Error (MAE): 347.1185714285715
Root Mean Squared Error (RMSE): 405.33035444942345
R-squared (R2): -0.05683785112861606
Metrics for Linear Regression:
Mean Absolute Error (MAE): 343.4192649583252
Root Mean Squared Error (RMSE): 396.993085642734
R-squared (R2): -0.013808639723214
Best performing model based on R-squared score: Linear Regression

Predictions using the best model (Linear Regression):
[779.56457208 833.4546282  677.12761036 860.50677385 788.20230684
 891.00695243 703.40334583 885.49548697 738.13314043 882.38547954
 754.31090083 574.61850169 831.23663788 804.78947537 655.24394162
 744.2110661  788.34993973 600.1464748  744.33884006 842.66735409
 720.72158363 896.4621577  815.37769343 828.37751279 792.12290391
 687.12546452 755.35423297 835.88890795 683.4297613  724.93349959
 769.1