In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Read the dataset
data_subset = pd.read_csv('/content/IBM-HR-Employee-Attrition.csv')

# Map the 'Attrition' column to numeric labels (Yes -> 1, No -> 0).
# Series.map() turns every value that is missing from the mapping into NaN,
# which would flow unnoticed into the scaler and the network, so fail fast
# with a clear message instead of training on NaNs.
attrition_labels = {'Yes': 1, 'No': 0}
attrition_numeric = data_subset['Attrition'].map(attrition_labels)
if attrition_numeric.isna().any():
    unexpected_labels = data_subset.loc[attrition_numeric.isna(), 'Attrition'].unique().tolist()
    raise ValueError(
        f"Unexpected 'Attrition' values (expected 'Yes' or 'No'): {unexpected_labels}"
    )
data_subset['Attrition'] = attrition_numeric

# Selecting features for time frame prediction.
# The column list is named once so it can be inspected or reused later.
feature_columns = [
    'Age', 'DistanceFromHome', 'Education',
    'JobSatisfaction', 'MonthlyIncome', 'PercentSalaryHike',
    'YearsSinceLastPromotion', 'PerformanceRating', 'Attrition',
    'EnvironmentSatisfaction', 'NumCompaniesWorked', 'TotalWorkingYears',
]
X_timeframe = data_subset[feature_columns]

# Target variable for time frame prediction
y_timeframe = data_subset['YearsAtCompany']

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
(
    X_train_timeframe,
    X_test_timeframe,
    y_train_timeframe,
    y_test_timeframe,
) = train_test_split(
    X_timeframe,
    y_timeframe,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only,
# then the same fitted transform is applied to both splits
scaler_timeframe = StandardScaler().fit(X_train_timeframe)
X_train_timeframe_scaled = scaler_timeframe.transform(X_train_timeframe)
X_test_timeframe_scaled = scaler_timeframe.transform(X_test_timeframe)

# Define the neural network model for time frame prediction
def create_model(input_dim=None, dropout_rate=0.2):
    """Build the (uncompiled) feed-forward network for time frame prediction.

    Architecture: Dense(128) -> Dense(64) -> Dense(32), each ReLU-activated and
    followed by a Dropout layer, then a single linear output unit.

    Args:
        input_dim: Number of input features. When omitted, it is taken from
            the notebook-level ``X_train_timeframe_scaled`` so that existing
            ``create_model()`` calls behave exactly as before.
        dropout_rate: Dropout fraction applied after each hidden layer.
            Defaults to 0.2, the value that was previously hard-coded.

    Returns:
        A ``Sequential`` model that has not been compiled; the caller is
        responsible for calling ``compile`` before training.
    """
    if input_dim is None:
        # Fall back to the notebook global to stay backward-compatible.
        input_dim = X_train_timeframe_scaled.shape[1]

    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        Dense(32, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='linear')
    ])
    return model

# Train multiple models and average their predictions
num_models = 5
models = []
histories = []  # one History object per ensemble member, so no training curve is lost
for i in range(num_models):
    model = create_model()
    # 'accuracy' is a classification metric and is meaningless for a continuous
    # regression target, so only regression metrics are tracked.
    # Order matters: evaluate() returns [loss, mae, mse], and later code reads
    # the MSE positionally at index 2.
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])
    # The last 20% of the training rows are held out by Keras as validation data.
    history = model.fit(X_train_timeframe_scaled, y_train_timeframe, epochs=150, batch_size=64, verbose=0, validation_split=0.2)
    histories.append(history)
    models.append(model)


In [None]:
# Evaluate ensemble: average the per-model predictions and score them on the test set
if not models:
    raise ValueError('No trained models to evaluate; run the training cell first.')

predictions = np.zeros(len(y_test_timeframe), dtype=np.float64)  # float64 accumulator
for model in models:
    model_predictions = model.predict(X_test_timeframe_scaled)
    # ravel() flattens the (n_samples, 1) network output to (n_samples,);
    # unlike squeeze() it also keeps a 1-D shape when there is a single test row
    predictions += np.ravel(model_predictions)
# Divide by the number of models actually averaged rather than by num_models,
# so the result stays correct even if the two ever get out of sync
predictions /= len(models)

ensemble_loss = mean_squared_error(y_test_timeframe, predictions)
ensemble_mae = mean_absolute_error(y_test_timeframe, predictions)
ensemble_rmse = np.sqrt(ensemble_loss)  # reuse the MSE instead of recomputing it
ensemble_r2 = r2_score(y_test_timeframe, predictions)
print(f'Ensemble: Time Frame Prediction Loss: {ensemble_loss}, MAE: {ensemble_mae}, RMSE: {ensemble_rmse}, R-squared: {ensemble_r2}')

Ensemble: Time Frame Prediction Loss: 9.024933729701191, MAE: 1.901426244471349, RMSE: 3.004152747398373, R-squared: 0.7701212157878716


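Time Frame Prediction Loss: The loss is the mean squared error (MSE) between predicted and actual time frames, approximately 9.02. It is expressed in squared years (the target is YearsAtCompany) and has no fixed upper bound, so it should not be read as a score out of 100.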

Mean Absolute Error (MAE): The average absolute error between predicted and actual time frames is approximately 1.90 years. MAE represents the average magnitude of errors in the predictions.

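Root Mean Squared Error (RMSE): The square root of the average squared differences between predicted and actual time frames is approximately 3.00 years. Because errors are squared before averaging, RMSE penalizes large errors more heavily than MAE does.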

R-squared (R²): The model explains approximately 77% of the variability in the time frames

Overall, the ensemble model shows reasonable performance in predicting time frames, with low prediction loss, MAE, and RMSE, and a moderate to high R-squared value.

In [None]:
# Pick the single ensemble member with the lowest test-set MSE and save it.
# NOTE(review): the selection is made on the test set, so the reported MSE is an
# optimistic estimate for the chosen model - consider selecting on validation data.
if not models:
    raise ValueError('No trained models to choose from; run the training cell first.')

best_model = None
best_mse = float('inf')  # Initialize with a high value

for model in models:
    # Compute the MSE explicitly instead of reading model.evaluate(...)[2]:
    # the positional index depends on the order of the metrics passed to
    # compile() and silently picks a different metric if that list changes.
    candidate_predictions = np.ravel(model.predict(X_test_timeframe_scaled, verbose=0))
    mse = float(mean_squared_error(y_test_timeframe, candidate_predictions))
    if mse < best_mse:
        best_model = model
        best_mse = mse

if best_model is None:
    # Only reachable when every model produced a non-finite MSE (e.g. NaN predictions)
    raise ValueError('No model produced a finite MSE; nothing to save.')

print(f'Best Model MSE: {best_mse}')

# Save the best model
best_model.save('best_model_timeframe.keras')

Best Model MSE: 8.985008239746094
