In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the training and test CSV files into DataFrames in one pass
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('hr_train.csv', 'hr_test.csv')
)

# Preview the top rows of the training frame (last expression is displayed)
train_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.42,0.46,2,150,3,0,1,0,sales,medium
1,0.66,0.77,2,171,2,0,0,0,technical,medium
2,0.55,0.49,5,240,3,0,0,0,technical,high
3,0.22,0.88,4,213,3,1,0,0,technical,medium
4,0.2,0.72,6,224,4,0,1,0,technical,medium


In [3]:
# Column names, non-null counts and dtypes of the training frame
# (info() prints directly to stdout)
train_data.info()

# Descriptive statistics for every column; include="all" also covers the
# non-numeric columns
train_data.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10499 entries, 0 to 10498
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     10499 non-null  float64
 1   last_evaluation        10499 non-null  float64
 2   number_project         10499 non-null  int64  
 3   average_montly_hours   10499 non-null  int64  
 4   time_spend_company     10499 non-null  int64  
 5   Work_accident          10499 non-null  int64  
 6   left                   10499 non-null  int64  
 7   promotion_last_5years  10499 non-null  int64  
 8   sales                  10499 non-null  object 
 9   salary                 10499 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 820.4+ KB


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
count,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499,10499
unique,,,,,,,,,10,3
top,,,,,,,,,sales,low
freq,,,,,,,,,2935,5104
mean,0.612683,0.717131,3.808553,201.059815,3.494238,0.144299,0.292885,0.021716,,
std,0.248578,0.171483,1.230572,49.959332,1.453227,0.35141,0.455108,0.145763,,
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,,
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0,,
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0,,
75%,0.82,0.87,5.0,245.0,4.0,0.0,1.0,0.0,,


In [4]:
# Report missing values explicitly: a bare expression that is not the last
# line of a cell is evaluated and then discarded, so nothing would be shown.
for frame_name, frame in (('train', train_data), ('test', test_data)):
    print(f'Missing values in {frame_name} data: {int(frame.isnull().sum().sum())}')

# Encode categorical variables on copies so the raw frames stay untouched and
# this cell yields the same result (and the same fitted encoders) when re-run.
categorical_columns = ['sales', 'salary']
train_encoded = train_data.copy()
test_encoded = test_data.copy()
label_encoders = {}

for column in categorical_columns:
    encoder = LabelEncoder()
    # Fit on the training labels only; transform() raises ValueError if the
    # test data contains a category that was never seen during fitting.
    train_encoded[column] = encoder.fit_transform(train_data[column])
    test_encoded[column] = encoder.transform(test_data[column])
    # Keep the fitted encoder so the integer codes can be mapped back later.
    label_encoders[column] = encoder

# Separate features and target variable from training data
X_train = train_encoded.drop(columns=['left'])
y_train = train_encoded['left']

# Features from test data, restricted to and ordered like the training
# features so the model always receives the columns it was fitted on
# (a missing column raises KeyError here instead of failing at predict time).
X_test = test_encoded[X_train.columns]

In [5]:
# Random forest classifier with a fixed random_state
rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validation on the training set, scored by ROC AUC
cv_scores = cross_val_score(
    estimator=rf_model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)

# Average the fold scores, then keep a two-decimal version for reporting
mean_auc_score = cv_scores.mean()
mean_auc_score_rounded = round(mean_auc_score, 2)

print(
    f'Cross-validated AUC scores: {cv_scores}',
    f'Mean AUC score: {mean_auc_score_rounded}',
    sep='\n',
)

# Final fit on all training rows; this is the model used for prediction
rf_model.fit(X_train, y_train)

Cross-validated AUC scores: [0.82770633 0.82044565 0.87122553 0.8275837  0.83917943]
Mean AUC score: 0.84


In [8]:
# Second column of predict_proba is taken as the score for each test row
test_predictions = rf_model.predict_proba(X_test)[:, 1]

# Assemble the submission table: the test frame's index serves as the ID
submission_columns = {
    'EmployeeID': test_data.index,
    'Probability': test_predictions,
}
submission = pd.DataFrame(submission_columns)

# Write the table to disk without the DataFrame index
submission.to_csv(path_or_buf='hr_test_predictions.csv', index=False)

# Show the first rows as a quick sanity check
print(submission.head())

   EmployeeID  Probability
0           0     1.000000
1           1     0.500548
2           2     0.498333
3           3     0.985000
4           4     0.966667
