# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Optional: Uncomment if you want to use XGBoost
# from xgboost import XGBClassifier

# Read the employee attrition CSV into a DataFrame
df = pd.read_csv("6. Predict Employee Attrition.csv")

# Report the dataset dimensions, its column names, and a five-row preview
column_names = df.columns.tolist()
print("Shape of the dataset:", df.shape)
print("\nColumns in the dataset:", column_names)
print("\nFirst 5 rows of the dataset:")
preview = df.head(5)
print(preview)

# Tally the missing entries per column and print the result
missing_counts = df.isna().sum()
print("\nMissing values in each column:\n", missing_counts)

# Replace every object-dtype (string) column with integer codes.
# One LabelEncoder is created per column and kept in `label_encoders`,
# keyed by column name, so the codes can be mapped back to labels later.
object_columns = df.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in object_columns}
for column, le in label_encoders.items():
    # Fit on the column's values and overwrite the column with the codes
    df[column] = le.fit_transform(df[column])

# Define feature matrix (X) and target vector (y).
# Make sure 'Attrition' is the correct column name for your target variable;
# the explicit check below fails early with the list of available columns
# instead of an opaque KeyError raised from inside pandas.
TARGET_COLUMN = 'Attrition'
if TARGET_COLUMN not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found; "
        f"available columns: {df.columns.tolist()}"
    )
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# Split the dataset into training and testing sets (80% train, 20% test).
# stratify=y keeps the class proportions of the target the same in both
# splits, so that when the classes are imbalanced the held-out metrics are
# not distorted by an unlucky draw of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Bring every feature to zero mean and unit variance.
# The scaling statistics are learned from the training split only and then
# reused unchanged for the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a 100-tree Random Forest with a fixed seed and fit it on the
# scaled training data (fit() returns the fitted estimator itself)
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params).fit(X_train_scaled, y_train)

# Produce class predictions for the scaled test split
y_pred = model.predict(X_test_scaled)

# Score the predictions against the held-out labels and print each result
# under its heading, in order: accuracy, classification report, confusion matrix
metric_outputs = (
    ("\nAccuracy Score:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
)
for heading, metric in metric_outputs:
    print(heading, metric(y_test, y_pred))

# Rank the features by the importance the fitted forest assigned to them
importances = model.feature_importances_
features = X.columns
indices = np.flip(np.argsort(importances))  # ascending argsort reversed -> most important first

# Reorder values and labels by rank so the bars are drawn from most to least important
ranked_importances = importances[indices]
ranked_features = features[indices]

# Draw a horizontal bar chart: one bar per feature, bar length = importance
plt.figure(figsize=(10, 6))
sns.barplot(x=ranked_importances, y=ranked_features)
plt.xlabel("Importance")
plt.ylabel("Features")
plt.title("Feature Importance from Random Forest")
plt.tight_layout()
plt.show()
