# Employee Attrition Prediction

Ye notebook **Sales_Forecasting_Dashboard.ipynb** ki style follow karta hua Employee Attrition Prediction pipeline aur simple Streamlit prediction app provide karta hai.

**Usage:** Notebook mein diye gaye `DATA_PATH` ko apne HR dataset (e.g. `HR_Employee_Attrition.csv`) ke path se replace karein.

In [None]:
# Imports
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import joblib

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')


## 1) Data Load & Quick Look

- Default filename: `HR_Employee_Attrition.csv`
- Agar aapka file name alag hai, to `DATA_PATH` change karein.


In [None]:
# Location of the HR dataset CSV; change this when your file is named differently.
DATA_PATH = 'HR_Employee_Attrition.csv'

# Read the CSV when it is present. Otherwise fall back to an empty frame so
# that the following cells can detect the missing data through `df.empty`.
if not os.path.exists(DATA_PATH):
    df = pd.DataFrame()
    print(f"File not found: {DATA_PATH}. Please upload the HR dataset in the notebook folder.")
else:
    df = pd.read_csv(DATA_PATH)
    print('Dataset loaded:', DATA_PATH)

# Preview: first rows, then the frame's dimensions and its column names.
if not df.empty:
    display(df.head())
    for caption, detail in (('\nShape:', df.shape), ('\nColumns:', df.columns.tolist())):
        print(caption, detail)


## 2) Exploratory Data Analysis (EDA)
- Attrition distribution, key feature plots


In [None]:
# Exploratory plots and summaries; the whole cell is a no-op when nothing is loaded.
if df.empty:
    print('No data to show. Load your dataset first.')
else:
    # Count of rows per value of the 'Attrition' column.
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x='Attrition')
    plt.title('Attrition Distribution')
    plt.show()

    # Descriptive statistics of the numeric columns, transposed to one row per column.
    numeric_summary = df.describe(include=[np.number])
    display(numeric_summary.T)

    # Object-dtype columns are treated as the categorical ones.
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    print('Categorical columns:', cat_cols)

    # One horizontal count plot for each of the first six categorical columns,
    # with the bars ordered by descending frequency.
    for c in cat_cols[:6]:
        plt.figure(figsize=(6, 3))
        level_order = df[c].value_counts().index
        sns.countplot(data=df, y=c, order=level_order)
        plt.title(c)
        plt.show()


## 3) Preprocessing & Feature Engineering
- Simple pipeline: impute, encode categoricals, scale numerics


In [None]:
# Build the preprocessing transformer and the stratified train/test split.
if not df.empty:
    # Target column; its values are expected to be the strings 'Yes' / 'No'.
    target = 'Attrition'
    X = df.drop(columns=[target])
    y = df[target].map({'Yes': 1, 'No': 0})

    # Anything other than 'Yes'/'No' (including missing values) maps to NaN.
    # Fail here with a clear message instead of an obscure error during the
    # stratified split or model fitting.
    if y.isna().any():
        bad_values = df.loc[y.isna(), target].unique().tolist()
        raise ValueError(
            f"Column '{target}' must contain only 'Yes'/'No'; found unexpected values: {bad_values}"
        )

    # Split the feature columns by dtype: numeric vs. object (categorical).
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()

    print('Numerical cols:', num_cols)
    print('Categorical cols:', cat_cols)

    # Numeric columns: fill gaps with the median, then standardize.
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
    # `sparse_output` and 1.4 removed the old name, so try the new keyword
    # first and fall back to the old one on releases that do not know it.
    try:
        onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # Categorical columns: fill gaps with the most frequent value, then
    # one-hot encode; categories unseen during fit are ignored at predict time.
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', onehot)
    ])

    # Route each column group through its own pipeline.
    preprocessor = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])

    # 80/20 split, stratified on the target so both parts keep the class ratio.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print('Train/Test shapes:', X_train.shape, X_test.shape)
else:
    print('No data to preprocess. Load your dataset first.')


## 4) Modeling
- Train a Random Forest classifier (Logistic Regression and XGBoost are imported and can be swapped into the same pipeline)
- Evaluate with accuracy, precision, recall, F1, ROC-AUC


In [None]:
# Fit, evaluate and persist a Random Forest pipeline; skipped when no data is loaded.
if df.empty:
    print('No data to model. Load your dataset first.')
else:
    # Chain the preprocessor with the classifier so raw feature rows can be
    # passed straight to fit/predict.
    rf_pipe = Pipeline(steps=[
        ('pre', preprocessor),
        ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ])
    rf_pipe.fit(X_train, y_train)

    # Hard class predictions, plus the probability column at index 1 that is
    # used for ROC AUC.
    y_pred = rf_pipe.predict(X_test)
    y_proba = rf_pipe.predict_proba(X_test)[:, 1]

    # Metrics computed from the predicted labels.
    for metric_label, metric_fn in (
        ('Accuracy:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
        ('F1:', f1_score),
    ):
        print(metric_label, metric_fn(y_test, y_pred))

    # ROC AUC is computed from the probabilities rather than the labels.
    print('ROC AUC:', roc_auc_score(y_test, y_proba))

    print('\nClassification Report:\n', classification_report(y_test, y_pred))

    # Persist the whole fitted pipeline (preprocessing + model) for later cells.
    model_path = 'attrition_model_rf.pkl'
    joblib.dump(rf_pipe, model_path)
    print('Saved model to', model_path)


### Feature importance (approximate)
- Since the pipeline includes one-hot encoding, importances must be mapped back to feature names. This cell tries to recover the encoded feature names and falls back to showing numeric features only when that mapping fails.


In [None]:
# Show the most important features of the fitted Random Forest pipeline.
if not df.empty:
    try:
        pre = rf_pipe.named_steps['pre']
        clf = rf_pipe.named_steps['clf']

        # Local names are used below on purpose: the notebook-level `num_cols`
        # and `cat_cols` hold the raw column lists and must not be overwritten
        # with encoded feature names.
        numeric_features = None
        try:
            # transformers_[0] is the 'num' entry and transformers_[1] the
            # 'cat' entry; index 2 of each entry holds its column list.
            numeric_features = list(pre.transformers_[0][2])
            onehot_step = pre.transformers_[1][1].named_steps['onehot']
            # get_feature_names_out needs scikit-learn >= 1.0.
            encoded_names = onehot_step.get_feature_names_out(pre.transformers_[1][2])
            feature_names = numeric_features + list(encoded_names)
        except (AttributeError, IndexError, KeyError, ValueError):
            # Older scikit-learn, an unfitted categorical branch or an
            # unexpected transformer layout: give up on the full mapping.
            feature_names = None

        importances = clf.feature_importances_
        if feature_names is not None and len(feature_names) == len(importances):
            # Full mapping available: table and bar chart of the top 20.
            imp_df = (
                pd.DataFrame({'feature': feature_names, 'importance': importances})
                .sort_values('importance', ascending=False)
                .head(20)
            )
            display(imp_df)
            plt.figure(figsize=(8, 6))
            sns.barplot(x='importance', y='feature', data=imp_df)
            plt.title('Top feature importances')
            plt.show()
        else:
            print('Could not reliably map feature names to importances. Showing top numeric importances only.')
            if numeric_features is None:
                # Fall back to the column list from the preprocessing cell.
                numeric_features = list(num_cols)
            # The numeric transformer comes first in the ColumnTransformer, so
            # the leading importances belong to the numeric columns.
            imp_df = pd.Series(
                importances[:len(numeric_features)], index=numeric_features
            ).sort_values(ascending=False)
            display(imp_df)
    except Exception as e:
        # Best-effort cell: report the problem instead of stopping the notebook.
        print('Error computing importances:', e)
else:
    print('No data loaded.')


## 5) Predict on a single example
- Take a single employee row (here the first row of the test set) and predict its attrition probability with the saved model.


In [None]:
# Score a single row with the model reloaded from disk; replace `sample` with
# your own one-row frame, using the same columns, to score another employee.
if df.empty:
    print('No data available.')
else:
    # First test row, kept as a one-row DataFrame (slice, not scalar index).
    sample = X_test.iloc[:1].copy()
    display(sample)

    # Reload the persisted pipeline to confirm that the saved file is usable.
    loaded = joblib.load('attrition_model_rf.pkl')
    prob = loaded.predict_proba(sample)[:, 1]
    pred = loaded.predict(sample)

    for caption, value in (
        ('Predicted probability of attrition:', prob[0]),
        ('Predicted class (1=Attrition):', pred[0]),
    ):
        print(caption, value)


## 6) Optional: Streamlit app (save as `streamlit_app.py` and run `streamlit run streamlit_app.py`)

The cell below writes the following code to a separate `streamlit_app.py` file, which provides a simple web UI for predictions.


In [None]:
# Source code of a minimal Streamlit app, kept as a raw string and written to
# `streamlit_app.py` below. The app loads the saved pipeline and predicts from
# sidebar inputs. Because the pipeline needs every column it was fitted on,
# the app checks the inputs against the model's `feature_names_in_` and shows
# a readable error for missing columns instead of failing inside predict_proba.
streamlit_code = r"""
import streamlit as st
import joblib
import pandas as pd

st.title('Employee Attrition Predictor')

# Load model
model = joblib.load('attrition_model_rf.pkl')

st.sidebar.header('Employee input')
# NOTE: Edit the inputs below to match your dataset features
age = st.sidebar.number_input('Age', min_value=18, max_value=80, value=35)
monthly_income = st.sidebar.number_input('Monthly Income', min_value=1000, max_value=50000, value=5000)
job_role = st.sidebar.selectbox('Job Role', ['Sales Executive','Research Scientist','Laboratory Technician','Manufacturing Director','Healthcare Representative','Manager','Sales Representative','Research Director','Human Resources'])

over_time = st.sidebar.selectbox('OverTime', ['Yes','No'])

overlay_df = pd.DataFrame({
    'Age':[age],
    'MonthlyIncome':[monthly_income],
    'JobRole':[job_role],
    'OverTime':[over_time]
})

# Columns the pipeline was fitted on (None when the model does not expose them)
expected_cols = getattr(model, 'feature_names_in_', None)

if st.button('Predict'):
    if expected_cols is None:
        missing = []
    else:
        missing = [str(c) for c in expected_cols if c not in overlay_df.columns]
    if missing:
        st.error('The model expects columns that have no input yet: ' + ', '.join(missing))
    else:
        proba = model.predict_proba(overlay_df)[:,1]
        st.write(f'Probability of attrition: {proba[0]:.3f}')
"""

# Write the app source next to the notebook; the encoding is explicit so the
# result does not depend on the platform default.
with open('streamlit_app.py', 'w', encoding='utf-8') as f:
    f.write(streamlit_code)
print('Saved Streamlit app to streamlit_app.py')


### Notebook & Streamlit file saved
- Notebook available as `/mnt/data/Employee_Attrition_Prediction.ipynb`
- Streamlit app saved as `streamlit_app.py` in the notebook's working directory (e.g. `/mnt/data/streamlit_app.py`)