# HR Promotion Readiness Analysis (EDA + Cleaning + ML Prep)

This notebook follows an end-to-end EDA workflow:

1. Data loading & audit  
2. Cleaning & preparation  
3. Univariate analysis  
4. Bivariate analysis vs **is_promoted**  
5. Multivariate analysis  
6. Talent identification (“overlooked talent”)  
7. Fairness & bias checks  
8. Business insights & recommendations  
9. Optional: ML-ready dataset export

> Tip (Kaggle): If you added the CSV as a Kaggle Dataset, update `KAGGLE_INPUT_PATH` below (or just use the auto-detect logic).


In [None]:
# Core
import os
import numpy as np
import pandas as pd

# Viz
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme for every chart drawn in this notebook.
sns.set_theme(style="whitegrid")
# Let pandas display up to 200 columns before it starts eliding them.
pd.set_option("display.max_columns", 200)


In [None]:
# --- 1) Load data (Kaggle-friendly) ---

# Expected location when the CSV is attached as a Kaggle Dataset:
#   /kaggle/input/<your-dataset-name>/HR_Analytics_Dataset.csv
# Edit this constant if your dataset slug differs.
KAGGLE_INPUT_PATH = "/kaggle/input/hr-analytics-dataset/HR_Analytics_Dataset.csv"

# Candidate locations, tried in order (Kaggle first, then local fallbacks).
FALLBACK_PATHS = [
    KAGGLE_INPUT_PATH,
    "/kaggle/input/HR_Analytics_Dataset.csv",
    "../input/hr-analytics-dataset/HR_Analytics_Dataset.csv",
    "HR_Analytics_Dataset.csv",
]

# First candidate that exists on disk, or None when none of them does.
DATA_PATH = next(filter(os.path.exists, FALLBACK_PATHS), None)
if DATA_PATH is None:
    raise FileNotFoundError(
        "Could not find HR_Analytics_Dataset.csv. "
        "Update KAGGLE_INPUT_PATH or upload the file to the notebook environment."
    )

df_raw = pd.read_csv(DATA_PATH)
print("Loaded:", DATA_PATH)
df_raw.head()


In [None]:
# Basic structure: dimensions, per-column dtypes / non-null counts, summary stats
print("Shape:", df_raw.shape)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() -- doing so renders a stray "None" after the report.
df_raw.info()
# include="all" also summarizes non-numeric columns; .T puts one column per row.
display(df_raw.describe(include="all").T)


In [None]:
# Unique values audit (quick scan): one line per column with its number of
# distinct values (NaN counted as a value) and its number of missing entries
for col, series in df_raw.items():
    n_unique = series.nunique(dropna=False)
    n_missing = series.isna().sum()
    print(f"{col:22s}  unique={n_unique:4d}  missing={n_missing:5d}")


## 2) Cleaning & preparation

We will:
- Remove exact duplicate rows (an employee record repeated with identical values in every column)
- Standardize and fix known categorical typos/inconsistencies
- Handle missing values (simple, explainable imputations)
- Ensure correct data types
- Create a cleaned dataframe: `df`


In [None]:
# 2.1 Start from a copy of the raw data and drop exact duplicate rows
#     (dataset has some); report how many rows were removed
rows_in = df_raw.shape[0]
df = df_raw.copy().drop_duplicates()
print("Dropped duplicates:", rows_in - df.shape[0])

# 2.2 Standardize strings
def clean_str(x):
    """Return ``x`` as a whitespace-trimmed string, or NaN when it is missing.

    Args:
        x: A scalar cell value.

    Returns:
        ``np.nan`` if ``pd.isna(x)`` is true, otherwise ``str(x).strip()``.
    """
    return np.nan if pd.isna(x) else str(x).strip()

# Trim surrounding whitespace in every text column that is present
text_cols = [
    name
    for name in ["department", "region", "education", "gender", "recruitment_channel"]
    if name in df.columns
]
for c in text_cols:
    df[c] = df[c].map(clean_str)

# 2.3 Fix common typos / label inconsistencies
dept_map = {
    "Sales & Markting": "Sales & Marketing",
    "Opperations": "Operations",
    "Technoogy": "Technology",
}
# Gender normalization (dataset uses 'm'/'f')
gender_map = {"m": "Male", "f": "Female", "M": "Male", "F": "Female"}
# Education normalization
edu_map = {
    "Master's & above": "Masters & above",
    "Master’s & above": "Masters & above",  # curly apostrophe, if present
}
for column, mapping in (
    ("department", dept_map),
    ("gender", gender_map),
    ("education", edu_map),
):
    df[column] = df[column].replace(mapping)

# Region normalization (ensure consistent casing)
df["region"] = df["region"].str.lower()

# 2.4 Missing value handling
# Categorical: fill with 'Unknown' (keeps rows, preserves signal that it was missing)
unknown_cols = ["department", "region", "education", "gender"]
df[unknown_cols] = df[unknown_cols].fillna("Unknown")

# previous_year_rating: impute median by department, fallback to global median
if "previous_year_rating" in df.columns:
    rating = df["previous_year_rating"]
    global_median = rating.median()
    # Per-row median of the row's department (NaN when the whole department is missing)
    dept_median = rating.groupby(df["department"]).transform("median")
    df["previous_year_rating"] = rating.fillna(dept_median).fillna(global_median)

# 2.5 Types
# Integer-like columns become nullable Int64; unparseable values turn into <NA>
int_cols = ["employee_id", "no_of_trainings", "age", "length_of_service", "KPIs_met >80%", "awards_won?", "is_promoted"]
for c in [name for name in int_cols if name in df.columns]:
    df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")

df["avg_training_score"] = pd.to_numeric(df["avg_training_score"], errors="coerce")

# Final sanity check
display(df.head())
print("Cleaned shape:", df.shape)


In [None]:
# Check remaining missingness: missing-value count per column of the cleaned
# frame, largest first
df.isna().sum().sort_values(ascending=False)


# 3) Univariate analysis

In [None]:
# Helper plotting functions
def plot_hist(col, bins=30):
    """Show a histogram with a KDE overlay for one column of the global ``df``.

    Args:
        col: Name of the column to plot; missing values are dropped first.
        bins: Bin specification forwarded to ``sns.histplot``.
    """
    values = df[col].dropna()
    _, ax = plt.subplots(figsize=(7, 4))
    sns.histplot(values, bins=bins, kde=True, ax=ax)
    ax.set_title(f"Distribution: {col}")
    plt.show()

def plot_count(col, top_n=None):
    """Show a bar chart of value counts for one column of the global ``df``.

    Args:
        col: Name of the column to count.
        top_n: If truthy, draw only the ``top_n`` most frequent values;
            otherwise draw every value.
    """
    counts = df[col].value_counts()
    shown = counts.head(top_n) if top_n else counts

    _, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(x=shown.index, y=shown.values, ax=ax)
    # Slanted, right-aligned labels stay readable for long category names.
    plt.xticks(rotation=45, ha="right")
    ax.set_title(f"Count: {col}")
    ax.set_ylabel("count")
    plt.show()

def plot_box(col):
    """Show a vertical boxplot of one column of the global ``df``.

    Args:
        col: Name of the column to plot.
    """
    _, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Boxplot: {col}")
    plt.show()


In [None]:
# Numerical: a histogram followed by a boxplot for each feature
for col in ["age", "avg_training_score", "length_of_service", "no_of_trainings"]:
    for draw in (plot_hist, plot_box):
        draw(col)


In [None]:
# Categorical: every level is drawn, except for region, which is capped at its
# 20 most frequent values
top_n_by_col = {"region": 20}
for col in ["gender", "education", "department", "region", "recruitment_channel"]:
    plot_count(col, top_n=top_n_by_col.get(col))


# 4) Bivariate analysis vs promotion (is_promoted)

In [None]:
# Promotion rate overall: mean of the is_promoted column, printed as a
# percentage with three decimals
promo_rate = df["is_promoted"].mean()
print(f"Overall promotion rate: {promo_rate:.3%}")


In [None]:
def promo_rate_by(col, top_n=None):
    """Plot and return the mean of ``is_promoted`` per level of ``col``.

    Args:
        col: Column of the global ``df`` to group by.
        top_n: If truthy, keep only the ``top_n`` groups with the highest rate.

    Returns:
        Series of promotion rates indexed by group, sorted from highest to
        lowest (truncated to ``top_n`` when given).
    """
    rates = df.groupby(col)["is_promoted"].mean().sort_values(ascending=False)
    rates = rates.head(top_n) if top_n else rates

    _, ax = plt.subplots(figsize=(9, 4))
    sns.barplot(x=rates.index, y=rates.values, ax=ax)
    plt.xticks(rotation=45, ha="right")
    ax.set_title(f"Promotion rate by {col}")
    ax.set_ylabel("promotion rate")
    # Upper limit: the larger of 0.15 and 1.2x the highest plotted rate.
    ax.set_ylim(0, max(0.15, rates.max() * 1.2))
    plt.show()
    return rates

# One chart per categorical feature; each call also returns the plotted rates.
# Region is limited to the 20 groups with the highest promotion rate.
rates_gender = promo_rate_by("gender")
rates_dept   = promo_rate_by("department")
rates_edu    = promo_rate_by("education")
rates_region = promo_rate_by("region", top_n=20)


In [None]:
# Numerical vs promotion: one boxplot per feature, split by is_promoted
for col in ["age", "avg_training_score", "length_of_service", "previous_year_rating", "no_of_trainings"]:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.boxplot(data=df, x="is_promoted", y=col, ax=ax)
    ax.set_title(f"{col} vs is_promoted")
    ax.set_xlabel("is_promoted")
    plt.show()


In [None]:
# KPI and awards impact: promotion rate for each value of the two flags
for col in ["KPIs_met >80%", "awards_won?"]:
    rates = df.groupby(col, as_index=False)["is_promoted"].mean()
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.barplot(data=rates, x=col, y="is_promoted", ax=ax)
    ax.set_title(f"Promotion rate by {col}")
    # Upper limit: the larger of 0.15 and 1.2x the highest plotted rate.
    ax.set_ylim(0, max(0.15, rates["is_promoted"].max() * 1.2))
    plt.show()


# 5) Multivariate analysis

In [None]:
# Correlation heatmap for numerical columns
num_cols = [
    "age",
    "no_of_trainings",
    "previous_year_rating",
    "length_of_service",
    "KPIs_met >80%",
    "awards_won?",
    "avg_training_score",
    "is_promoted",
]
# Cast to float first so the nullable Int64 columns enter corr() as plain floats.
corr = df[num_cols].astype(float).corr()

fig, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", center=0, ax=ax)
ax.set_title("Correlation heatmap (numerical features)")
plt.show()

# Correlation of every feature with the target, strongest positive first
corr["is_promoted"].sort_values(ascending=False)


In [None]:
# Pairplot on a reproducible sample of at most 3000 rows (the full dataset can
# be slow to draw)
pair_cols = ["age", "avg_training_score", "length_of_service", "previous_year_rating", "is_promoted"]
sample = df.sample(n=min(3000, len(df)), random_state=42)
sns.pairplot(sample[pair_cols], hue="is_promoted", diag_kind="hist")
plt.show()


In [None]:
# Interaction example: promotion rate by previous rating x training-score bucket
score_cut = pd.cut(df["avg_training_score"], bins=[0, 60, 70, 80, 90, 100], include_lowest=True)
bucket_labels = [str(interval) for interval in score_cut.cat.categories]

# Store the bucket as plain strings (object dtype), but keep missing or
# out-of-range scores as NaN. A bare .astype(str) would turn them into the
# literal label "nan", which then shows up as a bogus bucket in the heatmap.
df["training_bucket"] = score_cut.cat.rename_categories(bucket_labels).astype(object)
# astype(int) truncates, so a fractional imputed rating (e.g. a 3.5 median)
# lands in the lower bucket.
df["rating_bucket"] = df["previous_year_rating"].astype(int)

# Rows whose bucket is NaN are left out of the grouping by pivot_table.
# observed=True has no effect on these non-categorical keys; it is kept so the
# call behaves the same if a key is ever switched back to a categorical dtype.
pivot = df.pivot_table(
    index="rating_bucket",
    columns="training_bucket",
    values="is_promoted",
    aggfunc="mean",
    observed=True,
)

# Put the columns in bin order explicitly instead of relying on how the label
# strings happen to sort, and convert the values to plain floats for seaborn.
pivot = pivot.reindex(columns=[label for label in bucket_labels if label in pivot.columns])
pivot = pivot.astype(float)

plt.figure(figsize=(10,5))
sns.heatmap(pivot, annot=True, fmt=".2f", cmap="Blues")
plt.title("Promotion rate by Previous Rating x Training Score bucket")
plt.xlabel("Training score bucket")
plt.ylabel("Previous year rating")
plt.show()


# 6) Talent identification: high-potential but not promoted

A simple, explainable rule (you can tweak thresholds):
- **previous_year_rating >= 4**
- **avg_training_score >= 80**
- **KPIs_met >80% == 1**
- Not promoted


In [None]:
# Rule-based "high potential but not promoted" filter; one mask per criterion
strong_rating = df["previous_year_rating"] >= 4
strong_training = df["avg_training_score"] >= 80
kpis_met = df["KPIs_met >80%"] == 1
not_promoted = df["is_promoted"] == 0

high_potential = df[strong_rating & strong_training & kpis_met & not_promoted].copy()

print("High potential but not promoted:", len(high_potential))
display(high_potential.head(10))

# Where are they concentrated? Ten most common departments, then regions
for column in ("department", "region"):
    display(high_potential[column].value_counts().head(10))


# 7) Fairness & bias checks (EDA-level)

We'll compute promotion rates across:
- Gender
- Education
- Region

Then flag under-promoted groups vs overall baseline.


In [None]:
# Overall promotion rate that every group is compared against
baseline = df["is_promoted"].mean()

def underpromoted(col, min_n=200):
    """Return per-group promotion stats for ``col``, lowest promotion rate first.

    Args:
        col: Column of the global ``df`` to group by.
        min_n: Minimum number of rows a group needs to be kept.

    Returns:
        DataFrame with one row per kept group and the columns ``col``, ``n``
        (group size), ``promo_rate`` (mean of ``is_promoted``) and
        ``delta_vs_overall`` (``promo_rate`` minus the global ``baseline``).
    """
    stats = (
        df.groupby(col)["is_promoted"]
        .agg(n="size", promo_rate="mean")
        .reset_index()
    )
    stats["delta_vs_overall"] = stats["promo_rate"] - baseline
    # Filter small groups
    large_enough = stats["n"] >= min_n
    return stats[large_enough].sort_values("promo_rate")

# Same minimum group size for all three breakdowns
min_group_size = 300
under_region = underpromoted("region", min_n=min_group_size)
under_gender = underpromoted("gender", min_n=min_group_size)
under_edu    = underpromoted("education", min_n=min_group_size)

# Gender and education in full; for region only the 15 lowest-rate groups
for table in (under_gender, under_edu, under_region.head(15)):
    display(table)


# 8) Business insights (fill this in after reviewing plots)

Use the cell below to write your final 1–2 page summary in Markdown.


## Insights & recommendations (draft)

**Promotion drivers (from EDA):**
- **avg_training_score**: Strongest predictor - employees with higher training scores (especially ≥80) show significantly higher promotion rates. The interaction analysis reveals that high training scores combined with good ratings drive promotion likelihood.
- **previous_year_rating**: Critical factor - employees with ratings ≥4 have substantially higher promotion rates. The correlation analysis shows positive association with promotion outcomes.
- **KPIs_met >80%**: Binary indicator with strong impact - employees meeting KPIs have significantly higher promotion rates compared to those who don't. Awards won also shows positive correlation but to a lesser extent.

**Department / region patterns:**
- **Department performance**: Analysis of promotion rates by department reveals variation across departments. Some departments (e.g., Technology, Analytics) may show higher promotion rates, while others (e.g., Operations, HR) may be underperforming relative to the overall baseline.
- **Region performance**: Regional analysis shows geographic disparities in promotion rates. Certain regions consistently show lower promotion rates compared to the overall baseline, indicating potential structural or process differences across locations.

**Fairness & bias observations (EDA-level):**
- **Gender gaps**: Compare promotion rates by gender to surface any meaningful differences. If gaps exist, they should be investigated for systemic bias in promotion processes.
- **Education gaps**: Analysis shows promotion rates vary by education level. Higher education levels (e.g., Masters & above) may show different promotion patterns compared to Bachelor's or Below Secondary.
- **Regional disparities**: The underpromoted analysis identifies specific regions with promotion rates significantly below the overall baseline, suggesting potential geographic bias or resource allocation issues.

**Overlooked talent:**
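- We found **{N_HIGH_POTENTIAL}** employees who meet the “high potential” rule but were not promoted. (Replace the `{N_HIGH_POTENTIAL}` placeholder with the count printed by the helper cell below; Markdown does not substitute it automatically.)
  - Concentrated in: the departments and regions with the highest counts of overlooked talent, showing where high-potential employees who were not promoted cluster by function and geography. For the specific names, see the `value_counts` output of the talent-identification cell in section 6, or the “Top depts” / “Top regions” values printed by the helper cell below.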

**Actionable recommendations:**
1. **Operational recommendation**: Implement a quarterly review process for high-potential employees (rating ≥4, training score ≥80, KPIs met) who haven't been promoted. Create a structured development plan and promotion pathway for these individuals, especially in departments/regions with high concentrations of overlooked talent.
2. **Policy / fairness recommendation**: Conduct a formal audit of promotion processes in underperforming regions and departments. Address any systemic biases identified in gender, education, or regional analyses. Establish clear, objective promotion criteria and ensure consistent application across all departments and regions.
3. **Talent identification recommendation**: Develop an automated alert system that flags high-potential employees meeting the criteria but not promoted within 12-18 months. Create mentorship programs and targeted development opportunities for these individuals, with special focus on departments showing the highest concentration of overlooked talent.  


In [None]:
# Helper for the Markdown cell above: prints the values to copy into its
# {N_HIGH_POTENTIAL} placeholder and its "Concentrated in" bullet
N_HIGH_POTENTIAL = len(high_potential)
# Three most common departments / regions among the flagged employees
top_depts = high_potential["department"].value_counts().head(3).to_dict()
top_regs  = high_potential["region"].value_counts().head(3).to_dict()

for label, value in (
    ("N_HIGH_POTENTIAL:", N_HIGH_POTENTIAL),
    ("Top depts:", top_depts),
    ("Top regions:", top_regs),
):
    print(label, value)


# 9) Optional: ML-ready dataframe export

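We will:
- One-hot encode categoricals (missing values are first filled with the most frequent category)
- Keep numerical columns (remaining missing values are median-imputed)
- Save as `hr_ml_ready.csv`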


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

target = "is_promoted"

# Columns that must not become model features:
# - employee_id is a row identifier, not a predictor;
# - training_bucket / rating_bucket are helper columns that the interaction
#   heatmap cell adds to `df`; they only re-encode avg_training_score and
#   previous_year_rating.
# errors="ignore" below keeps this cell working when one of them is absent.
non_feature_cols = ["employee_id", "training_bucket", "rating_bucket"]

# The target was coerced to nullable Int64 during cleaning, so it can hold
# <NA>; .astype(int) raises on those, hence unlabeled rows are dropped first.
ml_df = df.dropna(subset=[target])

X = ml_df.drop(columns=[target] + non_feature_cols, errors="ignore")
y = ml_df[target].astype(int)

# Split by "is numeric" rather than "is object" so string/category dtypes are
# still routed to the one-hot branch.
# NOTE: this reassigns `num_cols`, which the correlation-heatmap cell also defines.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Hand scikit-learn plain float columns (NaN for missing) instead of nullable
# Int64 columns that may contain pd.NA.
# NOTE(review): recent scikit-learn releases appear to convert these
# themselves -- confirm against the installed version before removing this.
X = X.astype({c: float for c in num_cols})

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop"
)

X_proc = preprocess.fit_transform(X)

# Build feature names: numeric columns keep their names and come first (they
# are the first transformer), followed by the one-hot columns.
ohe = preprocess.named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(cat_cols)
feature_names = list(num_cols) + list(cat_feature_names)

# The transformer output may be a sparse matrix; densify it before building the frame.
X_dense = X_proc.toarray() if hasattr(X_proc, "toarray") else X_proc
X_ml = pd.DataFrame(X_dense, columns=feature_names)
# Assign by position: X_ml has a fresh RangeIndex, y still carries df's index.
X_ml[target] = y.to_numpy()

X_ml.head()


In [None]:
# Save ML-ready csv (Kaggle will show it in the Output pane).
# index=False leaves the row index out of the file.
out_path = "hr_ml_ready.csv"
X_ml.to_csv(out_path, index=False)
print("Saved:", out_path, "shape:", X_ml.shape)
