In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from google.colab import drive
import os

# Mount Google Drive (requires the Colab runtime) and print a short listing
# so the reader can confirm the mount worked.
drive.mount('/content/drive')
drive_root = Path("/content/drive/MyDrive")
print("Drive mounted:", drive_root.exists())
if drive_root.exists():
    # Skip the listing when the folder is absent instead of raising from os.listdir.
    print(os.listdir(drive_root)[:20])

# Optional override (set a path string if needed)
path_override = '/content/drive/MyDrive/google_colab/WA_Fn-UseC_-HR-Employee-Attrition.csv'  # e.g. '/content/drive/MyDrive/ibm_hr.csv'

candidate_paths = [
    'WA_Fn-UseC_-HR-Employee-Attrition.csv',
    'data/WA_Fn-UseC_-HR-Employee-Attrition.csv',
    '/content/WA_Fn-UseC_-HR-Employee-Attrition.csv',
]

# The override has priority, but it is only used when the file actually exists;
# otherwise we fall back to the candidate locations. Previously a non-empty
# override was taken unconditionally, so a stale path bypassed both the
# fallbacks and the explanatory error below.
search_paths = ([path_override] if path_override else []) + candidate_paths
path = next((p for p in search_paths if Path(p).exists()), None)
if path is None:
    raise FileNotFoundError('Dataset not found. Download the CSV from Kaggle and place it next to the notebook or in data/, or set path_override.')

# Raw, unmodified dataset; later cells derive `df_prep` from it.
df = pd.read_csv(path)

# EX01-EXTRA — IBM HR Data Analysis (Pandas)

**Objective:** practice Pandas on a real dataset and produce meaningful insights.
**Dataset:** IBM HR Analytics Employee Attrition & Performance.

**Note:** If the file is not found, download it from Kaggle and place it next to the notebook (or in `data/`), or set `path_override` in the first cell.


# Import


In [1]:
# Imports and data loading are in the first cell.


## 1. Load data
Place the CSV next to the notebook or set the path manually.


In [2]:
# Data is loaded in the first cell.

## 2. Initial inspection
Check shape, dtypes, and missing values.


In [3]:
# Dimensions of the raw frame as (n_rows, n_columns).
df.shape


(1470, 35)

In [4]:
# Per-column dtypes of the raw frame, to tell numeric columns from text ones.
df.dtypes


Age                         int64
Attrition                     str
BusinessTravel                str
DailyRate                   int64
Department                    str
DistanceFromHome            int64
Education                   int64
EducationField                str
EmployeeCount               int64
EmployeeNumber              int64
EnvironmentSatisfaction     int64
Gender                        str
HourlyRate                  int64
JobInvolvement              int64
JobLevel                    int64
JobRole                       str
JobSatisfaction             int64
MaritalStatus                 str
MonthlyIncome               int64
MonthlyRate                 int64
NumCompaniesWorked          int64
Over18                        str
OverTime                      str
PercentSalaryHike           int64
PerformanceRating           int64
RelationshipSatisfaction    int64
StandardHours               int64
StockOptionLevel            int64
TotalWorkingYears           int64
TrainingTimesL

In [5]:
# Count missing values per column, then show the 20 columns with the most gaps.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)


Age                        0
Attrition                  0
BusinessTravel             0
DailyRate                  0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeNumber             0
EnvironmentSatisfaction    0
Gender                     0
HourlyRate                 0
JobInvolvement             0
JobLevel                   0
JobRole                    0
JobSatisfaction            0
MaritalStatus              0
MonthlyIncome              0
MonthlyRate                0
dtype: int64

Problem type: **binary classification**.

Suggested target: **Attrition** (left vs stayed).

## 3. Data preparation
Split features, drop constants, create a derived feature.


In [6]:
# Partition the raw columns by dtype: numeric ones vs. everything else.
numeric_index = df.select_dtypes(include='number').columns
non_numeric_index = df.select_dtypes(exclude='number').columns
num_cols = numeric_index.tolist()
cat_cols = non_numeric_index.tolist()
(num_cols, cat_cols)


(['Age',
  'DailyRate',
  'DistanceFromHome',
  'Education',
  'EmployeeCount',
  'EmployeeNumber',
  'EnvironmentSatisfaction',
  'HourlyRate',
  'JobInvolvement',
  'JobLevel',
  'JobSatisfaction',
  'MonthlyIncome',
  'MonthlyRate',
  'NumCompaniesWorked',
  'PercentSalaryHike',
  'PerformanceRating',
  'RelationshipSatisfaction',
  'StandardHours',
  'StockOptionLevel',
  'TotalWorkingYears',
  'TrainingTimesLastYear',
  'WorkLifeBalance',
  'YearsAtCompany',
  'YearsInCurrentRole',
  'YearsSinceLastPromotion',
  'YearsWithCurrManager'],
 ['Attrition',
  'BusinessTravel',
  'Department',
  'EducationField',
  'Gender',
  'JobRole',
  'MaritalStatus',
  'Over18',
  'OverTime'])

In [7]:
# Drop constant columns: a column with at most one distinct value
# (NaN counted as a value) cannot distinguish one row from another.
distinct_counts = df.nunique(dropna=False)
constant_cols = distinct_counts.index[distinct_counts <= 1].tolist()
df_prep = df.drop(columns=constant_cols)
constant_cols


['EmployeeCount', 'Over18', 'StandardHours']

In [8]:
# Derived feature: bucket employees by tenure; if YearsAtCompany is absent,
# bucket by age instead. Each spec is
# (source column, new column, bin edges, bucket labels).
bucket_specs = [
    (
        'YearsAtCompany',
        'TenureGroup',
        [-1, 2, 5, 10, 20, np.inf],
        ['0-2', '3-5', '6-10', '11-20', '20+'],
    ),
    (
        'Age',
        'AgeGroup',
        [0, 24, 34, 44, 54, np.inf],
        ['<25', '25-34', '35-44', '45-54', '55+'],
    ),
]

for source_col, new_col, edges, bucket_names in bucket_specs:
    if source_col in df_prep.columns:
        df_prep[new_col] = pd.cut(df_prep[source_col], bins=edges, labels=bucket_names)
        break  # only the first available source column is used


## 4. Data analysis
Answer 3+ questions from the task (groupby, aggregates, filtering).


In [9]:
# Compare income and job satisfaction between leavers and stayers,
# using whichever of the two metric columns are present.
metric_candidates = ('MonthlyIncome', 'JobSatisfaction')
if 'Attrition' in df_prep.columns:
    cols = [name for name in metric_candidates if name in df_prep.columns]
    if cols:
        summary_stats = ['mean', 'median', 'count']
        display(df_prep.groupby('Attrition')[cols].agg(summary_stats))


Unnamed: 0_level_0,MonthlyIncome,MonthlyIncome,MonthlyIncome,JobSatisfaction,JobSatisfaction,JobSatisfaction
Unnamed: 0_level_1,mean,median,count,mean,median,count
Attrition,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
No,6832.739659,5204.0,1233,2.778589,3.0,1233
Yes,4787.092827,3202.0,237,2.468354,3.0,237


**Question 1:** Do employees who left differ in salary or satisfaction?

**Answer:** Yes. In the table above, the Attrition=Yes group has lower mean/median MonthlyIncome and slightly lower JobSatisfaction, so leavers are paid less on average and are a bit less satisfied.

In [10]:
# Department/Role attrition rate
def _attrition_rate_by(frame, group_col, target_col='Attrition', positive='Yes'):
    """Return the share of rows where `target_col == positive` per `group_col` value.

    Parameters
    ----------
    frame : pd.DataFrame
        Table containing both `group_col` and `target_col`.
    group_col : str
        Column whose values define the groups.
    target_col : str, default 'Attrition'
        Column holding the outcome label.
    positive : str, default 'Yes'
        Label counted as a positive outcome.

    Returns
    -------
    pd.Series
        Rates in [0, 1], indexed by group, sorted from highest to lowest.
        The series keeps the name of `target_col`.
    """
    # Vectorised flag + grouped mean replaces the per-group `apply(lambda ...)`.
    is_positive = frame[target_col].eq(positive)
    return is_positive.groupby(frame[group_col]).mean().sort_values(ascending=False)


if 'Attrition' in df_prep.columns:
    if 'Department' in df_prep.columns:
        dept_rate = _attrition_rate_by(df_prep, 'Department')
        display(dept_rate)
    if 'JobRole' in df_prep.columns:
        role_rate = _attrition_rate_by(df_prep, 'JobRole')
        # Only the ten roles with the highest rates are shown.
        display(role_rate.head(10))


Department
Sales                     0.206278
Human Resources           0.190476
Research & Development    0.138398
Name: Attrition, dtype: float64

JobRole
Sales Representative         0.397590
Laboratory Technician        0.239382
Human Resources              0.230769
Sales Executive              0.174847
Research Scientist           0.160959
Manufacturing Director       0.068966
Healthcare Representative    0.068702
Manager                      0.049020
Research Director            0.025000
Name: Attrition, dtype: float64

**Question 2:** Which departments or roles have the highest attrition share?

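**Answer:** By department, Sales has the highest attrition rate (≈20.6%), followed by Human Resources (≈19.0%) and Research & Development (≈13.8%). By role, Sales Representatives stand out (≈39.8%), followed by Laboratory Technicians (≈23.9%) and Human Resources (≈23.1%); Managers and Research Directors have the lowest rates (≈4.9% and ≈2.5%).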

In [11]:
# Share of leavers among employees who do / do not work overtime.
if all(col in df_prep.columns for col in ('OverTime', 'Attrition')):
    left_company = df_prep['Attrition'] == 'Yes'
    rate_per_overtime = left_company.groupby(df_prep['OverTime']).mean()
    overtime_rate = rate_per_overtime.sort_values(ascending=False)
    display(overtime_rate)


OverTime
Yes    0.305288
No     0.104364
Name: Attrition, dtype: float64

**Question 3:** Is OverTime related to attrition?

**Answer:** Yes. The table above shows a much higher attrition rate for OverTime=Yes than OverTime=No, indicating a strong positive association.

In [12]:
# Tenure group vs income
if 'TenureGroup' in df_prep.columns and 'MonthlyIncome' in df_prep.columns:
    # TenureGroup is categorical (built with pd.cut). `observed` is passed
    # explicitly so the table has the same rows on every pandas version and
    # the FutureWarning about the changing default is not emitted.
    tenure_income = (
        df_prep.groupby('TenureGroup', observed=True)['MonthlyIncome']
        .agg(['mean', 'median', 'count'])
    )
    display(tenure_income)


Unnamed: 0_level_0,mean,median,count
TenureGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-2,4709.336257,2918.5,342
3-5,5364.891705,4228.5,434
6-10,6567.370536,5667.0,448
11-20,9009.138889,7969.0,180
20+,16007.969697,17121.0,66


**Question 4:** How are tenure and salary related?

**Answer:** The aggregated table shows that mean/median MonthlyIncome increases as TenureGroup grows. This suggests a positive relationship between tenure and salary.

In [None]:
# Attrition rate by tenure/age group (filtered by group size)
# Groups smaller than this are hidden: their rates are too noisy to compare.
MIN_GROUP_SIZE = 30

if 'Attrition' in df_prep.columns:
    # Prefer the tenure buckets; fall back to age buckets if only those exist.
    group_col = next(
        (name for name in ('TenureGroup', 'AgeGroup') if name in df_prep.columns),
        None,
    )

    if group_col:
        attrition_by_group = (
            df_prep.assign(AttritionFlag=df_prep['Attrition'].eq('Yes'))
            # The grouping column is categorical; `observed` is explicit so the
            # result does not depend on the pandas version's default.
            .groupby(group_col, observed=True)['AttritionFlag']
            .agg(attrition_rate='mean', count='count')
            .sort_values('attrition_rate', ascending=False)
        )
        # Filter out tiny groups to avoid noisy rates
        display(attrition_by_group[attrition_by_group['count'] >= MIN_GROUP_SIZE])


**Optional:** Attrition share by tenure/age group

**Answer:** The table above shows attrition rate by tenure (or age if tenure is unavailable). It is sorted by attrition_rate, so the highest groups are at the top. Very small groups are filtered out (count < 30) to reduce noise.

In [None]:
# Job satisfaction per department: median, mean and head count,
# ordered from the highest median to the lowest.
if all(col in df_prep.columns for col in ('Department', 'JobSatisfaction')):
    satisfaction_groups = df_prep.groupby('Department')['JobSatisfaction']
    dept_satisfaction = satisfaction_groups.agg(['median', 'mean', 'count'])
    dept_satisfaction = dept_satisfaction.sort_values('median', ascending=False)
    display(dept_satisfaction)


**Optional:** Does median satisfaction differ across departments?

**Answer:** Yes, medians vary across departments. The table above is sorted by median JobSatisfaction, so departments with higher median satisfaction appear at the top.

In [None]:
# Features with strongest differences between Attrition groups
if 'Attrition' in df_prep.columns:
    # Use a dedicated name here: reusing `num_cols` would silently replace the
    # list built from the raw frame earlier in the notebook with an Index
    # taken from `df_prep`.
    numeric_features = df_prep.select_dtypes(include='number').columns
    attrition_labels = set(df_prep['Attrition'].dropna().unique())
    if len(numeric_features) > 0 and {'Yes', 'No'}.issubset(attrition_labels):
        stats = df_prep.groupby('Attrition')[numeric_features].agg(['mean', 'median'])
        # Slice each statistic once: rows are the Attrition labels, columns the features.
        group_means = stats.xs('mean', level=1, axis=1)
        group_medians = stats.xs('median', level=1, axis=1)
        mean_diff = (group_means.loc['Yes'] - group_means.loc['No']).abs()
        median_diff = (group_medians.loc['Yes'] - group_medians.loc['No']).abs()
        # NOTE: differences are in each feature's raw units, so the ranking is
        # scale-dependent and not comparable across features as-is.
        diff_df = (
            pd.DataFrame({'mean_diff': mean_diff, 'median_diff': median_diff})
            .sort_values('median_diff', ascending=False)
        )
        display(diff_df.head(10))


**Optional:** Features whose distributions differ the most between leavers and stayers

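**Answer:** The table above lists the top‑10 numeric features ranked by the absolute median difference between Attrition=Yes and Attrition=No, with the absolute mean difference shown alongside. The differences are in each feature's raw units, so large-scale columns rank higher; standardize the features before comparing the gaps across columns. Features with a clear gap are good candidates for modeling.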

## 5. Summary
- Key observations:
  1) Leavers have lower income and slightly lower satisfaction (see Attrition table).
  2) OverTime is strongly associated with higher attrition.
  3) Salary increases with tenure; attrition varies by department/role.
- Most informative features: OverTime, MonthlyIncome, YearsAtCompany, JobRole/Department, JobSatisfaction.
- 3 features for a model and rationale:
  - OverTime — largest difference in attrition rate between groups.
  - MonthlyIncome — clear mean/median gap for Attrition=Yes vs No.
  - YearsAtCompany (TenureGroup) — strong tenure gradient and link to pay/attrition.
- ML task: binary classification (predict Attrition Yes/No).
