<a href="https://colab.research.google.com/github/rudraymehra/RESS-Attrition-Prediction/blob/main/RESS_Attrition_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
##  RESS builds a transparent attrition-prediction system by combining HR datasets with
## market intelligence (reviews, salaries, hiring trends), surfaces key drivers via SHAP,
## and maps them to practical retention recommendations through a Streamlit dashboard and API.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os

BASE = "/content/drive/MyDrive/RESS-Attrition-Prediction"
folders = [
    "data/raw", "data/processed", "data/external",
    "notebooks", "src", "models", "reports/figures"
]
for f in folders:
    os.makedirs(os.path.join(BASE, f), exist_ok=True)

print("Created:", BASE)

Mounted at /content/drive
Created: /content/drive/MyDrive/RESS-Attrition-Prediction


In [2]:
from google.colab import files
import os, shutil

BASE = "/content/drive/MyDrive/RESS-Attrition-Prediction"
RAW  = os.path.join(BASE, "data", "raw")
os.makedirs(RAW, exist_ok=True)

uploaded = files.upload()
for src in uploaded.keys():
    dst = os.path.join(RAW, os.path.basename(src))
    shutil.move(src, dst)
    print("Saved:", dst)

# verify
import glob
print("Now in RAW:", glob.glob(os.path.join(RAW, "*.csv")))
#files.upload used for uploading the data set.

Saving sample_submission.csv to sample_submission.csv
Saving test_data.csv to test_data.csv
Saving train_data.csv to train_data.csv
Saved: /content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/sample_submission.csv
Saved: /content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/test_data.csv
Saved: /content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/train_data.csv
Now in RAW: ['/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/hr_attrition.csv', '/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/sample_submission.csv', '/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/test_data.csv', '/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/train_data.csv']


In [7]:
import os, glob, shutil

BASE = "/content/drive/MyDrive/RESS-Attrition-Prediction"
RAW  = os.path.join(BASE, "data", "raw")

def rename_if_needed(pattern, target_name):
    hits = sorted(glob.glob(os.path.join(RAW, pattern)))
    if not hits:
        print(f"Not found: {pattern}")
        return
    src = hits[-1]  # pick the last (usually latest)
    dst = os.path.join(RAW, target_name)
    if os.path.abspath(src) != os.path.abspath(dst):
        shutil.move(src, dst)
        print(f"Renamed {os.path.basename(src)} -> {target_name}")
    else:
        print(f"{target_name} already set")

rename_if_needed("train_data*.csv", "train_data.csv")
rename_if_needed("test_data*.csv",  "test_data.csv")
rename_if_needed("sample_submission*.csv", "sample_submission.csv")

print("Now in RAW:", glob.glob(os.path.join(RAW, "*.csv")))


#.  to have consistent filenames for training, testing and submission.
#.     even if they are uploaded with slightly diffrent name.

train_data.csv already set
test_data.csv already set
sample_submission.csv already set
Now in RAW: ['/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/hr_attrition.csv', '/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/sample_submission.csv', '/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/test_data.csv', '/content/drive/MyDrive/RESS-Attrition-Prediction/data/raw/train_data.csv']


In [10]:
import pandas as pd, os, numpy as np

BASE = "/content/drive/MyDrive/RESS-Attrition-Prediction"
RAW  = os.path.join(BASE, "data", "raw")
train_path = os.path.join(RAW, "train_data.csv")
test_path  = os.path.join(RAW, "test_data.csv")
sub_path   = os.path.join(RAW, "sample_submission.csv")

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)
sub   = pd.read_csv(sub_path)

print("train:", train.shape)
print("test:",  test.shape)
print("sub:",   sub.shape)
print("train cols:", train.columns.tolist()[:25])
print("test  cols:",  test.columns.tolist()[:25])
print("sub   cols:",  sub.columns.tolist())

# try to detect target + id
possible_targets = ["Attrition","attrition","Left","left","Status","status","target","Target","label","Label"]
target_col = next((c for c in possible_targets if c in train.columns), None)

# ID is the column common to test and sample_submission
id_candidates = [c for c in sub.columns if c in test.columns]
id_col = id_candidates[0] if id_candidates else None

print("Detected target:", target_col)
print("Detected ID:", id_col)

#    setting up the paths, loading the data and then printing the data info
#     and detects the the target column by checking the first match of
#         column in sub and test. and then prints target and id


train: (19104, 13)
test: (741, 1)
sub: (741, 2)
train cols: ['MMM-YY', 'Emp_ID', 'Age', 'Gender', 'City', 'Education_Level', 'Salary', 'Dateofjoining', 'LastWorkingDate', 'Joining Designation', 'Designation', 'Total Business Value', 'Quarterly Rating']
test  cols: ['Emp_ID']
sub   cols: ['Emp_ID', 'Target']
Detected target: None
Detected ID: Emp_ID


In [12]:
import os, pandas as pd, numpy as np
from datetime import datetime

# paths
BASE = "/content/drive/MyDrive/RESS-Attrition-Prediction"
RAW  = os.path.join(BASE, "data", "raw")
train_path = os.path.join(RAW, "train_data.csv")
test_path  = os.path.join(RAW, "test_data.csv")
sub_path   = os.path.join(RAW, "sample_submission.csv")

# load
train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)
sub   = pd.read_csv(sub_path)

# --- build label: 1 if LastWorkingDate present, else 0 (train only) ---
y = (pd.to_datetime(train["LastWorkingDate"], errors="coerce").notna()).astype(int)

# --- engineer tenure (months) at snapshot MMM-YY (no leakage) ---
def parse_mmm_yy(x):
    try:
        return datetime.strptime(str(x), "%b-%y")  # e.g., 'Jan-14'
    except Exception:
        return pd.NaT

snap_dt = train["MMM-YY"].apply(parse_mmm_yy)
doj_dt  = pd.to_datetime(train["Dateofjoining"], errors="coerce")

def tenure_months(snap, doj):
    if pd.isna(snap) or pd.isna(doj): return np.nan
    return (snap.year - doj.year)*12 + (snap.month - doj.month)

train["tenure_months_snap"] = [tenure_months(s, d) for s, d in zip(snap_dt, doj_dt)]

# --- choose features (drop IDs + raw dates + leakage) ---
drop_cols = ["Emp_ID", "MMM-YY", "Dateofjoining", "LastWorkingDate"]
feat_cols = [c for c in train.columns if c not in drop_cols] + ["tenure_months_snap"]

# raw feature tables (no encoding yet)
X_train_raw = train[feat_cols].copy()

# test has only Emp_ID; create same columns (filled NaN for now)
X_test_raw = pd.DataFrame(columns=feat_cols, index=range(len(test)))

print("y shape:", y.shape, "| positives (left=1):", int(y.sum()))
print("X_train_raw:", X_train_raw.shape, "| X_test_raw:", X_test_raw.shape)
print("First 8 feature columns:", feat_cols[:8])
X_train_raw.head(3)



#        after setting the path and loading the data, the pipeline for churn or
#     attrition. defining the target and creating relevant features like tenure and
#                         selecting initial features used for modeling

y shape: (19104,) | positives (left=1): 1616
X_train_raw: (19104, 11) | X_test_raw: (741, 11)
First 8 feature columns: ['Age', 'Gender', 'City', 'Education_Level', 'Salary', 'Joining Designation', 'Designation', 'Total Business Value']


Unnamed: 0,Age,Gender,City,Education_Level,Salary,Joining Designation,Designation,Total Business Value,Quarterly Rating,tenure_months_snap,tenure_months_snap.1
0,28,Male,C23,Master,57387,1,1,2381060,2,,
1,28,Male,C23,Master,57387,1,1,-665480,2,,
2,28,Male,C23,Master,57387,1,1,0,2,,
