In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the HR employee attrition CSV; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Preview the first rows to sanity-check the parse.
df.head()


In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics of the frame as loaded (before any cleaning).
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Class balance of the target column: one bar per Attrition value.
ax = sns.countplot(data=df, x="Attrition")
ax.set_title("Attrition Distribution")
plt.show()


In [None]:
# Share of each Attrition value, expressed as a percentage of all rows.
df["Attrition"].value_counts(normalize=True) * 100


In [None]:
# Compare the MonthlyIncome distribution between the Attrition groups.
ax = sns.boxplot(data=df, x="Attrition", y="MonthlyIncome")
ax.set_title("Monthly Income vs Attrition")
plt.show()


In [None]:
# Compare the Age distribution between the Attrition groups.
sns.boxplot(x="Attrition", y="Age", data=df)
# Titled like the other figures so the plot stands alone when skimmed.
plt.title("Age vs Attrition")
plt.show()


In [None]:
# Head-count per Department, split by Attrition value.
sns.countplot(x="Department", hue="Attrition", data=df)
# Titled like the other figures so the plot stands alone when skimmed.
plt.title("Attrition by Department")
# Rotate the tick labels so the department names do not overlap.
plt.xticks(rotation=45)
plt.show()


In [None]:
# Head-count per OverTime value, split by Attrition value.
sns.countplot(x="OverTime", hue="Attrition", data=df)
# Titled like the other figures so the plot stands alone when skimmed.
plt.title("Attrition by OverTime")
plt.show()


In [None]:
# Pairwise correlations of the numeric columns, drawn on a 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), cmap="coolwarm", ax=ax)
plt.show()


In [None]:
# Frame dimensions as (rows, columns).
print("shape:", df.shape)

# How many columns there are of each dtype.
print(df.dtypes.value_counts())

# Distinct-value count per column, ascending, so near-constant columns
# appear at the top of the displayed result.
df.nunique().sort_values()


In [None]:
# Remove columns that are not used in the analysis.
# NOTE(review): presumably these are identifier / single-valued columns
# spotted in the nunique check -- confirm against that output.
#
# errors="ignore" keeps this cell re-runnable: an in-place drop raises
# KeyError the second time because the columns are already gone.
df = df.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    errors="ignore",
)


In [None]:
# Names of every numeric-dtype column; reused by the outlier and skew cells.
num_cols = df.select_dtypes(include=np.number).columns
num_cols


In [None]:
# One stand-alone boxplot figure per numeric column.
for col in num_cols:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
    plt.show()


In [None]:
def detect_outliers(data, column):
    """Count the rows of ``data`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``. Missing values compare as False on
    both sides and are therefore never counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of a numeric column in ``data``.

    Returns
    -------
    int
        Number of outlying rows.
    """
    values = data[column]
    q_low, q_high = values.quantile(0.25), values.quantile(0.75)
    spread = q_high - q_low
    fence_low = q_low - 1.5 * spread
    fence_high = q_high + 1.5 * spread

    beyond_fences = (values < fence_low) | (values > fence_high)
    return int(beyond_fences.sum())

# Report the IQR-outlier count for every numeric column.
for col in num_cols:
    print(col, ":", detect_outliers(df, col))


In [None]:
def cap_outliers(data, column):
    """Clamp ``data[column]`` to its 1.5 * IQR fences, modifying ``data`` in place.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * IQR`` by the lower fence; everything else
    (including missing values) is kept as is.

    A column whose IQR is zero is left untouched: both fences would equal
    the quartile value, so capping would overwrite every value that differs
    from it and turn the column into a constant.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify.
    column : str
        Name of a numeric column in ``data``.

    Returns
    -------
    None
    """
    values = data[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Degenerate spread: capping here would erase all variation in the column.
    if IQR == 0:
        return

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    data[column] = np.where(values > upper, upper,
                            np.where(values < lower, lower, values))
    
# Cap every numeric column of df in place.
# NOTE(review): num_cols is *every* numeric-dtype column, so small integer
# scales (e.g. JobSatisfaction, which is later compared with `<= 2`) are
# capped exactly like continuous measures -- confirm that is intended.
for col in num_cols:
    cap_outliers(df, col)


In [None]:
# Skewness of each numeric column, most positively skewed first.
df[num_cols].skew().sort_values(ascending=False)


In [None]:
# Replace MonthlyIncome with log1p(MonthlyIncome).
#
# The transform overwrites its own input, so running this cell twice would
# log an already-logged column. A flag stored in df.attrs records that the
# transform was applied and makes the cell safe to re-run; the flag vanishes
# whenever df is rebuilt from the CSV, which is exactly when the transform
# has to be applied again.
_INCOME_LOG_FLAG = "MonthlyIncome_log1p_applied"
if not df.attrs.get(_INCOME_LOG_FLAG, False):
    df["MonthlyIncome"] = np.log1p(df["MonthlyIncome"])
    df.attrs[_INCOME_LOG_FLAG] = True


In [None]:
# Correlation matrix of the numeric columns in their current state; kept in
# corr_matrix because a later cell reads individual columns from it.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", ax=ax)
plt.show()


In [None]:
# Correlation of every numeric column with MonthlyIncome, strongest positive first.
corr_matrix["MonthlyIncome"].sort_values(ascending=False)


In [None]:
# Split MonthlyIncome into four equal-frequency bands (interval-labelled).
df["SalaryBand"] = pd.qcut(df["MonthlyIncome"], q=4)

# Row-normalised crosstab: Attrition percentages within each salary band.
pd.crosstab(
    index=df["SalaryBand"],
    columns=df["Attrition"],
    normalize="index",
).mul(100)


In [None]:
# Bucket YearsAtCompany into tenure bands.
# pd.cut builds right-closed intervals by default, so without
# include_lowest=True the first band is (0, 3] and anyone with exactly
# 0 years gets NaN and silently disappears from the crosstab below.
df["TenureBand"] = pd.cut(df["YearsAtCompany"],
                          bins=[0,3,7,15,40],
                          include_lowest=True)

# Row-normalised crosstab: Attrition percentages within each tenure band.
pd.crosstab(df["TenureBand"], df["Attrition"], normalize="index") * 100


In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test on MonthlyIncome (in whatever form the column currently
# has) between employees with Attrition "Yes" and "No". ttest_ind is called
# with its defaults.
leave = df.loc[df["Attrition"] == "Yes", "MonthlyIncome"]
stay = df.loc[df["Attrition"] == "No", "MonthlyIncome"]

t_stat, p_val = ttest_ind(leave, stay)
print("P-value:", p_val)


In [None]:
# Rebuild SalaryBand as income quartiles with readable labels; this
# overwrites any SalaryBand column already present in df.
df["SalaryBand"] = pd.qcut(df["MonthlyIncome"], 4, labels=["Low","Mid-Low","Mid-High","High"])


In [None]:
# Attrition percentages within each salary band (each row sums to 100).
salary_segment = pd.crosstab(df["SalaryBand"], df["Attrition"], normalize="index") * 100
print(salary_segment)


In [None]:
# Rebuild TenureBand with readable labels; this overwrites any TenureBand
# column already present in df.
# include_lowest=True makes the first band contain 0: pd.cut intervals are
# right-closed by default, so otherwise employees with exactly 0 years at
# the company would be assigned NaN instead of "0-3".
df["TenureBand"] = pd.cut(df["YearsAtCompany"],
                          bins=[0,3,7,15,40],
                          labels=["0-3","3-7","7-15","15+"],
                          include_lowest=True)


In [None]:
# Attrition percentages within each tenure band (each row sums to 100).
tenure_segment = pd.crosstab(df["TenureBand"], df["Attrition"], normalize="index") * 100
print(tenure_segment)


In [None]:
# Attrition percentages for every (OverTime, SalaryBand) combination; rows
# are normalised, so each combination sums to 100.
combo_segment = pd.crosstab(
    [df["OverTime"], df["SalaryBand"]],
    df["Attrition"],
    normalize="index"
) * 100

print(combo_segment)


In [None]:
# Difference between YearsAtCompany and YearsSinceLastPromotion.
# NOTE(review): going by the column names, this is how far into the tenure
# the last promotion happened, not the time elapsed since it -- confirm the
# intended meaning of "gap".
df["PromotionGap"] = df["YearsAtCompany"] - df["YearsSinceLastPromotion"]


In [None]:
# Bucket PromotionGap into four labelled bands: (-1, 1], (1, 3], (3, 6]
# and everything above 6.
# The top band is open-ended: with a fixed upper edge, any gap larger than
# that edge would fall outside all bins and silently become NaN, which can
# happen whenever the inputs have not been capped beforehand.
df["PromotionGapBand"] = pd.cut(df["PromotionGap"],
                                bins=[-1,1,3,6,np.inf],
                                labels=["Recent","Moderate","Long","Very Long"])


In [None]:
# Attrition percentages within each promotion-gap band (each row sums to 100).
promotion_segment = pd.crosstab(df["PromotionGapBand"], df["Attrition"], normalize="index") * 100
print(promotion_segment)


High Risk Segment Identification

In [None]:
# Boolean flag that is True only when all three conditions hold:
# OverTime is "Yes", YearsAtCompany is below 3, and JobSatisfaction is at most 2.
df["HighRisk"] = (
    df["OverTime"].eq("Yes")
    & df["YearsAtCompany"].lt(3)
    & df["JobSatisfaction"].le(2)
)


In [None]:
# Attrition percentages for flagged vs. unflagged employees (each row sums to 100).
risk_segment = pd.crosstab(df["HighRisk"], df["Attrition"], normalize="index") * 100
print(risk_segment)


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Variance inflation factor (VIF) for each numeric feature.
X_numeric = df.select_dtypes(include=np.number)

# Single-valued columns carry no information and are perfectly collinear
# with an intercept, so they are left out of the VIF computation.
vif_features = X_numeric.loc[:, X_numeric.nunique() > 1]

# variance_inflation_factor regresses each column on the remaining columns
# of the matrix exactly as given and does not add an intercept itself.
# Appending a column of ones yields the usual (centered) VIF rather than an
# inflated, uncentered one.
vif_design = np.column_stack([
    vif_features.to_numpy(dtype=float),
    np.ones(len(vif_features)),
])

# Only the real features are scored; the intercept column is the last one
# and is skipped by iterating over the feature count.
vif_data = pd.DataFrame({
    "Feature": vif_features.columns,
    "VIF": [variance_inflation_factor(vif_design, i)
            for i in range(vif_features.shape[1])],
})

print(vif_data.sort_values(by="VIF", ascending=False))


In [None]:
# Attrition percentages for every distinct YearsAtCompany value, drawn as
# one line per Attrition value.
tenure_risk = pd.crosstab(
    index=df["YearsAtCompany"],
    columns=df["Attrition"],
    normalize="index",
).mul(100)

ax = tenure_risk.plot()
ax.set_title("Attrition Risk Curve by Tenure")
plt.show()


In [None]:
# Preview the frame after the transformations and derived columns above.
df.head()


In [None]:
# Summary statistics of the frame in its current (modified) state.
df.describe()


In [None]:
# Read a fresh, unmodified copy of the CSV for comparison with the working frame.
df_original = pd.read_csv("../data/WA_Fn-UseC_-HR-Employee-Attrition.csv")


In [None]:
# Column counts of the untouched copy vs. the working frame (which has had
# columns dropped and derived columns added).
print("Original Columns:", df_original.shape[1])
print("Cleaned Columns:", df.shape[1])


In [None]:
# Display the working frame.
df

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Re-read the CSV into df.
# NOTE(review): this rebinds df, so the column drops, capping, log transform
# and derived columns produced earlier are no longer present from here on --
# confirm that starting over from the raw data is intended.
df = pd.read_csv("../data/WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

In [None]:
# Assumed working day: 10 hours when OverTime is "Yes", 8 hours for any
# other value.
df['DailyHours'] = (df['OverTime'] == 'Yes').map({True: 10, False: 8})

# Assumed working month: 22 working days.
df['MonthlyWorkingHours'] = df['DailyHours'] * 22

In [None]:
# Hourly pay estimate: MonthlyIncome divided by the assumed monthly hours.
df['Salary_per_hour'] = df['MonthlyIncome'] / df['MonthlyWorkingHours']

In [None]:
# Mean hourly pay per Department ...
dept_avg = df.groupby('Department')['Salary_per_hour'].mean()

# ... attached to every row as that employee's benchmark ("expected") pay.
df['Expected_salary'] = df['Department'].map(dept_avg)

In [None]:
# Hourly pay as a percentage of the department mean (100 == exactly the mean).
df['Salary_Percentage'] = (
    df['Salary_per_hour'] / df['Expected_salary']
) * 100

In [None]:
def salary_category(x, low=90, high=110):
    """Label a salary percentage as "Underpaid", "Fair" or "Overpaid".

    Parameters
    ----------
    x : float
        Pay expressed as a percentage of the expected pay.
    low : float, default 90
        Values strictly below this are "Underpaid".
    high : float, default 110
        Values from ``low`` up to and including this are "Fair"; anything
        larger is "Overpaid".

    Returns
    -------
    str or float
        The category label, or NaN when ``x`` is missing. A missing value
        fails both comparisons, so without the explicit check it would be
        reported as "Overpaid".
    """
    if pd.isna(x):
        return float("nan")
    if x < low:
        return "Underpaid"
    if x <= high:
        return "Fair"
    return "Overpaid"

# Label every employee from their Salary_Percentage value.
df['Salary_Category'] = df['Salary_Percentage'].apply(salary_category)

In [None]:
# Attrition percentages within each salary category (each row sums to 100).
pd.crosstab(df['Salary_Category'], df['Attrition'], normalize='index') * 100