# <b> 5. Feature Selection<b/>

### <b>Import needed libraries</b>

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
sys.path.append(os.path.abspath('..'))
#### FEATURE SELECTION
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from sklearn.linear_model import LogisticRegression 

import warnings
warnings.filterwarnings('ignore')


## <b> 5.1 Using Train Dataset<b/>

In [22]:
df_train = pd.read_csv('C:/Users/P058886/Downloads/HR_Attrition_Train_Dataset.csv')

### <b> 5.1.1 Preprocessing</b>

In [23]:
df_train

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,47,No,Travel_Rarely,1225,Sales,2,4,Life Sciences,1,1676,...,3,80,3,29,2,3,3,2,1,2
1,22,No,Travel_Rarely,594,Research & Development,2,1,Technical Degree,1,169,...,3,80,1,3,2,3,2,1,2,1
2,46,No,Travel_Rarely,406,Sales,3,1,Marketing,1,1124,...,4,80,1,23,3,3,12,9,4,9
3,25,No,Travel_Rarely,622,Sales,13,1,Medical,1,645,...,3,80,0,7,1,3,7,4,0,6
4,43,No,Travel_Frequently,1001,Research & Development,9,5,Medical,1,663,...,2,80,1,10,3,3,8,7,4,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,23,Yes,Travel_Rarely,427,Sales,7,3,Life Sciences,1,1702,...,2,80,1,3,2,3,3,2,0,2
1172,38,No,Travel_Rarely,1009,Sales,2,2,Life Sciences,1,1355,...,4,80,1,11,3,3,7,7,1,7
1173,22,No,Travel_Rarely,217,Research & Development,8,1,Life Sciences,1,1019,...,1,80,1,4,3,2,4,3,1,1
1174,36,No,Travel_Rarely,430,Research & Development,2,4,Other,1,1847,...,4,80,1,15,2,3,1,0,0,0


In [24]:
# Outliers
def treat_outliers_iqr(df_train, cols, verbose=True):
    
    report = []

    for col in cols:
        if col not in df_train.columns:
            print(f"⚠️ Column '{col}' not found in DataFrame.")
            continue

        # Calculate IQR limits
        Q1 = df_train[col].quantile(0.25)
        Q3 = df_train[col].quantile(0.75)
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

        # Count outliers before clipping
        count_before = ((df_train[col] < lower) | (df_train[col] > upper)).sum()

        # Apply clipping
        df_train[col] = np.clip(df_train[col], lower, upper)

        # Count outliers after clipping
        count_after = ((df_train[col] < lower) | (df_train[col] > upper)).sum()

        # Save results
        report.append({
            "Column": col,
            "Q1": round(Q1, 3),
            "Q3": round(Q3, 3),
            "IQR": round(IQR, 3),
            "Lower Limit": round(lower, 3),
            "Upper Limit": round(upper, 3),
            "Outliers Before": count_before,
            "Outliers After": count_after,
            "% Changed": round(100 * (count_before - count_after) / len(df_train), 2)
        })

        if verbose:
            print(f"{col}: {count_before} → {count_after} outliers (treated {count_before - count_after})")

    report_df = pd.DataFrame(report)
    return df_train, report_df


In [25]:
cols_to_drop = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
df_train = df_train.drop(columns=cols_to_drop)

In [26]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [27]:
# Check the shape of the splits
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (940, 30)
X_test shape: (236, 30)
y_train shape: (940,)
y_test shape: (236,)


In [34]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# Identify categorical columns
cat_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender',
            'JobRole', 'MaritalStatus', 'OverTime']
num_cols = X_train.select_dtypes(include=['number']).columns.tolist()

# ColumnTransformer
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False)
ct = ColumnTransformer(
    transformers=[('ohe', ohe, cat_cols)],
    remainder='passthrough'  # keep numeric columns
)

# Fit and transform training data
X_train_encoded_array = ct.fit_transform(X_train)

# Get column names and remove 'remainder__' prefix
feature_names = ct.get_feature_names_out()
feature_names = [name.replace('remainder__', '') for name in feature_names]

# Convert to DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded_array, columns=feature_names, index=X_train.index)

# Transform test data
X_test_encoded_array = ct.transform(X_test)
X_test_encoded = pd.DataFrame(X_test_encoded_array, columns=feature_names, index=X_test.index)

print("✅ Encoding complete")
print("X_train_encoded shape:", X_train_encoded.shape)
print("X_test_encoded shape:", X_test_encoded.shape)


✅ Encoding complete
X_train_encoded shape: (940, 44)
X_test_encoded shape: (236, 44)


### <b>Feature Selection using mutual information</b>

In [35]:
selector = SelectKBest(score_func=mutual_info_classif, k=20)
X_train_selected = selector.fit_transform(X_train_encoded, y_train)

# Apply same selection to test set
X_test_selected = X_test_encoded.iloc[:, selector.get_support()]

# Get selected feature names
selected_features = X_train_encoded.columns[selector.get_support()]
feature_scores = pd.DataFrame({
    'Feature': selected_features,
    'Score': selector.scores_[selector.get_support()]
}).sort_values(by='Score', ascending=False).reset_index(drop=True)

X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_features, index=X_train.index)

print("\n✅ Top features selected:")
print(feature_scores.head(10))
print("\nShape of reduced training dataset:", X_train_selected_df.shape)


✅ Top features selected:
                                 Feature     Score
0                          MonthlyIncome  0.047167
1                         YearsAtCompany  0.042140
2                       StockOptionLevel  0.032868
3                             HourlyRate  0.026210
4             ohe__MaritalStatus_Married  0.022272
5                                    Age  0.022086
6                      ohe__OverTime_Yes  0.021993
7                   YearsWithCurrManager  0.021613
8      ohe__JobRole_Sales Representative  0.019107
9  ohe__BusinessTravel_Travel_Frequently  0.018765

Shape of reduced training dataset: (940, 20)
