In [2]:
from pathlib import Path

import dill
import pandas as pd
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the employee records from disk
file_path = '../data/Employee.csv'
df = pd.read_csv(file_path)

# Label column, followed by the input columns grouped by how they are preprocessed
target = 'LeaveOrNot'
numerical_features = ['JoiningYear', 'Age', 'ExperienceInCurrentDomain']
categorical_features = ['Education', 'City', 'Gender', 'EverBenched', 'PaymentTier']

# Label vector and feature matrix (categorical columns first, then numerical)
y = df[target]
X = df[categorical_features + numerical_features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: fill gaps with the median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own transformer (numerical first)
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing followed by a seeded 100-tree random forest
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Train the model
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split so the artefact written
# below has at least one sanity check; otherwise X_test / y_test go unused.
print(f"Held-out accuracy: {model.score(X_test, y_test):.3f}")

# Save the model using dill
dill_filename = '../model/employee_pipeline.pkl'
# open() does not create missing directories, so make sure the target exists
Path(dill_filename).parent.mkdir(parents=True, exist_ok=True)
with open(dill_filename, 'wb') as file:
    dill.dump(model, file)

# Save feature dictionary for Streamlit app: the column names of each feature
# group plus, for categorical columns, the distinct values observed in the data
feature_dict = {
    'CATEGORICAL': {
        'Column Name': categorical_features,
        # dropna(): Series.unique() would otherwise report NaN as a category
        # for any column that contains missing values
        'Members': [df[col].dropna().unique().tolist() for col in categorical_features]
    },
    'NUMERICAL': {
        'Column Name': numerical_features
    }
}
feature_dict_filename = '../model/employee_feature_dict.pkl'
# joblib.dump does not create missing directories, so make sure the target exists
Path(feature_dict_filename).parent.mkdir(parents=True, exist_ok=True)
dump(feature_dict, feature_dict_filename)


['../model/employee_feature_dict.pkl']