In [1]:
''' import packages '''

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw training table from disk into a DataFrame.
HR = pd.read_csv(filepath_or_buffer='data/aug_train.csv')

In [3]:
# Replace missing values with the sentinel 'MISSING' in text (object) columns
# only. Filling the whole frame would turn any numeric column that contains a
# NaN into a mixed str/float object column, and it would then be routed to the
# categorical branch downstream.
# NOTE(review): NaNs in non-object columns are left untouched here; if any
# exist they need their own imputation step — confirm against the data.
HR = HR.fillna({col: 'MISSING' for col in HR.select_dtypes(include=['object']).columns})

In [4]:
# Pull out the label column, then keep every remaining column as a predictor.
y = HR['target']
X = HR.drop(columns=['target'])

In [5]:
# Hold out 20% of the rows for evaluation; stratify on the label so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
# Partition the predictor columns by dtype: object columns are handled as
# categorical, every other column as numeric.
# NOTE(review): every non-object column is scaled and fed to the classifier;
# if X still carries an identifier-like column it would be used as a
# feature — confirm against the data.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

numeric_features = X.select_dtypes(exclude=['object']).columns.tolist()

# Numeric branch: standardize each column.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full estimator: preprocessing followed by logistic regression.
# max_iter is raised above the library default of 100 so the solver has room
# to converge on the one-hot-expanded feature matrix; an unconverged fit
# would otherwise leave the coefficients dependent on where the solver stopped.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

In [14]:
# Fit the preprocessing steps and the classifier on the training split only.
model.fit(X=X_train, y=y_train)

In [15]:
# Predict class labels for the held-out rows with the fitted pipeline.
y_pred = model.predict(X=X_test)

In [16]:
# Report held-out performance: overall accuracy first, then the per-class
# precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7826200417536534
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.91      0.86      2877
         1.0       0.60      0.39      0.47       955

    accuracy                           0.78      3832
   macro avg       0.71      0.65      0.67      3832
weighted avg       0.76      0.78      0.77      3832

