# Job Change Prediction
**ML Classification - Best Model Selection**

Task: Predict whether a person is willing to change jobs.

Metric: Balanced Accuracy

## 1. Setup & Data Loading

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer
from xgboost import XGBClassifier
import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so any convergence or deprecation warnings the libraries
# might emit will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test splits from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('job_change_train.csv', 'job_change_test.csv')
)

# Report the size of each split, then the class counts of the target column.
print(f'Training samples: {train.shape[0]}')
print(f'Test samples: {test.shape[0]}')
print('\nTarget distribution:')
print(train['willing_to_change_job'].value_counts())

Training samples: 12427
Test samples: 3308

Target distribution:
willing_to_change_job
No     9340
Yes    3087
Name: count, dtype: int64


## 2. Data Preprocessing

In [3]:
def preprocess(df):
    """Return a copy of ``df`` with the two ordinal text columns encoded as integers.

    ``years_since_job_change``: 'never_changed' -> 0, '1'..'4' -> 1..4,
    '>4' -> 5, 'unknown' -> -1.
    ``years_of_experience``: '<1' -> 0, '>20' -> 21, 'unknown' -> -1; any other
    label is parsed with ``int``.

    Values that are already numeric are passed through unchanged, so calling
    the function on its own output (for example when the notebook cell is
    re-run) keeps the data intact instead of turning
    ``years_since_job_change`` into NaN. Missing values are treated like
    'unknown' and encoded as -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain both columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.

    Raises
    ------
    ValueError
        If either column holds a text label that is not recognised.
    """
    df = df.copy()

    # Convert ordinal string features to numeric
    ysc_map = {'never_changed': 0, '1': 1, '2': 2, '3': 3, '4': 4, '>4': 5, 'unknown': -1}
    exp_map = {'<1': 0, '>20': 21, 'unknown': -1}

    def parse_ysc(x):
        if isinstance(x, str):
            # Fail loudly rather than letting an unexpected label become NaN.
            if x not in ysc_map:
                raise ValueError(f'Unrecognised years_since_job_change value: {x!r}')
            return ysc_map[x]
        # Non-text values: missing -> 'unknown' sentinel, numbers are already encoded.
        return -1 if pd.isna(x) else int(x)

    def parse_exp(x):
        if isinstance(x, str):
            # int() raises ValueError for text outside the known label set.
            return exp_map[x] if x in exp_map else int(x)
        # Non-text values: missing -> 'unknown' sentinel, numbers are already encoded.
        return -1 if pd.isna(x) else int(x)

    df['years_since_job_change'] = df['years_since_job_change'].apply(parse_ysc)
    df['years_of_experience'] = df['years_of_experience'].apply(parse_exp)

    return df

# Encode both splits with the same function; each name is rebound to the
# encoded copy that preprocess returns.
train = preprocess(df=train)
test = preprocess(df=test)

In [4]:
# Predictors: every training column except the row id and the label.
X = train.drop(columns=['id', 'willing_to_change_job'])
# Binary target: 1 where the label is 'Yes', 0 otherwise.
y = (train['willing_to_change_job'] == 'Yes').astype(int)
X_test = test.drop(columns=['id'])

# Columns passed through StandardScaler.
numeric_features = [
    'age',
    'relative_wage',
    'years_since_job_change',
    'years_of_experience',
    'hours_of_training',
    'is_certified',
]
# Columns that are one-hot encoded.
categorical_features = [
    'gender',
    'education',
    'field_of_studies',
    'is_studying',
    'county',
    'size_of_company',
    'type_of_company',
]

# handle_unknown='ignore' lets categories unseen during fit encode as all zeros
# instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ]
)

# Evaluation protocol reused by the model comparison and the grid search:
# seeded, shuffled, stratified 5-fold splits scored with balanced accuracy.
cv = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)
scorer = make_scorer(balanced_accuracy_score)

## 3. Model Comparison (5 Algorithms)

In [5]:
# Candidate classifiers, keyed by the label shown in the report.
models = {
    'Logistic Regression': LogisticRegression(
        class_weight='balanced', max_iter=1000, random_state=42
    ),
    'KNN': KNeighborsClassifier(n_neighbors=31),
    'SVM (RBF)': SVC(class_weight='balanced', kernel='rbf', random_state=42),
    'Random Forest': RandomForestClassifier(
        class_weight='balanced', max_depth=10, n_estimators=100, random_state=42
    ),
    'XGBoost': XGBClassifier(
        learning_rate=0.05, max_depth=5, n_estimators=100,
        scale_pos_weight=3, random_state=42, verbosity=0
    ),
}

rule = '=' * 50
print('Model Comparison (5-Fold CV, Balanced Accuracy)')
print(rule)

# Mean cross-validated balanced accuracy per model name.
results = {}
for name, model in models.items():
    # Preprocessing sits inside the pipeline, so it is fitted on each
    # training fold only.
    pipe = Pipeline(steps=[('prep', preprocessor), ('clf', model)])
    scores = cross_val_score(pipe, X, y, scoring=scorer, cv=cv, n_jobs=-1)
    mean_score, spread = scores.mean(), scores.std()
    results[name] = mean_score
    print(f'{name:25} | {mean_score:.4f} (+/- {spread:.4f})')

# Name of the model with the highest mean score.
best_model_name = max(results, key=results.get)
print(rule)
print(f'\nBest Model: {best_model_name} ({results[best_model_name]:.4f})')

Model Comparison (5-Fold CV, Balanced Accuracy)
Logistic Regression       | 0.7610 (+/- 0.0085)
KNN                       | 0.6933 (+/- 0.0127)
SVM (RBF)                 | 0.7662 (+/- 0.0108)
Random Forest             | 0.7631 (+/- 0.0119)
XGBoost                   | 0.7685 (+/- 0.0105)

Best Model: XGBoost (0.7685)


## 4. Hyperparameter Tuning (Best Model: XGBoost)

In [6]:
# Pipeline under tuning: shared preprocessing followed by an XGBoost classifier.
xgb_pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', XGBClassifier(random_state=42, verbosity=0)),
])

# The 'clf__' prefix routes each parameter to the pipeline's 'clf' step.
# 3 * 4 * 3 * 3 * 2 * 2 = 432 candidate settings.
param_grid = dict(
    clf__n_estimators=[100, 150, 200],
    clf__max_depth=[3, 4, 5, 6],
    clf__learning_rate=[0.01, 0.05, 0.1],
    clf__scale_pos_weight=[2.5, 3, 3.5],
    clf__subsample=[0.8, 1.0],
    clf__colsample_bytree=[0.8, 1.0],
)

# Exhaustive search using the same folds and scorer as the model comparison.
grid_search = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

print(f'\nBest Balanced Accuracy: {grid_search.best_score_:.4f}')
print(f'Best Parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 432 candidates, totalling 2160 fits

Best Balanced Accuracy: 0.7719
Best Parameters: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.05, 'clf__max_depth': 5, 'clf__n_estimators': 100, 'clf__scale_pos_weight': 3, 'clf__subsample': 0.8}


## 5. Final Model & Predictions

In [7]:
# best_estimator_ is the pipeline GridSearchCV refits on all of X, y with the
# best parameters (refit=True is the GridSearchCV default).
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# NOTE(review): the raw training labels are 'Yes'/'No' but this file holds the
# 0/1 encoding used for y; confirm which format the consumer expects.
submission = pd.DataFrame(
    {
        'id': test['id'],
        'willing_to_change_job': predictions,
    }
)

submission.to_csv('predictions.csv', index=False)

print('Predictions saved to predictions.csv')
# Predictions are 0/1, so their sum is the number of predicted positives.
n_positive = sum(predictions)
n_negative = len(predictions) - n_positive
print(f'Distribution: 1={n_positive}, 0={n_negative}')

Predictions saved to predictions.csv
Distribution: 1=1072, 0=2236


## 6. Summary

| Model | Balanced Accuracy (5-fold CV) |
|-------|------------------|
| **XGBoost** (tuned; 0.7685 before tuning) | **0.7719** |
| SVM (RBF) | 0.7662 |
| Logistic Regression | 0.7610 |
| Random Forest | 0.7631 |
| KNN | 0.6933 |

**Selected Model:** XGBoost

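**Expected Balanced Accuracy on Test:** about 0.77. This is the 5-fold cross-validation score of the tuned model (0.7719); because the same score was used to choose the hyperparameters, it is a slightly optimistic estimate of test performance.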