In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

# Load datasets
train_data = pd.read_csv(r"E:\FABIZ\MASTER II\digital transformation\job_change_train.csv")
test_data = pd.read_csv(r"E:\FABIZ\MASTER II\digital transformation\job_change_test.csv")

# Preprocessing

def preprocess_data(train, test):
    # Map target variable
    train['willing_to_change_job'] = train['willing_to_change_job'].map({'Yes': 1, 'No': 0})

    # Simplify irregular years column processing
    def convert_years(value):
        if isinstance(value, str) and ('>' in value or '<' in value or value in ['never_changed', 'unknown']):
            return -1
        return int(value)

    train['years_since_job_change'] = train['years_since_job_change'].apply(convert_years)
    test['years_since_job_change'] = test['years_since_job_change'].apply(convert_years)

    train['years_of_experience'] = train['years_of_experience'].apply(convert_years)
    test['years_of_experience'] = test['years_of_experience'].apply(convert_years)

    # Encode categorical variables
    categorical_columns = ['gender', 'education', 'field_of_studies', 'is_studying',
                           'county', 'size_of_company', 'type_of_company']
    encoder = LabelEncoder()
    for col in categorical_columns:
        train[col] = encoder.fit_transform(train[col])
        test[col] = encoder.transform(test[col])

    return train, test

train, test = preprocess_data(train_data, test_data)

# Split features and target
X = train.drop(columns=['willing_to_change_job', 'id'])
y = train['willing_to_change_job']

# Apply SMOTE to address class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split resampled data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Function to get macro avg accuracy
from sklearn.metrics import make_scorer
macro_avg_scorer = make_scorer(balanced_accuracy_score)

# 1. Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear']
}
log_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
grid_search_lr = GridSearchCV(log_reg, param_grid_lr, scoring=macro_avg_scorer, cv=5, n_jobs=-1)
grid_search_lr.fit(X_train, y_train)
best_log_reg = grid_search_lr.best_estimator_
cross_val_scores_lr = cross_val_score(best_log_reg, X_train, y_train, cv=5, scoring=macro_avg_scorer)
print("Logistic Regression Macro Avg Accuracy:", cross_val_scores_lr.mean())

# 2. Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, scoring=macro_avg_scorer, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_
cross_val_scores_rf = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring=macro_avg_scorer)
print("Random Forest Macro Avg Accuracy:", cross_val_scores_rf.mean())

# 3. AdaBoost
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}
ada_model = AdaBoostClassifier(random_state=42)
grid_search_ada = GridSearchCV(ada_model, param_grid_ada, scoring=macro_avg_scorer, cv=5, n_jobs=-1)
grid_search_ada.fit(X_train, y_train)
best_ada_model = grid_search_ada.best_estimator_
cross_val_scores_ada = cross_val_score(best_ada_model, X_train, y_train, cv=5, scoring=macro_avg_scorer)
print("AdaBoost Macro Avg Accuracy:", cross_val_scores_ada.mean())

# 4. Support Vector Machine (SVM)
param_grid_svm = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}
svm_model = SVC(class_weight='balanced', random_state=42)
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, scoring=macro_avg_scorer, cv=5, n_jobs=-1)
grid_search_svm.fit(X_train_scaled, y_train)
best_svm_model = grid_search_svm.best_estimator_
cross_val_scores_svm = cross_val_score(best_svm_model, X_train_scaled, y_train, cv=5, scoring=macro_avg_scorer)
print("SVM Macro Avg Accuracy:", cross_val_scores_svm.mean())

# 5. K-Nearest Neighbors (KNN)
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance']
}
knn_model = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn_model, param_grid_knn, scoring=macro_avg_scorer, cv=5, n_jobs=-1)
grid_search_knn.fit(X_train_scaled, y_train)
best_knn_model = grid_search_knn.best_estimator_
cross_val_scores_knn = cross_val_score(best_knn_model, X_train_scaled, y_train, cv=5, scoring=macro_avg_scorer)
print("KNN Macro Avg Accuracy:", cross_val_scores_knn.mean())

# 6. Elastic Net (SGDClassifier)
param_grid_enet = {
    'alpha': [0.0001, 0.001, 0.01],
    'l1_ratio': [0.1, 0.5, 0.9]
}
elastic_net_model = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    class_weight='balanced',
    random_state=42,
    max_iter=1000
)
grid_search_enet = GridSearchCV(elastic_net_model, param_grid_enet, scoring=macro_avg_scorer, cv=5, n_jobs=-1)
grid_search_enet.fit(X_train, y_train)
best_enet_model = grid_search_enet.best_estimator_
cross_val_scores_enet = cross_val_score(best_enet_model, X_train, y_train, cv=5, scoring=macro_avg_scorer)
print("Elastic Net Macro Avg Accuracy:", cross_val_scores_enet.mean())

Logistic Regression Macro Avg Accuracy: 0.7502656333248267
Random Forest Macro Avg Accuracy: 0.8443518108106897
AdaBoost Macro Avg Accuracy: 0.8079484045434805
SVM Macro Avg Accuracy: 0.8012572027239393
KNN Macro Avg Accuracy: 0.7976442223744478
Elastic Net Macro Avg Accuracy: 0.7154305964101668
