In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the feature table and the target labels from CSV.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them. `X` is re-read from
# ../X_job_categories.csv in a later cell, so only `y` from this cell is
# ultimately used for modelling.
features_csv = r"C:\Users\ngche\Downloads\X.csv"
labels_csv = r"C:\Users\ngche\Downloads\y.csv"

X = pd.read_csv(features_csv)
y = pd.read_csv(labels_csv)


In [4]:
# Number of distinct values in the `job` column (a missing value, if present,
# counts as one distinct value).
X['job'].nunique(dropna=False)

497

In [5]:
# Replace the feature table with the CSV that is expected to carry the
# `job_categories` column used by the encoding step below.
# NOTE(review): presumably a coarser grouping of the raw `job` values - confirm.
job_categories_csv = r"../X_job_categories.csv"
X = pd.read_csv(job_categories_csv)


In [6]:
# One-hot encode the categorical columns, then discard the raw `job` column,
# which is left un-encoded by get_dummies because it is not in the list.
categorical_columns = ['category', 'gender', 'state', 'day_of_week', 'job_categories']

X_encoded = (
    pd.get_dummies(X, columns=categorical_columns)
    .drop(columns='job')
)


In [7]:
# Number of feature columns after one-hot encoding.
X_encoded.shape[1]

90

Dimensionality is fairly high (90 columns after one-hot encoding), so PCA may be worth trying.

In [8]:
# Flatten the labels to a 1-D array. The branch keeps the cell re-runnable:
# on the first run `y` still has `.values` (as loaded by pandas), on later
# runs it is already an array and is only ravelled.
if hasattr(y, "values"):
    y = y.values.ravel()
else:
    y = y.ravel()

# 70/30 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

## MLP Model without scaling

In [25]:
# Baseline MLP trained on the raw (unscaled) one-hot encoded features.
# NOTE(review): `learning_rate` and `validation_fraction` may have no effect
# with solver='adam' and early stopping off - confirm against the sklearn docs.
mlp_params = dict(
    hidden_layer_sizes=(10, 10),
    activation='relu',
    solver='adam',
    alpha=0.01,
    learning_rate='invscaling',
    learning_rate_init=0.001,
    max_iter=50,
    validation_fraction=0.1,
    random_state=2,
    verbose=0,
)

mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train, y_train)



In [26]:
# Score the unscaled baseline on the held-out split.
y_pred = mlp.predict(X_test)
y_pred_probs = mlp.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)

# The three label-based scores share one call shape; unpack them in order.
f1, recall, precision = (
    scorer(y_test, y_pred, average='binary')
    for scorer in (f1_score, recall_score, precision_score)
)

# ROC AUC is computed from column 1 of the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

metrics_base = [accuracy, f1, recall, precision, roc_auc]
print(metrics_base)

[0.9963902619849241, 0.5968649517684889, 0.5129533678756477, 0.7135992311388756, 0.9335806400682897]


## MLP Model with scaled data only

In [9]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so the test split does not influence it.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [21]:
# Same MLP configuration as the baseline, trained on standardised features.
# NOTE(review): `learning_rate` and `validation_fraction` may have no effect
# with solver='adam' and early stopping off - confirm against the sklearn docs.
mlp_params = dict(
    hidden_layer_sizes=(10, 10),
    activation='relu',
    solver='adam',
    alpha=0.01,
    learning_rate='invscaling',
    learning_rate_init=0.001,
    max_iter=50,
    validation_fraction=0.1,
    random_state=2,
    verbose=0,
)

mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train_scaled, y_train)

In [22]:
# Score the scaled-features model on the (scaled) held-out split.
y_pred = mlp.predict(X_test_scaled)
y_pred_probs = mlp.predict_proba(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

# The three label-based scores share one call shape; unpack them in order.
f1, recall, precision = (
    scorer(y_test, y_pred, average='binary')
    for scorer in (f1_score, recall_score, precision_score)
)

# ROC AUC is computed from column 1 of the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

metrics_base_scaled = [accuracy, f1, recall, precision, roc_auc]
print(metrics_base_scaled)

[0.9982545135221218, 0.8074632790789996, 0.7025906735751295, 0.9491367242183855, 0.9921181009939932]


## MLP Model with PCA

In [11]:
# Fit a full PCA on the scaled training data and find the smallest number of
# leading components whose cumulative explained-variance ratio reaches 0.95.
pca = PCA()
pca.fit(X_train_scaled)

cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
reaches_target = cumulative_explained_variance >= 0.95

# argmax on a boolean array returns the index of the first True; +1 turns the
# zero-based index into a component count.
n_components = reaches_target.argmax() + 1
n_components

75

In [12]:
# Refit PCA with the chosen number of components on the scaled training data
# and project both splits into that space. `fit` is the fitted estimator
# returned by `.fit()`, i.e. the same object as `pca`.
pca = PCA(n_components=n_components)
fit = pca.fit(X_train_scaled)

X_train_pca, X_test_pca = (
    pca.transform(split) for split in (X_train_scaled, X_test_scaled)
)

explained_variance_ratio = pca.explained_variance_ratio_
print(f'Explained variance ratio: {explained_variance_ratio}')

Explained variance ratio: [0.03391347 0.03350537 0.02351505 0.02044315 0.0198322  0.01539367
 0.01525541 0.01471527 0.01416308 0.01378343 0.01375313 0.01357329
 0.01347054 0.01326876 0.01305605 0.01298164 0.01278021 0.01273504
 0.0125386  0.01238232 0.01225535 0.01218248 0.01211345 0.01209055
 0.01198144 0.01196145 0.01194055 0.01189932 0.01183469 0.0117498
 0.01171798 0.01163366 0.01158282 0.01153059 0.01149222 0.01145279
 0.01143401 0.01142792 0.01140731 0.01138212 0.01136037 0.01134771
 0.01134411 0.01134283 0.01132966 0.01132547 0.01132477 0.01131518
 0.01130618 0.01129696 0.01129544 0.01128725 0.01127107 0.01125822
 0.01125026 0.01123397 0.01122672 0.01121908 0.01121538 0.01120204
 0.01119031 0.01118574 0.0111833  0.01116905 0.01114789 0.01113856
 0.01112116 0.01110713 0.01099037 0.010343   0.01009653 0.00995149
 0.00962127 0.00935907 0.00919291]


In [17]:
# Same MLP configuration as the baseline, trained on the PCA-projected data.
# NOTE(review): `learning_rate` and `validation_fraction` may have no effect
# with solver='adam' and early stopping off - confirm against the sklearn docs.
mlp_params = dict(
    hidden_layer_sizes=(10, 10),
    activation='relu',
    solver='adam',
    alpha=0.01,
    learning_rate='invscaling',
    learning_rate_init=0.001,
    max_iter=50,
    validation_fraction=0.1,
    random_state=2,
    verbose=0,
)

mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train_pca, y_train)

In [20]:
# Score the PCA-features model on the (projected) held-out split.
y_pred = mlp.predict(X_test_pca)
y_pred_probs = mlp.predict_proba(X_test_pca)

accuracy = accuracy_score(y_test, y_pred)

# The three label-based scores share one call shape; unpack them in order.
f1, recall, precision = (
    scorer(y_test, y_pred, average='binary')
    for scorer in (f1_score, recall_score, precision_score)
)

# ROC AUC is computed from column 1 of the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

metrics_pca = [accuracy, f1, recall, precision, roc_auc]
print(metrics_pca)

[0.9975977067546727, 0.7330533893221355, 0.633160621761658, 0.8703703703703703, 0.9898225820548672]


In [24]:
# Exploratory 5-component projection of the scaled data.
#
# This uses its own names (`pca_5`, `X_train_pca_5`, `X_test_pca_5`,
# `explained_variance_ratio_5`) instead of reassigning `pca`, `X_train_pca`
# and `X_test_pca`. Those names hold the projection the PCA-based MLP was
# trained and scored on, and the SVM cell further down reads `X_train_pca`;
# overwriting them here would silently change the SVM's input on a
# top-to-bottom run.
#
# random_state is fixed so the printed ratios are reproducible.
# NOTE(review): PCA may select a randomized solver for a small n_components,
# which would explain the last-digit differences between runs - confirm
# against the sklearn PCA docs.
pca_5 = PCA(n_components=5, random_state=42)
pca_5.fit(X_train_scaled)

X_train_pca_5 = pca_5.transform(X_train_scaled)
X_test_pca_5 = pca_5.transform(X_test_scaled)

explained_variance_ratio_5 = pca_5.explained_variance_ratio_
print(f'Explained variance ratio: {explained_variance_ratio_5}')

Explained variance ratio: [0.03391342 0.0335053  0.02351018 0.02042383 0.01977846]


## Results

In [27]:
# Collect the three evaluation runs into one comparison table:
# one row per model variant, one column per metric (same order as the lists).
metric_names = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
row_names = ['MLP Base Model','MLP Base Model with Scaling', 'MLP PCA model']
all_metrics = [metrics_base, metrics_base_scaled, metrics_pca]

df_metrics = pd.DataFrame(all_metrics, index=row_names, columns=metric_names)

df_metrics

Unnamed: 0,accuracy,f1,recall,precision,roc_auc
MLP Base Model,0.99639,0.596865,0.512953,0.713599,0.933581
MLP Base Model with Scaling,0.998255,0.807463,0.702591,0.949137,0.992118
MLP PCA model,0.997598,0.733053,0.633161,0.87037,0.989823


## Hyperparameter tuning based on results:

In [None]:
# Hyperparameter search for the MLP.
#
# * Scaling lives inside the pipeline: the results table shows the scaled
#   model is clearly the strongest variant, and fitting the scaler per CV fold
#   keeps validation folds out of the scaling statistics. The pipeline takes
#   the raw `X_train` / `X_test`, so `best_mlp.predict(X_test)` needs no
#   separate scaling step.
# * Candidates are ranked by F1 rather than the default accuracy: accuracy is
#   ~0.996 even for the weakest model above while F1 ranges from 0.60 to 0.81,
#   so accuracy barely separates candidates on this target.
# * `solver` and `random_state` are fixed on the estimator instead of being
#   single-value grid axes.
# * `learning_rate` and `validation_fraction` are no longer searched. Per the
#   sklearn MLPClassifier docs the former applies only to solver='sgd' and the
#   latter only when early_stopping=True, so varying them repeated identical
#   fits. The grid drops from 288 to 72 candidates.
mlp = MLPClassifier(solver='adam', random_state=1)

tuning_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', mlp),
])

# Keys are prefixed with the pipeline step name ("mlp__").
param_grid = {
    'mlp__hidden_layer_sizes': [(10, 10), (20, 20)],
    'mlp__activation': ['logistic', 'tanh', 'relu'],
    'mlp__alpha': [0.0001, 0.01, 0.1],
    'mlp__max_iter': [50, 1000],
    'mlp__learning_rate_init': [0.001, 0.1],
}

grid_search = GridSearchCV(tuning_pipeline, param_grid, scoring='f1', cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

# Best pipeline (scaler + MLP), refitted on the full training split.
best_mlp = grid_search.best_estimator_

In [None]:
# List every parameter of the selected estimator, one "name: value" per line.
best_params = best_mlp.get_params()

for name in best_params:
    print(f'{name}: {best_params[name]}')

In [None]:
# Score the tuned estimator on the held-out split.
y_pred = best_mlp.predict(X_test)
y_pred_probs = best_mlp.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)

# The three label-based scores share one call shape; unpack them in order.
f1, recall, precision = (
    scorer(y_test, y_pred, average='binary')
    for scorer in (f1_score, recall_score, precision_score)
)

# ROC AUC is computed from column 1 of the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

metrics_after_tuning = [accuracy, f1, recall, precision, roc_auc]
print(metrics_after_tuning)

## SVM 

In [13]:
# Sweep the polynomial-kernel SVM over several gamma values on the
# PCA-projected data and report accuracy on both splits, so over-fitting
# (high training accuracy, lower test accuracy) is visible per gamma.
gamma = [1, 0.1, 0.01, 0.001]
for g in gamma:
    # NOTE(review): probability=True is not needed for `.score()`; it is kept
    # in case `svc.predict_proba` is used after this loop - confirm, and drop
    # it if not, since it may add noticeable fitting cost.
    svc = SVC(kernel='poly', C=10, gamma=g, probability=True, random_state=27)

    # Fit the model on the training data
    svc.fit(X_train_pca, y_train)

    # Accuracy on the data the model was fitted on
    train_accuracy = svc.score(X_train_pca, y_train)

    # Accuracy on the held-out test data
    test_accuracy = svc.score(X_test_pca, y_test)

    print("[Gamma: {}], Training Accuracy: {}, Test Accuracy: {}".format(g, train_accuracy, test_accuracy))