## Baseline Model Training

This file is to develop a basic model for establishing a baseline classification performance.

##### 0. Imports

In [1]:
import torch
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

##### 1. Load data

In [2]:
data = torch.load('data.pt')
X = data['features']
y = data['labels']

##### 2. Split between train and test

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [4]:
print('X_train:', X_train.shape)
print('y_train', y_test.shape)
print()
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: torch.Size([629, 60660])
y_train torch.Size([310])

X_test: torch.Size([310, 60660])
y_test: torch.Size([310])


##### 3. Preprocess

Scale

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Feature selection

In [6]:
k = 1000
selector = SelectKBest(k=k, score_func=mutual_info_classif)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

##### 4. Fit the model

In [16]:
Cs = [0.01, 0.1, 1, 10]
penalties = ['l1', 'l2']

models = []

for C in Cs:
    for penalty in penalties:
        print(f'Model: C={C} penalty={penalty}')
        #
        # Define the model
        #
        model = LogisticRegression(solver='saga', 
                                   max_iter=10000,
                                   class_weight='balanced',
                                   C=C,
                                   penalty=penalty)
        #
        # Fit
        #
        model.fit(X_train, y_train)
        #
        # Save
        #
        models.append(model)

Model: C=0.01 penalty=l1
Model: C=0.01 penalty=l2
Model: C=0.1 penalty=l1
Model: C=0.1 penalty=l2
Model: C=1 penalty=l1
Model: C=1 penalty=l2
Model: C=10 penalty=l1
Model: C=10 penalty=l2


##### 5. Evaluate the model

In [17]:
y_train_preds, y_test_preds = [], []
for model in models:
    y_train_preds.append(model.predict(X_train))
    y_test_preds.append(model.predict(X_test))

In [18]:
for i, (model, y_train_pred, y_test_pred) in enumerate(zip(models, y_train_preds, y_test_preds)):
    params = model.get_params()
    print()
    print(f"--> C={params['C']} penalty={params['penalty']}")
    print('Train:')
    print(classification_report(y_train, y_train_pred))
    print(confusion_matrix(y_train, y_train_pred))
    print()
    print("ROC AUC:", roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]))
    print()
    print()
    print('Test:')
    print(classification_report(y_test, y_test_pred))
    print(confusion_matrix(y_test, y_test_pred))
    print()
    print("ROC AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
    print()


--> C=0.01 penalty=l1
Train:
              precision    recall  f1-score   support

         0.0       0.87      0.89      0.88       415
         1.0       0.78      0.73      0.76       214

    accuracy                           0.84       629
   macro avg       0.82      0.81      0.82       629
weighted avg       0.84      0.84      0.84       629

[[371  44]
 [ 57 157]]

ROC AUC: 0.9035131178921293


Test:
              precision    recall  f1-score   support

         0.0       0.91      0.83      0.87       211
         1.0       0.70      0.83      0.76        99

    accuracy                           0.83       310
   macro avg       0.81      0.83      0.82       310
weighted avg       0.84      0.83      0.84       310

[[176  35]
 [ 17  82]]

ROC AUC: 0.921585523481258


--> C=0.01 penalty=l2
Train:
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       415
         1.0       0.91      0.94      0.92       214

    accura

Evaluation results:
| C | penalty | precision | recall | f1-score | ROC |
|---|---------|-----------|--------|----------|-----|
| 0.01 | l1 | 0.70 | 0.83 | 0.76 | 0.921 |
| 0.01 | l2 | 0.75 | 0.86 | 0.80 | 0.926 |
| 0.1 | l1 | 0.74 | 0.86 | 0.79 | 0.929 |
| 0.1 | l2 | 0.76 | 0.86 | 0.81 | 0.916 |
| 1 | l1 | 0.73 | 0.78 | 0.75 | 0.910 |
| 1 | l2 | 0.73 | 0.86 | 0.79 | 0.907 |
| 10 | l1 | 0.73 | 0.84 | 0.78 | 0.904 |
| 10 | l2 | 0.74 | 0.86 | 0.79 | 0.904 |

In [20]:
best_model = [model for model in models if model.get_params()['C'] == 0.1 and model.get_params()['penalty'] == 'l1'][0]
best_model.get_params()

{'C': 0.1,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 10000,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l1',
 'random_state': None,
 'solver': 'saga',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

##### 5. Save the model

In [21]:
# Save the mapping
import pickle
with open('baseline_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)