In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from imblearn.combine import SMOTEENN

In [8]:
# data_src_path = './data/train.csv'
data_src_path = '../../data/original/train.csv'

data = pd.read_csv(data_src_path)
X = data.iloc[:, 2:].values
y = data['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.995, random_state=42)

### Opt-1. set svc model class_weight='balanced'

In [10]:
# Ways to deal with imbalanced data
# Opt-1. set svc model class_weight='balanced'
svc = svm.SVC(
    C=1.0, 
    kernel='rbf', 
    degree=3, 
    gamma='scale', 
    class_weight='balanced'
)
svc.fit(X_train, y_train)

# predict
y_pred = svc.predict(X_test)

# evaluate
def evaluate(y_true, y_pred):
    auc = roc_auc_score(y_true, y_pred)
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    return auc, fpr, tpr, acc

evaluate(y_test, y_pred)

(0.5116142297363115,
 array([0.        , 0.42305866, 1.        ]),
 array([0.        , 0.44628712, 1.        ]),
 0.5721789962109699)

### Opt-2. use SMOTE-ENN to generate synthetic samples

In [15]:
# resample data
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# train
svc = svm.SVC(
    C=1.0, 
    kernel='rbf', 
    degree=3, 
    gamma='scale'
)
svc.fit(X_resampled, y_resampled)

# predict
y_pred = svc.predict(X_test)

# evaluate
evaluate(y_test, y_pred)

(0.4976934891476839,
 array([0.        , 0.63128648, 1.        ]),
 array([0.        , 0.62667346, 1.        ]),
 0.37811615639711194)

In [16]:
from imblearn.over_sampling import SMOTE

# resample using smote
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# train
svc = svm.SVC(
    C=1.0, 
    kernel='rbf', 
    degree=3, 
    gamma='scale'
)
svc.fit(X_resampled, y_resampled)

# predict
y_pred = svc.predict(X_test)

# evaluate
evaluate(y_test, y_pred)

(0.5054119602227722,
 array([0.        , 0.41387613, 1.        ]),
 array([0.        , 0.42470005, 1.        ]),
 0.5802399719030927)