# Initial SVC Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

In [44]:
df = pd.read_csv("./creditcard.csv")
df.shape

(284807, 31)

In [45]:
df.Class.value_counts().max() / df.Class.value_counts().sum()

0.9982725143693799

In [46]:
df = df.dropna()

In [47]:
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [48]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
svm = SVC(kernel='linear', random_state=0)
svm.fit(X_train, y_train)

In [14]:
y_pred = svm.predict(X_test)

In [19]:
print("Accuracy (roc_auc_score):", roc_auc_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8927076633973792

Confusion Matrix:
 [[56847    17]
 [   21    77]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.82      0.79      0.80        98

    accuracy                           1.00     56962
   macro avg       0.91      0.89      0.90     56962
weighted avg       1.00      1.00      1.00     56962



# Random Undersampling + Anomaly Detection

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

df = pd.read_csv("./creditcard.csv")

X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
def train_evaluate(undersampling_ratio, X_train, y_train, X_test, y_test):
    # random undersampling
    rus = RandomUnderSampler(sampling_strategy=undersampling_ratio, random_state=0)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    
    # Filter out non-fradulent data from the resampled training set
    X_train_normal = X_train_resampled[y_train_resampled == 0]

    # Train a One-Class SVM model on the non-fradulent data
    one_class_svm = OneClassSVM(kernel='rbf', nu=0.01, gamma='scale')
    one_class_svm.fit(X_train_normal)
    y_pred = one_class_svm.predict(X_test)

    # # Convert the predicted labels to match the original labels (0 for normal, 1 for anomaly/fraud)
    y_pred = np.where(y_pred == 1, 0, 1)

    print(f"Undersampling Ratio: {undersampling_ratio * 100}%")
    print("Accuracy (roc_auc_score):", roc_auc_score(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\n")

In [8]:
# Ratio Selection
print(f"Percentage of non-fradulent records: majority class:{(df.Class.value_counts().max() / df.Class.value_counts().sum()).round(3)}")
print(f"Percentage of fraud records (minority class):{(df.Class.value_counts().min() / df.Class.value_counts().sum()).round(3)}")

Percentage of non-fradulent records: majority class:0.998
Percentage of fraud records (minority class):0.002


In [22]:
undersampling_ratios = [0.1]

for ratio in undersampling_ratios:
    train_evaluate(ratio, X_train, y_train, X_test, y_test)

Undersampling Ratio: 10.0%
Accuracy (roc_auc_score): 0.9240978116063533

Confusion Matrix:
 [[53454  3410]
 [    9    89]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97     56864
           1       0.03      0.91      0.05        98

    accuracy                           0.94     56962
   macro avg       0.51      0.92      0.51     56962
weighted avg       1.00      0.94      0.97     56962





# Week 2 Improvements

# Undersampling Improvements

In [2]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss,TomekLinks, EditedNearestNeighbours, CondensedNearestNeighbour, OneSidedSelection
from collections import Counter

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

df = pd.read_csv("./creditcard.csv")

X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [41]:
def undersampling_distribution(name, X_train, y_train):
    undersampling_methods = {
        "RandomUnderSampler": RandomUnderSampler(sampling_strategy=0.1, random_state=0),
        "NearMiss": NearMiss(version=1),
        "TomekLinks": TomekLinks(sampling_strategy='majority'),
        "EditedNearestNeighbours": EditedNearestNeighbours(sampling_strategy='majority'),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(random_state=0),
        "OneSidedSelection": OneSidedSelection(random_state=0)
    }
    method = undersampling_methods[name]
    X_resampled, y_resampled = method.fit_resample(X_train, y_train)
    
    # Print the number of instances in each class for the resampled data
    print(f"Method: {name}")
    print("Class distribution:", Counter(y_resampled))
    print("\n")
    return X_resampled, y_resampled

In [39]:
print("Original class distribution:", Counter(y_train))

Original class distribution: Counter({0: 227454, 1: 391})


In [42]:
randomX,randomY = undersampling_distribution("RandomUnderSampler",X_train, y_train)

Method: RandomUnderSampler
Class distribution: Counter({0: 3910, 1: 391})




In [None]:
nearmissX, nearmissY = undersampling_distribution("NearMiss",X_train, y_train)

In [None]:
tomekX, tomekY = undersampling_distribution("TomekLinks",X_train, y_train)

In [None]:
ennX, ennY = undersampling_distribution("EditedNearestNeighbours",X_train, y_train)



In [None]:
cnnX, cnnY = undersampling_distribution("CondensedNearestNeighbour",X_train, y_train)

In [None]:
ossX, ossY = undersampling_distribution("OneSidedSelection",X_train, y_train)



In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import ParameterGrid

def custom_scorer(y, y_pred):
    # Convert the predicted labels to match the original labels (0 for normal, 1 for anomaly/fraud)
    y_pred = np.where(y_pred == 1, 0, 1)
    return roc_auc_score(y, y_pred)

best_score = 0
best_params = None

X_train_resampled, y_train_resampled = undersampling_distribution("RandomUnderSampler",X_train, y_train)

# Filter out non-fraudulent data from the resampled training set
X_train_normal = X_train_resampled[y_train_resampled == 0]

# Define the hyperparameter grid
param_grid = {
    'nu': [.015,.014,.013,.012,.011,.010,.09,.08,.07,.05,.009],
    'gamma': ['scale', 'auto'],
}

# Iterate over all combinations of hyperparameters
for params in ParameterGrid(param_grid):
    one_class_svm = OneClassSVM(**params)
    one_class_svm.fit(X_train_normal)
    
    y_test_pred = one_class_svm.predict(X_test)
    
    score = custom_scorer(y_test, y_test_pred)
    
    if score > best_score:
        best_score = score
        best_params = params

# Finally, train a model with the best parameters on the full training set and evaluate it on the test set
one_class_svm_best = OneClassSVM(kernel='rbf', **best_params)
one_class_svm_best.fit(X_train_normal)
y_pred = one_class_svm_best.predict(X_test)

# Convert the predicted labels to match the original labels (0 for normal, 1 for anomaly/fraud)
y_pred = np.where(y_pred == 1, 0, 1)

print(f"Best Parameters:", best_params)
print("Accuracy (roc_auc_score):", roc_auc_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\n")

Method: RandomUnderSampler
Class distribution: Counter({0: 3910, 1: 391})


Best Parameters: {'gamma': 'scale', 'nu': 0.015}
Accuracy (roc_auc_score): 0.9198325567594834

Confusion Matrix:
 [[53374  3487]
 [   10    91]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97     56861
           1       0.03      0.90      0.05       101

    accuracy                           0.94     56962
   macro avg       0.51      0.92      0.51     56962
weighted avg       1.00      0.94      0.97     56962





In [31]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import ParameterGrid

def custom_scorer(y, y_pred):
    # Convert the predicted labels to match the original labels (0 for normal, 1 for anomaly/fraud)
    y_pred = np.where(y_pred == 1, 0, 1)
    return roc_auc_score(y, y_pred)

best_score = 0
best_params = None

# random undersampling
rus = RandomUnderSampler(sampling_strategy=.1, random_state=0)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Filter out non-fraudulent data from the resampled training set
X_train_normal = X_train_resampled[y_train_resampled == 0]

# Define the hyperparameter grid
param_grid = {
    'nu': [0.1],
    'gamma': ['scale']
}

# Iterate over all combinations of hyperparameters
for params in ParameterGrid(param_grid):
    # Train a One-Class SVM model on the non-fraudulent data
    one_class_svm = OneClassSVM(kernel='rbf', **params)
    one_class_svm.fit(X_train_normal)
    
    # Get predictions on the validation set
    y_test_pred = one_class_svm.predict(X_test)
    
    # Compute the custom score
    score = custom_scorer(y_test, y_test_pred)
    
    # If the score is better than the current best, update the best score and best parameters
    if score > best_score:
        best_score = score
        best_params = params

# Finally, train a model with the best parameters on the full training set and evaluate it on the test set
one_class_svm_best = OneClassSVM(kernel='rbf', **best_params)
one_class_svm_best.fit(X_train_normal)
y_pred = one_class_svm_best.predict(X_test)

# Convert the predicted labels to match the original labels (0 for normal, 1 for anomaly/fraud)
y_pred = np.where(y_pred == 1, 0, 1)

print(f"Best Parameters:", best_params)
print("Accuracy (roc_auc_score):", roc_auc_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\n")

Best Parameters: {'gamma': 'scale', 'nu': 0.1}
Accuracy (roc_auc_score): 0.9058458850619594

Confusion Matrix:
 [[50798  6066]
 [    8    90]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94     56864
           1       0.01      0.92      0.03        98

    accuracy                           0.89     56962
   macro avg       0.51      0.91      0.49     56962
weighted avg       1.00      0.89      0.94     56962



