# SVC Model

In [16]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score

In [2]:
df = pd.read_csv("./creditcard.csv")
df.shape

(284807, 31)

In [3]:
df.Class.value_counts().max() / df.Class.value_counts().sum()

0.9982725143693799

In [4]:
df = df.dropna()

In [5]:
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
svm = SVC(kernel='linear', random_state=45, probability=True)
svm.fit(X_train, y_train)

In [17]:
y_pred = svm.predict(X_test)
y_pred_proba = svm.predict_proba(X_test)[:, 1]
auprc = average_precision_score(y_true=y_test, y_score=y_pred_proba)

In [18]:
print("Accuracy (roc_auc_score):", roc_auc_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAverage Precision Score:", auprc)

Accuracy (roc_auc_score): 0.9107328083892612

Confusion Matrix:
 [[56843    18]
 [   18    83]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.82      0.82      0.82       101

    accuracy                           1.00     56962
   macro avg       0.91      0.91      0.91     56962
weighted avg       1.00      1.00      1.00     56962


Average Precision Score: 0.7535384071829317


In [None]:
y_pred_proba = grid_search.predict_proba(X_test)[:,1]
average_precision_score(y_true=y, y_score=y_pred_proba)

# Random Undersampling + Anomaly Detection

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

df = pd.read_csv("./creditcard.csv")

X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
def train_evaluate(undersampling_ratio, X_train, y_train, X_test, y_test):
    # random undersampling
    rus = RandomUnderSampler(sampling_strategy=undersampling_ratio, random_state=0)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
    
    # Filter out non-fradulent data from the resampled training set
    X_train_normal = X_train_resampled[y_train_resampled == 0]

    # Train a One-Class SVM model on the non-fradulent data
    one_class_svm = OneClassSVM(kernel='rbf', nu=0.01, gamma='scale')
    one_class_svm.fit(X_train_normal)
    y_pred = one_class_svm.predict(X_test)

    # # Convert the predicted labels to match the original labels (0 for normal, 1 for anomaly/fraud)
    y_pred = np.where(y_pred == 1, 0, 1)

    print(f"Undersampling Ratio: {undersampling_ratio * 100}%")
    print("Accuracy (roc_auc_score):", roc_auc_score(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\n")

In [8]:
# Ratio Selection
print(f"Percentage of non-fradulent records: majority class:{(df.Class.value_counts().max() / df.Class.value_counts().sum()).round(3)}")
print(f"Percentage of fraud records (minority class):{(df.Class.value_counts().min() / df.Class.value_counts().sum()).round(3)}")

Percentage of non-fradulent records: majority class:0.998
Percentage of fraud records (minority class):0.002


In [22]:
undersampling_ratios = [0.1]

for ratio in undersampling_ratios:
    train_evaluate(ratio, X_train, y_train, X_test, y_test)

Undersampling Ratio: 10.0%
Accuracy (roc_auc_score): 0.9240978116063533

Confusion Matrix:
 [[53454  3410]
 [    9    89]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97     56864
           1       0.03      0.91      0.05        98

    accuracy                           0.94     56962
   macro avg       0.51      0.92      0.51     56962
weighted avg       1.00      0.94      0.97     56962





# Week 2 Improvements

# Undersampling Improvements

In [33]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss,TomekLinks, EditedNearestNeighbours, CondensedNearestNeighbour, OneSidedSelection
from collections import Counter

In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,roc_auc_score, average_precision_score


from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import ParameterGrid
df = pd.read_csv("./creditcard.csv")
df = df.dropna()

X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape

(227845, 30)

In [35]:
def undersampling_distribution(name, X_train, y_train):
    undersampling_methods = {
        "RandomUnderSampler": RandomUnderSampler(sampling_strategy=0.1, random_state=0),
        "NearMiss": NearMiss(version=1),
        "TomekLinks": TomekLinks(sampling_strategy='majority'),
        "EditedNearestNeighbours": EditedNearestNeighbours(sampling_strategy='majority'),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(random_state=0),
        "OneSidedSelection": OneSidedSelection(random_state=0)
    }
    method = undersampling_methods[name]
    X_resampled, y_resampled = method.fit_resample(X_train, y_train)
    
    # Print the number of instances in each class for the resampled data
    print(f"Method: {name}")
    print("Class distribution:", Counter(y_resampled))
    print("\n")
    return X_resampled, y_resampled

In [39]:
print("Original class distribution:", Counter(y_train))

Original class distribution: Counter({0: 227454, 1: 391})


In [19]:
randomX,randomY = undersampling_distribution("RandomUnderSampler",X_train, y_train)

Method: RandomUnderSampler
Class distribution: Counter({0: 39100, 1: 391})




In [1]:
nearmissX, nearmissY = undersampling_distribution("NearMiss", X_train, y_train)

NameError: name 'undersampling_distribution' is not defined

In [4]:
tomekX, tomekY = undersampling_distribution("TomekLinks", X_train, y_train)

Method: TomekLinks
Class distribution: Counter({0: 28414, 1: 51})




In [None]:
ennX, ennY = undersampling_distribution("EditedNearestNeighbours", X_train, y_train)



In [None]:
cnnX, cnnY = undersampling_distribution("CondensedNearestNeighbour", X_train, y_train)

In [None]:
ossX, ossY = undersampling_distribution("OneSidedSelection", X_train, y_train)



In [36]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
from sklearn.metrics import f1_score

def custom_scorer(y, y_pred):
    # Convert the predicted labels to match the original labels (0 for normal, 1 for anomaly/fraud)
    y_pred = np.where(y_pred == -1, 1, 0)
    y_pred_proba = grid_search.predict_proba(X_test)
    return average_precision_score(y_true=y, y_score=y_pred)

best_score = 0
best_params = None

X_train_resampled, y_train_resampled = undersampling_distribution("RandomUnderSampler",X_train, y_train)
# X_train_resampled, y_train_resampled = tomekX, tomekY
# Filter out non-fraudulent data from the resampled training set
X_train_normal = X_train_resampled[y_train_resampled == 0]

# Define the hyperparameter grid
param_grid = {
    'nu': [.015,],
    'gamma': ['auto'],
}

# Iterate over all combinations of hyperparameters
for params in ParameterGrid(param_grid):
    one_class_svm = OneClassSVM(**params)
    one_class_svm.fit(X_train_normal)
    
    y_test_pred = one_class_svm.predict(X_test)
    
    score = custom_scorer(y_test, y_test_pred)
    
    if score > best_score:
        best_score = score
        best_params = params

# Finally, train a model with the best parameters on the full training set and evaluate it on the test set
one_class_svm_best = one_class_svm
one_class_svm_best.fit(X_train_normal)

Method: RandomUnderSampler
Class distribution: Counter({0: 3910, 1: 391})




NameError: name 'grid_search' is not defined

In [31]:
y_pred = one_class_svm_best.predict(X_test)
print(Counter(y_pred))
y_pred = np.where(y_pred == -1, 0, 1)
print(f"Best Parameters:", best_params)
print("Accuracy:", best_score) # This will now print the best F1 score
print("Accuracy (roc_auc_score):", roc_auc_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\n")

Counter({-1: 255631, 1: 696})
Best Parameters: {'gamma': 'auto', 'nu': 0.015}
Best F1 Score: 0.0034350074775672985
Accuracy (roc_auc_score): 0.4986400193836318

Confusion Matrix:
 [[255190    696]
 [   441      0]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    255886
           1       0.00      0.00      0.00       441

    accuracy                           1.00    256327
   macro avg       0.50      0.50      0.50    256327
weighted avg       1.00      1.00      1.00    256327



