In [1]:
# Import statements
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Models
from sklearn.linear_model import SGDClassifier # SGD model

In [2]:
# Reading into data frame from file
data_frame = pd.read_csv('./creditcard.csv')

In [3]:
# Load models from files if desired
from pickle import load
load_models = True
save_models = False

if load_models:
    with open("SGDClassifier_model.pkl", "rb") as file:
        clf = load(file)
    with open("SGDClassifier_model_resampled.pkl", "rb") as file:
        clf_resampled = load(file)
else:
    clf, clf_resampled = None, None

In [4]:
# Data to fit and targets
feature_set = ["Time", "V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28", "Amount"]
target_set = ["Class"]

X = data_frame[feature_set]
y = data_frame[target_set]

In [5]:
# Splits data into 80-20 train-test
# Prevents data leakage (where training and test sets influence each other in scaling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [6]:
# Scale features
standard_scaler = StandardScaler()

# Standard scaling
X_train_standard_scaled = standard_scaler.fit_transform(X_train)
X_test_standard_scaled = standard_scaler.transform(X_test)

In [7]:
# Balancing data set with resampling

# Separate majority and minority classes in training set
X_train_majority = X_train[y_train["Class"] == 0]
y_train_majority = y_train[y_train["Class"] == 0]
X_train_minority = X_train[y_train["Class"] == 1]
y_train_minority = y_train[y_train["Class"] == 1]

# Oversample minority class
X_minority_upsampled, y_minority_upsampled = resample(
    X_train_minority, y_train_minority,
    replace=True,                      # sample with replacement
    n_samples=len(y_train_majority),  # match majority class
    random_state=1
)

# Combine majority and upsampled minority
X_train_resampled = pd.concat((X_train_majority, X_minority_upsampled))
y_train_resampled = pd.concat((y_train_majority, y_minority_upsampled))

In [8]:
# Balancing data set with resampling ON SCALED DATA

# Separate majority and minority classes in training set
X_train_majority_standard_scaled = X_train_standard_scaled[y_train["Class"] == 0]
X_train_minority_standard_scaled = X_train_standard_scaled[y_train["Class"] == 1]

# Oversample minority class
X_minority_upsampled_standard_scaled, y_minority_upsampled = resample(
    X_train_minority_standard_scaled, y_train_minority,
    replace=True,                      # sample with replacement
    n_samples=len(y_train_majority),  # match majority class
    random_state=1
)

# Combine majority and upsampled minority
X_train_resampled_standard_scaled = pd.concat((X_train_majority_standard_scaled, X_minority_upsampled_standard_scaled))
y_train_resampled = pd.concat((y_train_majority, y_minority_upsampled))

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [None]:
# Initialize model(s)
if clf == None:
    clf = SGDClassifier(random_state=1) # Apparently prefers standard scaling
if clf_resampled == None:
    clf_resampled = SGDClassifier(random_state=1)

In [None]:
# Collect predictions from non-resample trained model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Collect predictions from resample trained model
clf_resampled.fit(X_train_resampled, y_train_resampled)
y_resampled_pred = clf_resampled.predict(X_test)

# Collect predictions from non-resample trained model (standard scaled)
clf.fit(X_train_standard_scaled, y_train)
y_standard_scale_pred = clf.predict(X_test_standard_scaled)

# Collect predictions from resample trained model (standard scaled)
clf_resampled.fit(X_train_resampled_standard_scaled, y_train_resampled)
y_standard_scale_resampled_pred = clf_resampled.predict(X_test_standard_scaled)