In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install -r ../input/transactions/requirements_financial_transactions.txt
# !pip install 'scikit_learn==0.24.2'

In [None]:
import sklearn
# Report the installed scikit-learn version.
print(f'The scikit-learn version is {sklearn.__version__}.')

In [None]:
import pandas as pd
# Load the transactions dataset from the Kaggle input directory.
df = pd.read_csv("../input/transactions/transactions.csv")
# Preview the first rows as the cell output.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set. The split is stratified on "Class" so
# both parts keep the same class ratio, and it is seeded so that a
# "Restart & Run All" reproduces the same split (and therefore the same scores).
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = "Class"),
                                                    df["Class"],
                                                    test_size = 0.2,
                                                    stratify = df["Class"],
                                                    random_state = 123)
print(f"{y_train.size} train samples\n {y_test.size} test samples")

In [None]:
from sklearn.dummy import DummyClassifier
# Fit a DummyClassifier on the training split; it is assigned to `clf` further down as a placeholder model.
dummy = DummyClassifier().fit(X_train, y_train)

In [None]:
import pickle

def save_model(model, filename='model.sav'):
    """
    Serializes an object to disk with pickle.

    :param model: any picklable object (e.g. a fitted scaler or estimator)
    :param filename: path of the file to write; an existing file is overwritten
    """
    # The context manager flushes and closes the file even if pickling fails.
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
    
def load_model(filename='model.sav'):
    """
    Loads an object that was previously stored with pickle.

    :param filename: path of the pickle file to read
    :returns: the deserialized object
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook wrote itself.
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [None]:
import numpy as np
# Number of non-zero labels in each split (reported as frauds).
print("y_train frauds", np.count_nonzero(y_train))
print("y_test frauds", np.count_nonzero(y_test))

In [None]:
from sklearn.utils import resample

def resample_data(n_samples=0, random_state=123):
    """
    Builds a class-balanced training set by bootstrapping both classes of the
    global X_train / y_train to the same size.

    :param n_samples: number of rows to draw per class (with replacement);
                      if <= 0, the number of rows whose label is not 1 is used
    :param random_state: seed used for both bootstrap draws and the final shuffle
    :returns: tuple (X, y) of numpy arrays holding 2 * n_samples shuffled rows
    """
    minority_idx = y_train == 1
    majority_idx = np.logical_not(minority_idx)
    if n_samples <= 0:
        n_samples = y_train[majority_idx].shape[0]

    X_0, y_0 = resample(X_train[majority_idx],
                        y_train[majority_idx],
                        replace=True,
                        n_samples=n_samples,
                        random_state=random_state)

    X_1, y_1 = resample(X_train[minority_idx],
                        y_train[minority_idx],
                        replace=True,
                        n_samples=n_samples,
                        random_state=random_state)

    X = np.vstack((X_0, X_1))
    y = np.hstack((y_0, y_1))

    # Shuffle with a dedicated seeded generator: the bootstraps were already
    # seeded, but the global np.random state made the row order (and hence any
    # order-dependent training) differ between runs.
    permut = np.random.RandomState(random_state).permutation(X.shape[0])

    return X[permut], y[permut]

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def pca():
    """
    Prints and plots the PCA reconstruction error of the global X_train for
    2 to 9 components.

    For every component count the training data is projected, mapped back with
    inverse_transform and compared to the original by mean squared error.
    """
    component_counts = range(2, 10)
    X_values = np.asarray(X_train)
    errors = []

    for i in component_counts:
        reducer = PCA(n_components=i)
        X_reduced = reducer.fit_transform(X_train)
        X_reconstructed = reducer.inverse_transform(X_reduced)

        # Mean squared error over all entries, computed on plain arrays so the
        # result does not depend on how pandas reduces a DataFrame under np.mean.
        error = np.mean((X_values - X_reconstructed) ** 2)
        errors.append(error)
        print('i:', i)
        print('Error:', error)

    plt.plot(component_counts, errors)
    plt.xlabel('number of components')
    plt.ylabel('mean squared reconstruction error')

In [None]:
import seaborn as sns

def plot_best_features():
    """
    Displays the ten most important features according to an sklearn
    ExtraTreesClassifier fitted on the whole dataset `df`.

    Prints the raw importances and shows them as a horizontal bar chart.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    # Select features and target by name, exactly like the train/test split
    # cell does, instead of relying on the column positions.
    X = df.drop(columns="Class")  # independent columns
    y = df["Class"]               # target column

    # Seeded so the reported importances are reproducible between runs.
    model = ExtraTreesClassifier(random_state=123)
    model.fit(X, y)
    print(model.feature_importances_)  # built-in importances of tree-based classifiers

    # Plot the importances for better visualization.
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.show()

def plot_corr():
    """
    Displays the correlation matrix of all columns of `df` as an annotated heatmap.
    """
    # Compute the matrix once and reuse it for the plot.
    corr = df.corr()
    plt.figure(figsize=(20, 10))
    sns.heatmap(corr, annot=True)
    
# plot_best_features()
# plot_corr()

In [None]:
from sklearn.preprocessing import StandardScaler
# Placeholder model for leader_board_predict_fn, which reads the globals `clf` and `standardScaler`.
clf = dummy
# NOTE(review): this scaler is not fitted in this cell; transform() presumably fails
# until a fitted scaler is assigned (train_LogisticRegression does that later) — confirm.
standardScaler = StandardScaler()

In [None]:
def leader_board_predict_fn(values):
    """
    Scales `values` with the global `standardScaler` and returns what the
    global `clf` predicts for the scaled data.
    """
    scaled = standardScaler.transform(values)
    return clf.predict(scaled)

In [None]:
### LEADER BOARD TEST
from sklearn.metrics import roc_auc_score
# score = roc_auc_score(y_test, leader_board_predict_fn(X_test))
# print(f"Leaderboard Score: {score}")
### LEADER BOARD TEST

def leader_board_predict_fn_sklearn(standardScaler, clf, values):
    """
    Same as leader_board_predict_fn, but with the scaler and the model passed in
    explicitly instead of being read from globals.

    :param standardScaler: fitted object offering transform(values)
    :param clf: fitted object offering predict(values)
    :param values: samples to predict
    :returns: whatever clf.predict returns for the transformed samples
    """
    return clf.predict(standardScaler.transform(values))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Candidate sklearn classifiers with default hyperparameters, keyed by the label
# that test_classifiers_resampler prints for them.
# NOTE: a later cell defines its own class named LogisticRegression; re-running
# this cell after that one would put that class into the dict instead of sklearn's.
classifiers = {
    "LDA": LDA(),
    "SVM": SVC(),
    "DTREE": DecisionTreeClassifier(),
    "LOG_REG": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "NAIVE_BAYES": GaussianNB()
}

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# imblearn resampling strategies (all seeded with 12), keyed by the label that
# test_classifiers_resampler prints for them.
resampler = {
    "oversample": RandomOverSampler(random_state=12),
    "undersample": RandomUnderSampler(random_state=12),
    "smote": SMOTE(random_state=12),
}

In [None]:
def _fit_and_score_classifiers(classifiers, X, y, scaler, skip=()):
    """
    Fits every classifier whose key is not in `skip` on (X, y) and prints its
    ROC AUC on the global test split (X_test is scaled with `scaler`).
    """
    for key, classifier in classifiers.items():
        if key in skip:
            continue

        print("Classifier:", key)
        clf = classifier.fit(X, y)
        score = roc_auc_score(y_test, leader_board_predict_fn_sklearn(scaler, clf, X_test))
        print(f"Leaderboard Score: {score}")


def test_classifiers_resampler(classifiers, resampler):
    """
    Does combinations of the given classifiers and resamplers and prints the roc_auc_score.
    The data is scaled with StandardScaler before fitting the model.
    Uses preimplemented sklearn models and imblearn sampling to get an overview of classifier performance.

    :param classifiers: dict mapping a label to an sklearn-style estimator (fitted in place)
    :param resampler: dict mapping a label to an imblearn-style sampler offering fit_resample
    """
    # SVM and KNearest would take too long on the larger problem sizes without real benefit.
    slow_classifiers = ("SVM", "KNearest")

    # Part 1: balanced bootstraps of increasing size from resample_data.
    for n_samples in [5000, 7500, 10000, 20000, 50000]:
        X, y = resample_data(n_samples=n_samples)
        standardScaler = StandardScaler()
        X = standardScaler.fit_transform(X, y)

        print("Resample to: N =", n_samples)
        skip = slow_classifiers if n_samples > 5000 else ()
        _fit_and_score_classifiers(classifiers, X, y, standardScaler, skip)

    # Part 2: the imblearn samplers on the full training split. The slow
    # classifiers are always skipped here; their performance is covered in part 1.
    # (The loop variable is not called `resample`, to avoid shadowing sklearn.utils.resample.)
    for key_resample, sampler in resampler.items():
        X, y = sampler.fit_resample(X_train, y_train)
        print("Resampler:", key_resample)

        standardScaler = StandardScaler()
        X = standardScaler.fit_transform(X, y)
        _fit_and_score_classifiers(classifiers, X, y, standardScaler, slow_classifiers)
            
            
# UNCOMMENT the line below if you want to compare different classifiers and resamplers
# test_classifiers_resampler(classifiers, resampler)

In [None]:
class Classifier:
    """
    Common interface (train / predict) for the classifiers of this notebook.
    The base class itself classifies nothing: both methods do nothing and
    return None. Use sub-classes instead.
    """

    def train(self, X, y):
        """Fit on features X and labels y; a no-op in the base class."""
        return None

    def predict(self, X):
        """Predict for features X; a no-op in the base class (returns None)."""
        return None

In [None]:
class LogisticRegression(Classifier):
    """
    Logistic regression without intercept term, fitted by batch gradient descent.

    NOTE: the name shadows sklearn's LogisticRegression imported earlier in this
    notebook; once this cell has run, `LogisticRegression` refers to this class.
    """

    def __init__(self):
        # Weight vector; None until train() has been called.
        self.w = None

    def sigmoid(self, z):
        """
        Function that computes the sigmoid of the input values.

        :param z: input values
        :returns: sigmoid values for each input value
        """
        # 1 / (1 + exp(-z)) rewritten as exp(-log(1 + exp(-z))). logaddexp does
        # not overflow, so large negative z gives 0.0 without a RuntimeWarning.
        return np.exp(-np.logaddexp(0, -z))

    def loss_function_gradient(self, w, x, y):
        """
        Function that computes the gradient of the empirical (cross-entropy)
        loss of a logistic regression model.

        :param w: Weights vector
        :param x: Training input data
        :param y: Training target labels
        :returns: gradient of the loss with respect to w
        """
        N = y.shape[0]

        f_x = self.sigmoid(x @ w)

        gradient = np.dot(x.T, (f_x - y)) / N

        return gradient

    def batch_gradient_descent(self, x, y, alpha=0.01, num_steps=5000):
        """
        Implementation of the gradient descent algorithm for logistic regression

        :param: x: Training input data
        :param: y: Training target labels
        :param: alpha: Scalar learning rate
        :param: num_steps: Number of gradient descent steps
        :returns: weight vector 'w'
        """
        # Work on plain float arrays so DataFrames/Series are accepted too and a
        # column-shaped y cannot silently broadcast (f_x - y) into a matrix.
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        # Initialize the weights to zero
        w = np.zeros((x.shape[1]))

        for _ in range(num_steps):
            w = w - alpha * self.loss_function_gradient(w, x, y)

        return w

    def train(self, X, y, alpha=0.01, num_steps=5000):
        """
        Fits the weight vector on the given data and stores it in self.w.

        :param: X: Training input data
        :param: y: Training target labels
        :param: alpha: Scalar learning rate
        :param: num_steps: Number of gradient descent steps
        """
        self.w = self.batch_gradient_descent(X, y, alpha=alpha, num_steps=num_steps)
        print("Finished training")

    def predict(self, x):
        """
        Scores the input with the logistic regression model.

        Returns the sigmoid outputs themselves rather than rounded labels, so a
        ranking metric such as ROC AUC can use the continuous scores.

        :param: x: Test input data
        :returns: sigmoid output (value between 0 and 1) for each sample, or
                  None (after printing a message) if the model was not trained
        """
        if self.w is None:
            print("Weights not specified. Train the model first")
            return

        return self.sigmoid(x @ self.w)


In [None]:
def train_LogisticRegression():
    """
    Trains the notebook's own LogisticRegression class on an undersampled training set.

    The global X_train / y_train are resampled with RandomUnderSampler
    (sampling_strategy=0.2) and the features are then standardized.
    :returns: the fitted standardScaler and the trained model
    """
    clf = LogisticRegression()
    standardScaler = StandardScaler()

    # Seeded like the other resamplers in this notebook, so a re-run trains on
    # the same subset and yields the same model.
    X, y = RandomUnderSampler(sampling_strategy=0.2, random_state=12).fit_resample(X_train, y_train)
    X = standardScaler.fit_transform(X, y)

    clf.train(X, y)

    return standardScaler, clf

# Replace the global scaler/model read by leader_board_predict_fn with the trained ones.
standardScaler, clf = train_LogisticRegression()

In [None]:
## LEADER BOARD TEST
from sklearn.metrics import roc_auc_score
# ROC AUC on the held-out split for the current global `clf`. When the cells are run
# in order this is the logistic regression trained above, whose predict() returns
# sigmoid outputs rather than 0/1 labels.
score = roc_auc_score(y_test, leader_board_predict_fn(X_test))
print(f"Leaderboard Score: {score}")
## LEADER BOARD TEST

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Check https://pytorch.org/docs/stable/notes/randomness.html#reproducibility
# Seed PyTorch's random number generator.
torch.manual_seed(123)
print("gpu available:", torch.cuda.is_available())
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Define hyperparameters
LEARNING_RATE = 0.0002  # step size passed to the Adam optimizer
INPUT_SIZE = 30  # input width of the first layer; presumably the number of feature columns — confirm
HIDDEN_SIZE = 11  # width of each hidden layer of Net
OUTPUT_SIZE = 1  # a single output (logit) per sample

NUM_EPOCHS = 3  # passes over the training data
BATCH_SIZE = 64  # samples per DataLoader batch

MODEL_NAME = 'model.pt'  # file the network is saved to / loaded from
SCALER_NAME = 'scaler.sav'  # file the fitted scaler is pickled to / loaded from

In [None]:
class Net(nn.Module):
    """
    Fully connected network: three ReLU hidden layers of equal width followed by
    a linear output layer that returns raw logits (no final activation).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size

        # Layers are created in the order fc1..fc4, so parameter names and the
        # seeded weight initialisation stay exactly as before.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Flattens everything but the batch dimension, applies the three hidden
        layers with ReLU and returns the output layer's logits.
        """
        out = torch.flatten(x, 1)
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            out = F.relu(hidden_layer(out))
        return self.fc4(out)

In [None]:
def binary_acc(y_pred, y_test):
    """
    Accuracy of logits `y_pred` against the 0/1 targets `y_test`.

    The logits are passed through a sigmoid and rounded to obtain class labels.
    :returns: the accuracy in percent, rounded to a whole number (0-dim tensor)
    """
    predicted_labels = torch.sigmoid(y_pred).round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)

In [None]:
from tqdm.notebook import tqdm

def train_neural_network_pytorch(net, train_loader, optimizer, criterion, num_epochs):
    """
    Function for training the PyTorch network.

    Prints the loss of every 1000th batch and, per epoch, the mean batch loss
    and mean batch accuracy.

    :param net: the neural network object (returns logits)
    :param train_loader: iterable yielding (inputs, labels) batches, e.g. a DataLoader
    :param optimizer: PyTorch optimizer instance
    :param criterion: PyTorch loss function, called as criterion(outputs, labels)
    :param num_epochs: number of passes over train_loader
    """
    net.train()  # Before training, set the network to training mode

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        epoch_acc = 0.0
        for batch_idx, (inputs, labels) in enumerate(tqdm(train_loader)):
            inputs = inputs.to(device)
            # unsqueeze(1) adds a trailing dimension so the labels line up with
            # the network output (one column per sample for a single-logit net).
            labels = labels.unsqueeze(1).to(device)

            # 1. Zero parameter gradients
            optimizer.zero_grad()
            # 2. Forward (call the module rather than .forward() so hooks run)
            outputs = net(inputs)
            # 3. Compute loss
            loss = criterion(outputs, labels)
            # calculate current accuracy
            acc = binary_acc(outputs, labels)
            # 4. Backward
            loss.backward()
            # 5. Update step
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            if batch_idx % 1000 == 999:
                print(f'Loss: {loss.item():.5f}')

        print(f'Epoch {epoch}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

    print('Finished Training')

In [None]:
# Initialize the network
net = Net(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
net = net.to(device)

# Define the loss criterion and the training algorithm

# BCEWithLogitsLoss applies the sigmoid itself, which is why Net.forward returns raw logits.
criterion = nn.BCEWithLogitsLoss().to(device)  # binary cross entropy

# Using Adam optimizer instead of SGD
# Adam was faster converging to the (nearly) optimum
# Adam Optimizer does not need a momentum
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [None]:
import torch
from torch.utils.data import Dataset

class trainData(Dataset):
    """
    Minimal torch Dataset that pairs a feature container with a label container
    so a DataLoader can iterate over (features, label) samples.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X_data)

In [None]:
from torch.utils.data import DataLoader

def train_nn():
    """
    Balances the training split with SMOTE, standardizes it, wraps it in a
    shuffling DataLoader and trains the global `net` on it.

    :returns: the StandardScaler fitted on the resampled training data
    """
    features, targets = SMOTE(random_state=12).fit_resample(X_train.values, y_train.values.ravel())

    scaler = StandardScaler()
    features = scaler.fit_transform(features, targets)

    loader = DataLoader(
        dataset=trainData(torch.FloatTensor(features), torch.FloatTensor(targets)),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

    # Train the PyTorch network with the globally defined optimizer and loss.
    train_neural_network_pytorch(net, loader, optimizer, criterion, NUM_EPOCHS)

    return scaler

scaler = train_nn()

In [None]:
# save standardScaler (pickled to SCALER_NAME; leader_board_predict_fn loads it from there)
save_model(scaler, filename=SCALER_NAME)
# save nn-model (the whole module object is saved, not only its state_dict)
torch.save(net, MODEL_NAME)

In [None]:
def leader_board_predict_fn(values):
    """
    Function for producing neural network predictions.

    Replaces the earlier function of the same name. The scaler and the network
    are loaded from SCALER_NAME / MODEL_NAME on every call, so the function
    depends only on the files written by the training cells.

    :param values: samples to classify; a pandas DataFrame or any 2-D array-like
                   of shape (n_samples, n_features)
    :returns: int tensor with the predicted class (0 or 1) for every sample,
              one column per network output
    """
    # standardscaler for scaling the data (preprocessing);
    # np.asarray accepts DataFrames as well as plain arrays.
    scaler = load_model(filename=SCALER_NAME)
    X = scaler.transform(np.asarray(values, dtype=np.float32))

    net = torch.load(MODEL_NAME, map_location='cpu')
    net.eval()

    # Inference only: no_grad keeps PyTorch from recording an autograd graph,
    # which matters when scoring the whole training split in one batch.
    with torch.no_grad():
        logits = net(torch.tensor(X))
        # class 0 if sigmoid < 0.5, class 1 if sigmoid > 0.5
        predictions = torch.round(torch.sigmoid(logits))

    return predictions.int()

In [None]:
### LEADER BOARD TEST
from sklearn.metrics import roc_auc_score
# ROC AUC of the neural network on the held-out split. leader_board_predict_fn
# returns rounded 0/1 predictions, so the AUC is computed from hard labels.
y_pred = leader_board_predict_fn(X_test)
score_test = roc_auc_score(y_test, y_pred)
print(f"test: Leaderboard Score: {score_test}")
### LEADER BOARD TEST

# Same metric on the training split, to compare against the test score.
score_train = roc_auc_score(y_train, leader_board_predict_fn(X_train))
print(f"train: Leaderboard Score: {score_train}")

# Unweighted mean of the two scores.
mean_score = np.mean([score_test, score_train])
print(f"mean: Leaderboard Score: {mean_score}")

In [None]:
from sklearn.metrics import classification_report

# Print the classification report; useful to see whether the minority class is predicted well enough.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import StratifiedKFold
from pandas import DataFrame

def cross_validation():
    """
    Runs a stratified 5-fold cross validation of the neural network on the whole dataset.

    For every fold a fresh network is trained on the oversampled, standardized
    training part and scored (ROC AUC plus classification report) on the
    held-out part. The fold scores are plotted at the end.
    Note: every fold overwrites the files SCALER_NAME and MODEL_NAME.
    """
    features = df.drop(columns = "Class").values
    labels = df["Class"].values

    scores = []
    for train_idx, test_idx in StratifiedKFold(n_splits=5).split(features, labels):
        # Balance only the training part of the fold, then standardize it.
        fold_X, fold_y = RandomOverSampler(random_state=12).fit_resample(features[train_idx],
                                                                         labels[train_idx].ravel())
        scaler = StandardScaler()
        fold_X = scaler.fit_transform(fold_X, fold_y)

        # Fresh network, loss criterion (binary cross entropy) and optimizer per fold.
        net = Net(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE).to(device)
        criterion = nn.BCEWithLogitsLoss().to(device)
        optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

        train_loader = DataLoader(
            dataset=trainData(torch.FloatTensor(fold_X), torch.FloatTensor(fold_y)),
            batch_size=BATCH_SIZE,
            shuffle=True,
        )
        train_neural_network_pytorch(net, train_loader, optimizer, criterion, num_epochs=NUM_EPOCHS)

        # leader_board_predict_fn reads scaler and model from disk, so persist both first.
        save_model(scaler, filename=SCALER_NAME)
        torch.save(net, MODEL_NAME)

        ### LEADER BOARD TEST
        held_out_y = labels[test_idx]
        y_pred = leader_board_predict_fn(DataFrame(features[test_idx]))
        score_test = roc_auc_score(held_out_y, y_pred)
        print(f"Leaderboard Score: {score_test}")
        scores.append(score_test)
        print(classification_report(held_out_y, y_pred))

    plt.plot(scores)
    plt.show()
# cross_validation()    