<div class="dataset-header-v2__top-image-container">
    <img src="https://storage.googleapis.com/kaggle-datasets-images/312305/633246/752964d08f6001573444649668b0b011/dataset-cover.jpg?t=2019-08-22-03-58-44" class="Header_CoverImg-sc-1431b7d ibFJYv">
</div>

In [None]:
import numpy as np
import pandas as pd

from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Conv1D
from tensorflow.keras.layers import MaxPooling1D, Dense

import warnings
warnings.filterwarnings('ignore')

from pylab import rcParams
rcParams['figure.figsize'] = 12, 6

<h1 id="dataset" style="color:#a97828; background:#4dc5ea;"> 
    <center>Dataset
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Load the chargeback dataset; the first CSV column becomes the index.
df = pd.read_csv('../input/predict-chargeback-frauds-payment/df.csv', index_col=0)

# Randomly permute the rows. The previous `shuffle(df)` call relied on a name
# that the visible import cell never brings into scope (NameError on a fresh
# kernel); `DataFrame.sample(frac=1)` returns every row in random order using
# pandas alone, keeping the original index labels.
# NOTE(review): the rows are later turned into lag features, so shuffling
# discards whatever order the file had -- confirm this is intended.
df = df.sample(frac=1)
df.head()

In [None]:
def normalize(df):
    """Standardise values by subtracting the mean and dividing by the std.

    Uses the input's own ``mean()`` and ``std()`` methods, so for a pandas
    Series the whole column is scaled and for a DataFrame each column is
    scaled independently.
    """
    centered = df - df.mean()
    spread = df.std()
    return centered / spread

In [None]:
# Keep only the three inputs and the target. `.copy()` yields an independent
# frame, so the column assignments below never write into a slice of the
# loaded data (no SettingWithCopyWarning).
df = df[['Card Number', 'Date', 'Amount', 'CBK']].copy()

# Encode every distinct card number as its position among the sorted unique
# values, then standardise. The result is assigned back explicitly: the former
# chained `df[col].replace(..., inplace=True)` does not modify `df` when pandas
# copy-on-write is active.
card_numbers_to_idx = {v: k for k, v in enumerate(np.unique(df['Card Number'].values))}
df['Card Number'] = normalize(df['Card Number'].map(card_numbers_to_idx))

# Timestamps -> integer epoch counts -> standardised. The explicit 'int64'
# avoids the platform-dependent width of plain `int`.
df['Date'] = normalize(pd.to_datetime(df['Date']).astype('int64'))

df['Amount'] = normalize(df['Amount'])

# Map the textual labels to 0/1. Only 'CBK' can still contain 'No'/'Yes' at
# this point, because the other three columns are floats after normalisation.
df['CBK'] = df['CBK'].replace({'No': 0, 'Yes': 1})

# Export one float matrix with the target in the last column. Forcing the
# dtype keeps the array numeric even if `replace` leaves 'CBK' as object dtype.
data = df.to_numpy(dtype=float)

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a sequence as a supervised-learning matrix of lagged copies.

    Args:
        data: Observations in row order; anything ``DataFrame`` accepts
            (a list, a 1-D array, or a 2-D array with one column per variable).
        n_in: Number of lagged steps (t-n_in, ..., t-1) placed first in each row.
        n_out: Number of forward steps (t, ..., t+n_out-1) placed after the lags.
        dropnan: When true, drop every row containing a NaN. This removes the
            incomplete rows created by shifting, and also any row that already
            held a NaN in ``data``.

    Returns:
        A 2-D numpy array with ``(n_in + n_out) * n_vars`` columns per row.
    """
    # The old, unused `n_vars = data.shape[1]` lookup was dropped: it raised
    # IndexError for 1-D arrays although nothing below depended on it.
    df = DataFrame(data)
    # Input sequence, oldest lag first: t-n_in, ..., t-1.
    cols = [df.shift(lag) for lag in range(n_in, 0, -1)]
    # Forecast sequence: t, t+1, ..., t+n_out-1.
    cols.extend(df.shift(-lead) for lead in range(n_out))
    agg = concat(cols, axis=1)
    if dropnan:
        agg = agg.dropna()
    return agg.values

In [None]:
# Window the preprocessed frame directly instead of re-assigning `data` from
# itself, so re-running this cell cannot stack a second round of lags on an
# already-windowed array. With 4 columns and n_in=6 each row holds 28 values:
# 27 inputs followed by the current row's 'CBK' target.
data = series_to_supervised(df.to_numpy(dtype=float), n_in=6)

<h1 id="models" style="color:#a97828; background:#4dc5ea;"> 
    <center>Models
        <a class="anchor-link" href="#models" target="_self">¶</a>
    </center>
</h1>

In [None]:
class RandomForest():
    """Thin ``train``/``predict`` wrapper around ``RandomForestRegressor``.

    The estimator count starts at 500 and is raised by 500 after every
    ``train`` call, so successive fits request 500, 1000, 1500, ... trees.
    NOTE(review): ``warm_start`` is not enabled here, so presumably each fit
    rebuilds the whole forest -- confirm whether incremental growth was meant.
    """

    def __init__(self):
        self.n_estimators = 500
        self.model = RandomForestRegressor(n_estimators=self.n_estimators)

    def train(self, train):
        """Fit on rows whose last column is the target, then raise the tree count."""
        rows = asarray(train)
        features = rows[:, :-1]
        target = rows[:, -1]
        self.model.set_params(n_estimators=self.n_estimators)
        self.model.fit(features, target)
        self.n_estimators = self.n_estimators + 500

    def predict(self, test):
        """Return the model's prediction for a single feature vector."""
        single_row_batch = [test]
        return self.model.predict(single_row_batch)[0]

In [None]:
class Xgboost():
    """Thin ``train``/``predict`` wrapper around ``XGBRegressor``.

    Uses the squared-error objective. The estimator count starts at 500 and is
    raised by 500 after every ``train`` call, so successive fits request
    500, 1000, 1500, ... boosting rounds.
    """

    def __init__(self):
        self.n_estimators = 500
        self.model = XGBRegressor(
            objective='reg:squarederror',
            n_estimators=self.n_estimators,
        )

    def train(self, train):
        """Fit on rows whose last column is the target, then raise the round count."""
        rows = asarray(train)
        features = rows[:, :-1]
        target = rows[:, -1]
        self.model.set_params(n_estimators=self.n_estimators)
        self.model.fit(features, target)
        self.n_estimators = self.n_estimators + 500

    def predict(self, test):
        """Return the model's prediction for a single feature vector."""
        single_row_batch = asarray([test])
        return self.model.predict(single_row_batch)[0]

In [None]:
class CNN():
    """1-D convolutional regressor exposing the same ``train``/``predict`` API.

    Architecture: Conv1D(64 filters, kernel 2, ReLU) -> MaxPooling1D(2) ->
    Flatten -> Dense(50, ReLU) -> Dense(n_out), compiled with Adam and MSE loss.
    No input shape is declared on the layers.
    NOTE(review): presumably Keras infers the input shape on the first ``fit``
    call -- confirm for the installed TensorFlow version.
    """

    def __init__(self):
        # Nominal input/output widths. 27 corresponds to the notebook's
        # 4-column frame windowed with n_in=6 (28 values per row minus target).
        self.n_in, self.n_out = 27, 1
        self.model = Sequential()
        self.model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
        self.model.add(MaxPooling1D(pool_size=2))
        self.model.add(Flatten())
        self.model.add(Dense(50, activation='relu'))
        self.model.add(Dense(self.n_out))
        self.model.compile(optimizer='adam', loss='mse')

    def train(self, train):
        """Fit for 500 epochs on rows whose last column is the target.

        Calling ``train`` again continues from the current weights, because
        the same Keras model object is reused.
        """
        train = asarray(train)
        X_train, y_train = train[:, :-1], train[:, -1]

        # Reshape to (samples, steps, 1) using the actual feature count. The
        # previous code computed this count but then reshaped with the
        # hard-coded `self.n_in`, which raised for any other window width and
        # disagreed with `predict`, which already uses the input's own length.
        n_features = X_train.shape[1]
        X_train = X_train.reshape(len(X_train), n_features, 1)

        self.model.fit(X_train, y_train, epochs=500, verbose=0)

    def predict(self, test):
        """Return the scalar prediction for one feature vector (list or array)."""
        # Accept plain sequences too, as the sibling model wrappers do.
        test = asarray(test)
        y_hat = self.model.predict(test.reshape(1, len(test), 1))
        return y_hat[0][0]

In [None]:
# One shared instance per model; get_models() hands out these same objects,
# so any state they accumulate during training persists between calls.
random_forest, xgboost, cnn = RandomForest(), Xgboost(), CNN()

In [None]:
def get_models():
    """Return ``(name, model)`` pairs for the three module-level model objects.

    A new list is built on every call, but it references the same
    ``random_forest``, ``xgboost`` and ``cnn`` instances each time.
    """
    return [
        ('random_forest', random_forest),
        ('xgboost', xgboost),
        ('cnn', cnn),
    ]

<h1 id="training" style="color:#a97828; background:#4dc5ea;"> 
    <center>Training
        <a class="anchor-link" href="#training" target="_self">¶</a>
    </center>
</h1>

In [None]:
def train_test_split(data, n_test):
    """Split a 2-D array into leading training rows and the last ``n_test`` rows.

    Args:
        data: 2-D array of rows in evaluation order.
        n_test: Number of trailing rows to hold out; must be non-negative.
            Values larger than ``len(data)`` put every row in the test part.

    Returns:
        ``(train, test)`` slices of ``data``.

    Raises:
        ValueError: If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError('n_test must be non-negative, got {}'.format(n_test))
    # Slice from an explicit boundary: the former `data[:-n_test]` form
    # returned an empty training set (and everything as test) for n_test == 0.
    boundary = max(len(data) - n_test, 0)
    return data[:boundary, :], data[boundary:, :]

def training(data, n_test, n_models):
    """Walk-forward evaluation of every model from ``get_models()``.

    The last ``n_test`` rows of ``data`` are held out. For each held-out row,
    every model is trained on the rows seen so far and asked to predict that
    row's last column; the row then joins the training history. Progress is
    printed on every even step.

    Args:
        data: 2-D array whose last column is the target.
        n_test: Number of trailing rows to evaluate on.
        n_models: Number of prediction lists to allocate; must cover the
            number of models returned by ``get_models()``.

    Returns:
        ``(errors, expected, preds)``: the per-model mean absolute errors, the
        held-out target values, and the per-model prediction lists.
    """
    train, test = train_test_split(data, n_test)
    preds = [[] for _ in range(n_models)]

    history = list(train)
    for step, row in enumerate(test):
        X_test, y_test = row[:-1], row[-1]
        for slot, (name, model) in enumerate(get_models()):
            model.train(history)
            y_hat = model.predict(X_test)
            preds[slot].append(y_hat)
            if step % 2 == 0:
                print('i:{:3d}, Model:{:13s}, Expected:{:.1f}, Predicted:{:.1f}'
                      .format(step, name, y_test, y_hat))
        # Only after all models have predicted does the true row become
        # available as training data for the next step.
        history.append(row)

    expected = test[:, -1]
    errors = [mean_absolute_error(expected, model_preds) for model_preds in preds]
    return errors, expected, preds

In [None]:
# Despite its name, `epochs` is passed as `n_test`: it is the number of
# trailing rows held out for walk-forward evaluation, not a training-epoch
# count. The 3 matches the number of models returned by get_models().
epochs = 20
mae, y, y_hat = training(data, n_test=epochs, n_models=3)

<h1 id="analysis" style="color:#a97828; background:#4dc5ea;"> 
    <center>Analysis
        <a class="anchor-link" href="#analysis" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Mean absolute error per model, in the order used by get_models().
print(
    'Random Forest MAE: %.3f' % mae[0],
    'XGBoost MAE: %.3f' % mae[1],
    'CNN MAE: %.3f' % mae[2],
    sep='\n',
)

In [None]:
# One panel per model: held-out targets against that model's predictions.
# The three copy-pasted panel blocks are folded into a single loop, each panel
# gets axis labels so it can be read on its own, and the layout is tightened
# so the added labels do not collide with neighbouring panels.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
panel_titles = ("Random Forest", "XGBoost", "CNN")
for ax, title, predicted in zip(axes, panel_titles, y_hat):
    ax.set_title(title)
    ax.plot(y, label='Expected')
    ax.plot(predicted, label='Predicted')
    ax.set_xlabel('Held-out step')
    ax.set_ylabel('Target (CBK)')
    ax.legend()
fig.tight_layout()

<h1 id="predict" style="color:#a97828; background:#4dc5ea;"> 
    <center>Predict
        <a class="anchor-link" href="#predict" target="_self">¶</a>
    </center>
</h1>

In [None]:
def predict(models, data, nr_valid):
    """Print thresholded predictions for the last ``nr_valid`` rows of ``data``.

    Each model's raw output is turned into a class label (1 when above 0.5,
    otherwise 0) and printed next to the row's expected value, which is taken
    from the last column. Nothing is returned.

    Args:
        models: Iterable of ``(name, model)`` pairs; each model needs a
            ``predict(features)`` method returning a scalar.
        data: 2-D array whose last column is the target.
        nr_valid: Number of trailing rows to report on.
    """
    _, holdout = train_test_split(data, nr_valid)
    for row_no, row in enumerate(holdout, start=1):
        features, expected = row[:-1], row[-1]
        for name, model in models:
            raw_score = model.predict(features)
            label = 1 if raw_score > 0.5 else 0
            print('{:1d}) Model Name:{:15s}, Predicted:{:1.3f} - Expected:{:1.3f}'
                     .format(row_no, name, label, expected))

In [None]:
# Take the first 12 rows of the preprocessed frame and window them with the
# same 6-step lag used for training; assuming no missing values, 12 rows with
# 6 lags leave 6 supervised samples.
# NOTE(review): these rows come from the same `df` the models were fitted on,
# so this looks like a smoke test rather than a true hold-out -- confirm.
df_valid = df.head(12).copy()
valid_data = series_to_supervised(df_valid.to_numpy(), n_in=6)
df_valid

In [None]:
# Reuse the shared model instances and report on the last 6 rows of the
# windowed validation slice.
models = get_models()
predict(models, valid_data, nr_valid=6)