# Load libraries and dataset

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from imblearn.datasets import make_imbalance
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the processed dataset. Set the CAPSTONE2_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'CAPSTONE2_DATA_PATH',
    'C:/Users/Raymnd Diaz/Desktop/Capstone2/data/processed_data.csv',
)

# 'Unnamed: 0' is discarded right after loading -- presumably an index column
# written out when the CSV was saved (TODO confirm against the export step).
df = pd.read_csv(DATA_PATH).drop(columns=['Unnamed: 0'])

# Create Dummy Variables

In [3]:
# 'type' is categorical, so expand it into one indicator column per category
# and swap those in for the original column.
# Note: this cell reassigns `df`, so it cannot be re-run on its own -- the
# 'type' column no longer exists after the first run.
type_dummies = pd.get_dummies(df['type'])
df = pd.concat([df.drop(columns=['type']), type_dummies], axis=1)

In [4]:
# Preview the first rows to confirm the indicator columns replaced 'type'.
df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,1,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,1,0
2,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,1
3,181.0,181.0,0.0,21182.0,0.0,1,0,1,0,0,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,1,0


# Scaling the data

In [6]:
# Feature matrix: every column except the target.
X = df.drop(columns=['isFraud'])

# Target, kept as a single-column DataFrame (double brackets) rather than a Series.
y = df[['isFraud']]

In [7]:
# Remember the feature column labels (as a NumPy array); the scaler returns a
# bare array, so they are needed to rebuild a labelled DataFrame afterwards.
names = np.asarray(X.columns)

In [8]:
scaler = StandardScaler() # instantiate the standard scaler with its default settings

In [9]:
# Learn the scaling statistics from X and apply them to X. The result is a
# plain array, not a DataFrame.
# NOTE(review): the scaler is fitted on every row, including rows that later
# end up in the test split, so test-set statistics leak into the training
# features. Consider splitting first and fitting on the training rows only.
# NOTE(review): the 0/1 indicator columns are standardised too -- confirm that
# is intended.
scaled_df = scaler.fit(X).transform(X)

In [10]:
# Wrap the scaled array back into a DataFrame with the original feature names.
# Reusing X's index keeps the rows aligned with `y`: the later
# `scaled_df['isFraud'] = y` assignment matches rows by index label, so a
# fresh default index would silently misalign if X's index were not 0..n-1.
scaled_df = pd.DataFrame(scaled_df, columns=names, index=X.index)

In [11]:
# Preview the standardised features.
scaled_df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,-0.28156,-0.22981,-0.237622,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345
1,-0.294767,-0.281359,-0.285812,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345
2,-0.297555,-0.288654,-0.292442,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,-0.714778,3.307478
3,-0.297555,-0.288654,-0.292442,-0.317582,-0.333411,-0.530965,1.357803,-0.08096,-0.714778,-0.302345
4,-0.278532,-0.274329,-0.282221,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345


In [12]:
# Re-attach the (unscaled) target column; pandas aligns the values on index.
scaled_df['isFraud'] = y['isFraud']

In [13]:
# Preview again to confirm 'isFraud' now sits next to the scaled features.
scaled_df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,isFraud
0,-0.28156,-0.22981,-0.237622,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345,0
1,-0.294767,-0.281359,-0.285812,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345,0
2,-0.297555,-0.288654,-0.292442,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,-0.714778,3.307478,1
3,-0.297555,-0.288654,-0.292442,-0.317582,-0.333411,-0.530965,1.357803,-0.08096,-0.714778,-0.302345,1
4,-0.278532,-0.274329,-0.282221,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345,0


# Split data into training and testing subsets

In [14]:
# Inputs for the train/test split: all scaled columns except the target.
features = scaled_df.drop(columns=['isFraud'])

# Target for the train/test split, kept as a single-column DataFrame.
labels = scaled_df[['isFraud']]

In [15]:
# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# The split is stratified on the fraud label so that the training and test
# subsets keep the same class ratio. A plain random split can leave them with
# different ratios, which matters most when one class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    # Pass the 1-D column, not the one-column DataFrame: scikit-learn handles
    # 2-D stratify targets by building a string key per row, which is slow.
    stratify=labels['isFraud'],
)