# In [None]:
#MACHINE LEARNING - HANDLING IMBALANCED DATASETS
#source: https://www.youtube.com/watch?v=YMPMZmlH5Bo&list=PLZoTAELRMXVPkl7oRvzyNnyj1HS4wt2K-&index=50

import pandas as pd
import numpy as np
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
#pylab has no svm submodule, and OneClassSVM is already imported from
#sklearn.svm above; the name the next statement needs is rcParams.
from pylab import rcParams

#Default figure size (width, height) for the plots drawn by this script
rcParams['figure.figsize'] = 14, 8
#Seed constant (not referenced in the visible part of the script)
RANDOM_SEED = 42
#Tick labels for the class bar chart: position 0 -> 'Normal', 1 -> 'Fraud'
LABELS = ['Normal', 'Fraud']

#Load the transactions table from the CSV file
data = pd.read_csv('creditcard.csv', sep=',')

#Name of the column we are predicting
target = 'Class'
#Independent features: every column of the data except the target
columns = [name for name in data.columns.tolist() if name != target]

#Feature matrix and label vector
X = data[columns]
y = data[target]

#Seeded random state, so the draw below is reproducible
state = np.random.RandomState(42)
#Uniform random values with the same (rows, columns) shape as X
X_outliers = state.uniform(low=0, size=X.shape)

#Print shapes of X and y
for part in (X, y):
    print(part.shape)

#Exploratory data analysis
#Report whether any cell of the dataset is missing; the bare expression
#used before computed this value and then threw it away.
print(data.isnull().values.any())

#Check visually how the data is distributed
#Series.value_counts replaces the deprecated top-level pd.value_counts;
#sort=True orders the classes by descending frequency.
count_classes = data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title('Transaction Class Distribution')
#NOTE(review): the labels are applied by bar position, so this assumes
#class 0 is the more frequent class - confirm against the data.
plt.xticks(range(2), LABELS)
plt.xlabel('Class')
plt.ylabel('Frequency')

#Split the rows by label: Class == 1 is fraud, Class == 0 is "normal"
fraud = data.loc[data['Class'].eq(1)]
normal = data.loc[data['Class'].eq(0)]

#Show (rows, columns) of each subset
print(fraud.shape, normal.shape)

#Creating the under sampling data
from collections import Counter

from imblearn.under_sampling import NearMiss

#NearMiss no longer accepts a random_state argument in current
#imbalanced-learn releases, so it is built with defaults.
nm = NearMiss()
#fit_resample is the current name of the removed fit_sample method
X_res, y_res = nm.fit_resample(X, y)

print(X_res.shape, y_res.shape)

#Compare the class counts before and after resampling
print(f'Original dataset shape {Counter(y)}')
print(f'Resampled dataset shape {Counter(y_res)}')
