In [3]:
# Loading Libraries:
import pandas as pd
import numpy as np
import random
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Loading Data:
fraud_data = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Imbalanced_classes/master/fraud_data.csv")

In [5]:
# Getting Basic idea about the data:
fraud_data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2994681,0,242834,25.0,H,9803,583.0,150.0,visa,226.0,...,firefox 56.0,24.0,1920x1080,match_status:2,T,F,T,T,desktop,rv:56.0
1,3557242,0,15123000,117.0,W,7919,194.0,150.0,mastercard,166.0,...,,,,,,,,,,
2,3327470,0,8378575,73.773,C,12778,500.0,185.0,mastercard,224.0,...,,,,,,,,,,
3,3118781,0,2607840,400.0,R,12316,548.0,150.0,visa,195.0,...,mobile safari generic,32.0,1136x640,match_status:2,T,F,T,F,mobile,iOS Device
4,3459772,0,12226544,31.95,W,9002,453.0,150.0,visa,226.0,...,,,,,,,,,,


In [6]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59054 entries, 0 to 59053
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(385), int64(18), object(31)
memory usage: 195.5+ MB


In [7]:
# Taking a look at the target variable:
fraud_data.isFraud.value_counts()

isFraud
0    57049
1     2005
Name: count, dtype: int64

In [8]:
# 0 means not fraud and 1 means is Fraud:

In [9]:
fraud_data.isFraud.value_counts() / len(fraud_data) * 100  # Get the percentage of unique values in the variable 'isFraud'

isFraud
0    96.604802
1     3.395198
Name: count, dtype: float64

In [10]:
# There are only 3% of the data which are fraud and the rest 97% are not fraud. This is clearly a class imbalance problem. 
# In this notebook we will look to solve this type of problems

In [11]:
# Missing Values:
fraud_data.isnull().sum() / len(fraud_data) * 100    # get the percentage of missing values in each columns

TransactionID      0.000000
isFraud            0.000000
TransactionDT      0.000000
TransactionAmt     0.000000
ProductCD          0.000000
                    ...    
id_36             75.945745
id_37             75.945745
id_38             75.945745
DeviceType        75.979612
DeviceInfo        79.813391
Length: 434, dtype: float64

In [12]:
a = fraud_data.isnull().sum() >0  # TO get the Boolean results for the column with missing data:

In [13]:
a.value_counts()   # Get the count of column with missing values

True     414
False     20
Name: count, dtype: int64

In [14]:
# Out of 434 columns, 414 have some missing values:

In [None]:
# Dealing with Missing Values:
Filling the missing values with right technique can change our results drastically.
Also, there is no fixed rule of filling the missing values.
No method is perfect for filling the missing values. We need to use our common sense, our logic, or may need to 
see what works for that particular data set.
              
Ways of dealing with missing values:
Default value: One can fill the missing value by default value on the basis of one's 1) understanding of variable, 2)
context / data insight or 3) common sense / logic.

Deleting: Suppose in our dataset we have too many missing values in

Column, we can drop the column
Row, drop the row. Usually we do this for a large enough dataset.
                                     
Mean/Median/Mode - Imputation: We fill missing values by mean or median or mode(i.e. maximum occuring value). Generally
we use mean but if there are some outliers, we fill missing values with median.Mode is used to fill missing values for categorical column.

Data Cleaning in Python: the Ultimate Guide

Eliminate columns with more than 20% missing values. Again this is very subjective and solely depends on 
the nature of the dataset and underlying context. We cannot generalize this procedure to all the datasets.

In [15]:
fraud_data = fraud_data[fraud_data.columns[fraud_data.isnull().mean() < 0.2]]   # We will keep those columns which has missing values less than 20%

In [17]:
# Here we will fill missing values of numerical variables (or columns) with mean values:
num_cols = fraud_data.select_dtypes(include=np.number).columns   # Getting the all numerical columns

fraud_data[num_cols] = fraud_data[num_cols].fillna(fraud_data[num_cols].mean())   # Fill the missing values with mean

In [18]:
# Filling missing values of categorical variables with mode: Mode is maximum occuring element in a variable.
cat_cols = fraud_data.select_dtypes(include='object').columns  # Getting all the categorical columns

fraud_data[cat_cols] = fraud_data[cat_cols].fillna(fraud_data[cat_cols].mode).iloc[0]   # FIll the missing values with maximum 
#occuring element in the colunms

In [19]:
# Let's have a look if there still exist any missing values:
fraud_data.isnull().sum()/len(fraud_data) * 100

TransactionID     0.0
isFraud           0.0
TransactionDT     0.0
TransactionAmt    0.0
ProductCD         0.0
                 ... 
V317              0.0
V318              0.0
V319              0.0
V320              0.0
V321              0.0
Length: 182, dtype: float64

In [20]:
# Now We don't have any missing values in any columns

In [21]:
# One Hot Encoding: (Creating Dummies for categorical columns)
# In this strategy, each category value is converted into a new column and assigned a 1 or 0 (notation for true/false) value to the column.
# In Python there is a class 'OneHotEncoder' in 'sklearn.preprocessing' to do this task,
# but here we will use pandas function 'get_dummies()'. This get_dummies() does the same work as done by 'OneHotEncoder' form sklearn.preprocessing.

In [22]:
fraud_data = pd.get_dummies(fraud_data, columns=cat_cols)   # earlier we have collected all the categorical columns in cat_cols

In [23]:
fraud_data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,V316,V317,V318,V319,V320,V321,ProductCD_H,card4_visa,card6_credit,P_emaildomain_yahoo.com
0,2994681,0,242834,25.0,9803,583.0,150.0,226.0,269.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,True,True,True
1,3557242,0,15123000,117.0,7919,194.0,150.0,166.0,181.0,87.0,...,288.0,1707.0,1707.0,0.0,0.0,0.0,True,True,True,True
2,3327470,0,8378575,73.773,12778,500.0,185.0,224.0,284.0,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,True,True,True
3,3118781,0,2607840,400.0,12316,548.0,150.0,195.0,441.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,True,True,True
4,3459772,0,12226544,31.95,9002,453.0,150.0,226.0,264.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,True,True,True,True


In [24]:
# We have created a a lot of dummy variables are created like; P_emaildomain_hotmail.com, P_emaildomain_hotmail.de, etc.

In [25]:
# Separate Input features and output features
X = fraud_data.drop(columns = ['isFraud'])  # Input features
y = fraud_data.isFraud   # Output features

In [26]:
# Standardization/ Normalization:
#Performing standardization/normalization would bring all the variables in a dataset to a common scale so that it could
#further help in implementing various machine learning models 
#(where standardization/normalization is a pre-requisite to apply such models).

In [27]:
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data=scaled_features)
scaled_features.columns = X.columns

In [28]:
# Let' see how the data looks like after scaling
scaled_features.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,C1,...,V316,V317,V318,V319,V320,V321,ProductCD_H,card4_visa,card6_credit,P_emaildomain_yahoo.com
0,-1.688548,-1.544958,-0.468203,-0.02194,1.412632,-0.286861,0.653753,-0.225982,0.077832,-0.099186,...,-0.051649,-0.063047,-0.059636,-0.099385,-0.10873,-0.108863,0.0,0.0,0.0,0.0
1,1.615662,1.681426,-0.07354,-0.406928,-1.078794,-0.286861,-0.804662,-1.144356,0.077832,-0.099186,...,0.067405,0.347441,0.519006,-0.099385,-0.10873,-0.108863,0.0,0.0,0.0,0.0
2,0.266093,0.21907,-0.258976,0.585989,0.881042,2.788641,0.605139,-0.069441,-10.788933,-0.099186,...,-0.051649,-0.063047,-0.059636,-0.099385,-0.10873,-0.108863,0.0,0.0,0.0,0.0
3,-0.959645,-1.032167,1.140478,0.491581,1.188468,-0.286861,-0.099761,1.569022,0.077832,-0.099186,...,-0.051649,-0.063047,-0.059636,-0.099385,-0.10873,-0.108863,0.0,0.0,0.0,0.0
4,1.043171,1.053404,-0.438389,-0.185621,0.580022,-0.286861,0.653753,-0.278162,0.077832,-0.082944,...,-0.051649,-0.063047,-0.059636,-0.099385,-0.10873,-0.108863,0.0,0.0,0.0,0.0


In [29]:
# SPlitting the dataset into train and test set:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
# Dealing with Imbalanced Data:
# Most machine learning algorithms work best when the number of samples in each class are about equal. This is because most algorithms
# are designed to maximize accuracy and reduce error. 
#Again this can't be generalized and we must be very case specific depending on the nature of data and its underlying context.

In [31]:
# Defferent Techniques:
#1) Resampling Techniques - Oversampling Minority Class:
#Oversampling can be defined as adding more copies of the minority class. In other words, we are creating artificial/synthetic
#data of the minority class (or group).
#Oversampling could be a good choice when we don’t have a lot of data to work with.

In [32]:
# Resampling is located under sklearn.utils
from sklearn.utils import resample

In [34]:
# Concatenate training data back together:
train_data= pd.concat([X_train, y_train], axis=1)

In [36]:
# Separate minority and majority class
not_fraud = train_data[train_data.isFraud==0]
fraud=train_data[train_data.isFraud==1]

In [39]:
# Unsample minority; we are oversampling the minority class to match the number of majority class
fraud_unsampled = resample(fraud,
                           replace=True,  # Sample with replacement
                           n_samples = len(not_fraud),  # Match number in majority class
                           random_state=27)  

In [40]:
# Combine majority and unsampled minority:
unsampled = pd.concat([not_fraud, fraud_unsampled])

In [42]:
# Now let's check classes count:
unsampled.isFraud.value_counts()

isFraud
0    39942
1    39942
Name: count, dtype: int64

In [43]:
#We can notice here after resampling we have an equal ratio of data points for each class!

In [44]:
# Resampling Techniques - Undersample Majority Class:
# Undersampling can be defined as removing some observations of the majority class. Undersampling can be a good choice 
# when we have a ton of data -think millions of rows. But a drawback is that we are removing information that may be valuable.
# This could lead to underfitting and poor generalization to the test set.

In [45]:
# Again we are removing the observations of the majority class to match the number of minority class:
# Downsample majority:
not_fraud_downsampled = resample(not_fraud,
                                replace = False, 
                                n_samples = len(fraud),   # Match the majority n
                                random_state = 27)

In [46]:
# COmbine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, fraud])

In [47]:
# Let's Check the classes counts:
downsampled.isFraud.value_counts()

isFraud
0    1395
1    1395
Name: count, dtype: int64

In [48]:
# Again, we have an equal ratio of fraud to not fraud data points, but in this case a much smaller quantity of data to train the model on.

In [49]:
# Generate Synthetic Samples: Here we will use imblearn’s SMOTE or Synthetic Minority Oversampling Technique. SMOTE uses a nearest neighbors algorithm 
#to generate new and synthetic data we can use for training our model.

In [52]:
# Import SMOTE:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state= 25,sampling_strategy=1.0)   # again we are equalizing both the classes

In [54]:
# fit the sampling :
X_train, y_train = sm.fit_resample(X_train, y_train)

In [55]:
np.unique(y_train, return_counts=True)   ## Y_train is numpy array, so unique() functions returns the count of all the unique elements in the array

(array([0, 1], dtype=int64), array([39942, 39942], dtype=int64))

In [56]:
# The count of both classes are equal.

In [57]:
# Conclusion
# That's it for this notebook. We learned handling missing values, 
#one hot encoding, standardization / normalization, what is imbalanced class and three techniques to deal with imbalanced classes.