In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Build this directory's full paths first, then echo them one per line.
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## This project focuses on correcting the class imbalance in the dataset

**Dataset information**

The dataset contains transactions made by credit cards in September 2013 by European cardholders. It covers transactions that occurred over two days, with 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

My aim is to balance the dataset so that any classification algorithm can better distinguish between fraudulent and normal transactions.



## 1. Importing Libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 2. Creating a DataFrame from the credit card dataset

In [None]:
# Read the raw credit card transactions CSV from the Kaggle input directory into a DataFrame.
csv_path = '../input/credit-card-fraud-detection/creditcard.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

## 3. Observing the imbalance in the dataset

**3.1 Checking the imbalance with respect to the target feature "Class"**

In [None]:
# Count how many rows fall into each value of the "Class" column (most frequent first).
classification = df['Class'].value_counts()
classification

This shows that we have very little data for fraud cases.

**3.2 Visualizing data imbalance using the bar chart**

In [None]:
# Bar chart of the per-class row counts held in `classification`.
LABELS = ('Normal', 'Fraud')  # tick text for class 0 and class 1, in that order

fig, ax = plt.subplots(figsize=(10, 5))
# Order the counts by class id (0, then 1) so each bar sits above its matching label,
# regardless of the order value_counts() returned them in.
classification.sort_index().plot(kind='bar', rot=0, ax=ax)
ax.set_xlabel('Class')
ax.set_ylabel('Number of observations')
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_title('Transaction class Distribution')
# show() has to be called; the bare name `plt.show` is only a function reference
# and just echoes its repr as the cell output.
plt.show()


## 4. Correcting the Imbalance in Dataset using Oversampling method

**4.1 Classifying Dependent and Independent variables**

In [None]:
# Separate the features from the target: X is every column except "Class", y is "Class" itself.
X = df.drop(columns='Class')
y = df['Class']

In [None]:
# Per-class counts of the target before any resampling (largest class first).
y.value_counts(sort=True, ascending=False)

**4.2 Importing SMOTETomek and fitting it on X and y for oversampling**

In [None]:
from imblearn.combine import SMOTETomek

# SMOTE generates synthetic samples at random, so fix the seed to make the
# resampled dataset identical on every "Restart & Run All".
smk = SMOTETomek(random_state=42)

In [None]:
# Resample X and y with the SMOTETomek sampler.
# `fit_sample` was removed from imbalanced-learn in 0.8; `fit_resample` is its replacement.
X_res, y_res = smk.fit_resample(X, y)

**4.3 Observing the result of oversampling**

In [None]:
# Per-class counts of the target after SMOTETomek resampling.
oversampling = y_res.value_counts(sort=True)
oversampling

We can see that the number of data points for the fraud transactions (Class == 1) has increased.

**4.4 Visualizing the result of oversampling**

In [None]:
# Bar chart of the per-class row counts after oversampling (`oversampling`).
LABELS = ('Normal', 'Fraud')  # tick text for class 0 and class 1, in that order

fig, ax = plt.subplots(figsize=(10, 5))
# After resampling the two classes can have equal counts, so the order returned by
# value_counts() is not guaranteed. Sort by class id (0, then 1) so each bar
# sits above its matching label.
oversampling.sort_index().plot(kind='bar', rot=0, ax=ax)
ax.set_xlabel('Class')
ax.set_ylabel('Number of observations')
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_title('Oversampled Transaction class Distribution')
# show() has to be called; the bare name `plt.show` is only a function reference
# and just echoes its repr as the cell output.
plt.show()

## 5. Correcting the Imbalance in Dataset using Undersampling method

**5.1 Importing NearMiss and fitting it on X and y for undersampling**

In [None]:
from imblearn.under_sampling import NearMiss

# NearMiss undersampler with its default heuristic (version 1) spelled out.
nm = NearMiss(version=1)

In [None]:
# Undersample X and y with the NearMiss sampler.
# `fit_sample` was removed from imbalanced-learn in 0.8; `fit_resample` is its replacement.
X_ndr, y_ndr = nm.fit_resample(X, y)

**5.2 Observing the result of undersampling**

In [None]:
# Per-class counts of the target after NearMiss undersampling.
undersampling = y_ndr.value_counts(sort=True)
undersampling

We can see that the number of data points for the normal transactions (Class == 0) has decreased.

**5.3 Visualizing the result of undersampling**

In [None]:
# Bar chart of the per-class row counts after undersampling (`undersampling`).
LABELS = ('Normal', 'Fraud')  # tick text for class 0 and class 1, in that order

fig, ax = plt.subplots(figsize=(10, 5))
# After resampling the two classes can have equal counts, so the order returned by
# value_counts() is not guaranteed. Sort by class id (0, then 1) so each bar
# sits above its matching label.
undersampling.sort_index().plot(kind='bar', rot=0, ax=ax)
ax.set_xlabel('Class')
ax.set_ylabel('Number of observations')
ax.set_xticks(range(len(LABELS)))
ax.set_xticklabels(LABELS)
ax.set_title('Undersampled Transaction class Distribution')
# show() has to be called; the bare name `plt.show` is only a function reference
# and just echoes its repr as the cell output.
plt.show()