# ANZ Synthesised Transaction Dataset

# (A) Import necessary library & dataset

In [None]:
#Data Manipulation

import pandas as pd

#Mathematics

import numpy as np
import math

#Data Visualization

import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the ANZ transaction CSV from the input folder and preview the first rows.
csv_path = '../input/anz-synthesised-transaction-dataset/anz.csv'
df = pd.read_csv(csv_path)
df.head(3)

# (B) Data Audit & Data Cleaning

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Column labels of the loaded dataset.
df.columns

Here we have more than 12,000 rows and 23 columns. The columns are:


* status
* card_present_flag
* bpay_biller_code
* account
* currency
* long_lat
* txn_description
* merchant_id
* merchant_code
* first_name
* balance
* date
* gender
* age
* merchant_suburb
* merchant_state
* extraction
* amount
* transaction_id
* country
* customer_id
* merchant_long_lat
* movement

In [None]:
# Summary statistics for the columns that describe() includes by default.
df.describe()
# observation: 4 columns/attributes hold numeric values; describe() reports
# their central tendency, spread and five-number summary.

In [None]:
# Average transaction amount computed by hand:
# total of the amounts divided by the number of non-null amounts.
amount_count = df['amount'].count()
amount_total = df['amount'].sum()

transaction_amount_avg = round(amount_total / amount_count, 2)
transaction_amount_avg

In [None]:
# Count the distinct values in each attribute (column).
df.nunique()

In [None]:
# Count the missing values in each column of the dataset.
df.isnull().sum()

# observation: 4 attributes have the same count of missing values (4326),
# so the missing values could all be on the same rows

In [None]:
# Percentage of missing values per column, rounded to two decimals.
missing_per_column = df.isnull().sum()
(missing_per_column / len(df) * 100).round(2)

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Frequency of each distinct value in the movement column.
df['movement'].value_counts()

In [None]:
# Peek at the first distinct value of the date column to see its format.
df['date'].unique()[0]

In [None]:
# Convert the date column to datetime, then re-inspect the first distinct value.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['date'].unique()[0]

In [None]:
# Peek at the first distinct value of the extraction column to see its format.
df['extraction'].unique()[0]

In [None]:
# Convert the extraction column to datetime, then re-inspect the first distinct value.
parsed_extraction = pd.to_datetime(df['extraction'])
df['extraction'] = parsed_extraction
df['extraction'].unique()[0]

In [None]:
# Derive three new columns (day name, month name, year) from the transaction date
# and show the labels of the three columns just appended.
date_parts = df['date'].dt
df['day'] = date_parts.day_name()
df['month'] = date_parts.month_name()
df['year'] = date_parts.year
df.columns[-3:]

In [None]:
# Tally the transactions per country.
df['country'].value_counts()

# observation: all the transactions happened in Australia

In [None]:
# check the transaction year
df.year.value_counts()

# observation: all the transactions happened in 2018

In [None]:
# Drop the attributes that are not needed for the analysis.
unneeded_columns = ['merchant_code', 'country', 'currency', 'year']
df = df.drop(columns=unneeded_columns)

# final dataset
df.head(1)

# (C) Exploratory Data Analysis

## 1. Transaction made by Gender

In [None]:
# Bar chart of the number of transactions recorded for each gender.
gender_ax = sns.countplot(x='gender', data=df)
gender_ax.set_title('Transaction by Gender')
plt.show()

# observation: the count for males is higher than for females

## 2. Transaction made by Status

In [None]:
# Switch seaborn to the white-grid style before drawing.
sns.set(style="whitegrid")

# Bar chart of the number of transactions per status.
status_ax = sns.countplot(x='status', data=df)
status_ax.set_title('Transaction Count by Transaction Status')
plt.show()

# observation: there are more authorized transactions than posted ones

## 3. Transaction made by Transaction Mode

In [None]:
# Bar chart of the number of transactions per transaction mode (txn_description).
mode_ax = sns.countplot(x='txn_description', data=df)
mode_ax.set_title('Transaction Count by Transaction Mode')
mode_ax.set_xlabel('Transaction Mode')
# The mode labels are long, so turn them vertical.
plt.xticks(rotation=90)
plt.show()

# observation: the most used transaction mode is SALES-POS

## 4. Transaction by States

In [None]:
# Bar chart of the number of transactions per merchant state.
state_ax = sns.countplot(x='merchant_state', data=df)
state_ax.set_title('Transaction count by Merchants State')
plt.show()

# observation: New South Wales (NSW) and Victoria (VIC) have the highest number of transactions

## 5. Transaction by Month & Day

In [None]:
# Two side-by-side count plots: transactions per month (left)
# and per day of the week (right).
plt.figure(figsize=[15, 5])

panels = [
    ('month', 'Transactions count by month'),
    ('day', 'Transactions count by day of week'),
]
for position, (column, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.countplot(x=column, data=df)
    plt.title(panel_title)

# The day-of-week panel is the current axes here; turn its labels vertical.
plt.xticks(rotation=90)
plt.show()

# observation: the number of transactions is almost the same across the months;
# also, Friday and Wednesday have the most transactions

## 6. Relationship between card present flag and balance

In [None]:
# List the distinct values stored in the card_present_flag column.
print(df['card_present_flag'].unique())

In [None]:
# Scatter of balance against the card-present flag, coloured by gender.
sns.relplot(
    x='card_present_flag',
    y='balance',
    hue='gender',
    data=df,
)

## 7. Data numeric variables histogram

In [None]:
# Draw a 12x12-inch grid of histograms, one per column that DataFrame.hist can plot.
df.hist(figsize = (12,12))
plt.show()

## 8. Distribution Plot & Boxplot of Transaction Balance 

In [None]:
# Distribution (left) and boxplot (right) of the balance, side by side.
plt.figure(figsize=[15, 5])

plt.subplot(1,2,1)
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement and draws the same kind of figure.
sns.histplot(df['balance'], kde=True, stat='density')
# Observation: The distribution plot of balance is skewed to the right.

plt.subplot(1,2,2)
# Pass the values through the x keyword: a bare positional argument is read
# differently across seaborn versions, while x= always gives a horizontal box.
sns.boxplot(x=df['balance'])
# Observation: So many outliers for the balance attribute.
plt.show()

## 9. Distribution Plot & Boxplot of Transaction Amount 

In [None]:
# Distribution (left) and boxplot (right) of the transaction amount, side by side.
plt.figure(figsize=[15, 5])

plt.subplot(1,2,1)
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement and draws the same kind of figure.
sns.histplot(df['amount'], kde=True, stat='density')

plt.subplot(1,2,2)
# Pass the values through the x keyword: a bare positional argument is read
# differently across seaborn versions, while x= always gives a horizontal box.
sns.boxplot(x=df['amount'])
plt.show()

# Observation: The outliers affect the mean, range and standard deviation of the
# transaction amount. The distribution plot looks long-tailed due to the outliers.

In [None]:
# Summary statistics of the transaction amount as a two-column table
# ('index' = statistic name, 'amount' = value).
# Only the amount column is described: when describe() runs on the whole frame,
# the row order depends on which other columns get summarised (recent pandas
# versions include datetime columns, which moves 'std' to the end), so row
# positions in the result would not be stable.
df2 = df[['amount']].describe().reset_index()
df2

In [None]:
# Upper outlier fence for the transaction amount: Q3 + 1.5 * IQR.
# The quartiles are read straight from the column instead of by row position
# in a describe() table, because that table's row order is not fixed (it
# changes when datetime columns are summarised alongside the numeric ones).
q1 = df['amount'].quantile(0.25)
q3 = df['amount'].quantile(0.75)
iqr = q3 - q1
higher_outlier = q3 + (1.5 * iqr)
higher_outlier

In [None]:
# Histogram of the transaction amount with the x-axis restricted to [-10, 150]
# (chosen after computing the upper outlier fence), using 1-unit-wide bins.
x_low, x_high = -10, 150
unit_bins = np.arange(x_low, x_high + 1, 1)

plt.figure(figsize=[10, 6])
plt.hist(data=df, x='amount', bins=unit_bins)
plt.xlim(x_low, x_high)
plt.show()

# observation: the distribution is clearly visible once the axis is limited;
# the same strategy should be applied to the balance for better insight

## 10. Distribution Plot & Boxplot of Age 

In [None]:
# Distribution (left) and boxplot (right) of the customer age, side by side.
plt.figure(figsize=[15, 5])

plt.subplot(1,2,1)
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement and draws the same kind of figure.
sns.histplot(df['age'], kde=True, stat='density')

plt.subplot(1,2,2)
# Pass the values through the x keyword: a bare positional argument is read
# differently across seaborn versions, while x= always gives a horizontal box.
sns.boxplot(x=df['age'])
plt.show()

## 11. Transaction Amount by Gender

In [None]:
# Mean transaction amount per gender, as a two-column table (gender, amount).
# The amount column is selected BEFORE aggregating: calling mean() on every
# column raises a TypeError for the text columns in recent pandas versions
# and wastes work on columns that are discarded anyway.
gender_amt = df.groupby('gender')[['amount']].mean().reset_index()
gender_amt

In [None]:
# Bar chart of the mean transaction amount per gender (all amounts included).
sns.catplot(x='gender', y='amount', kind='bar', data=gender_amt)
# this bar chart is not informative, because it includes the outliers
# that distort the average of the transaction amount

In [None]:
# The upper outlier fence for the transaction amount is 110.1375;
# redraw the per-gender average using only amounts at or below that fence.
within_fence = df.query('amount <= 110.1375')
sns.catplot(x='gender', y='amount', kind='bar', data=within_fence)
plt.title('Average Transaction Amount per Gender')
plt.show()

In [None]:
# Categorical scatterplot of the individual transaction amounts per gender,
# restricted to amounts at or below 110.1375.
within_fence = df.query('amount <= 110.1375')
sns.catplot(x='gender', y='amount', data=within_fence)
plt.title('Transaction Amount by Gender')
plt.show()

## 12. Transaction amount by Month 

In [None]:
# Mean transaction amount per month, outliers excluded.
# The cut-off is 110.1375, the same upper outlier fence the other cells use
# (it was 110 here, so this table and the plots used different subsets).
amount = df[df['amount'] <= 110.1375]
# Select the amount column BEFORE aggregating: mean() over every column raises
# a TypeError for the text columns in recent pandas versions.
month = amount.groupby('month')['amount'].mean().reset_index()
month

In [None]:
# Three views of the transaction amount by month:
# 1) bar chart of the monthly averages (the title is attached to this figure),
sns.catplot(x='month', y='amount', kind='bar', data=month)
plt.title('Average Transaction Amount per Month')
# 2) point plot of the same monthly averages,
sns.catplot(x='month', y='amount', kind='point', data=month)
# 3) categorical scatter of the individual amounts at or below 110.1375.
within_fence = df.query('amount <= 110.1375')
sns.catplot(x='month', y='amount', data=within_fence)
plt.show()

# observation: September has the highest transaction amount

In [None]:
# Point plot of the amount per month, split by gender,
# restricted to amounts at or below 110.1375.
within_fence = df.query('amount <= 110.1375')
sns.catplot(x='month', y='amount', hue='gender', kind='point', data=within_fence)
# observation: males have higher transaction amounts;
# September is the month with the highest transaction amounts for both genders

In [None]:
## to be continued