## ANZ Synthesised Transaction Dataset

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    found_paths = [os.path.join(folder, name) for name in names]
    if found_paths:
        print('\n'.join(found_paths))

### Importing libraries

In [None]:
import numpy as np
import pandas as pd 
import plotly.express as px
import warnings
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
%matplotlib inline
pio.templates.default = "plotly_dark"
warnings.filterwarnings('ignore')

# ANZ Synthesised Transaction Dataset

In [None]:
# Read the ANZ transaction CSV from the Kaggle input folder and preview the first rows.
DATA_PATH = '../input/anz-synthesised-transaction-dataset/anz.csv'
df = pd.read_csv(DATA_PATH)
df.head()

### Let's check the data

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Column names, dtypes and non-null counts of the raw data
df.info()

In [None]:
# Number of missing values per column.
# 'bpay_biller_code' and 'merchant_code' are mostly null, so they can safely be dropped.
df.isnull().sum()

In [None]:
# Frequency of each country value.
# The data covers a single country, so this column carries no information and can be dropped.
df.country.value_counts()

In [None]:
# Frequency of each currency value.
# The data uses a single currency, so this column can be dropped as well.
df.currency.value_counts()

In [None]:
# Drop the mostly-null 'bpay_biller_code' / 'merchant_code' columns and the
# single-valued 'currency' / 'country' columns.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second run no longer raises KeyError for already-removed columns.
df = df.drop(columns=['bpay_biller_code', 'merchant_code', 'currency', 'country'], errors='ignore')

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum() # Original author's finding: no duplicates

In [None]:
# Bucket customers by age so the data can be analysed per age group.
# pd.cut builds right-closed intervals by default, so an age of exactly 20 lands
# in '<20', 30 in '20-30', and so on.
age_edges = [0, 20, 30, 40, 50, 60, 99999]
age_labels = ['<20', '20-30', '30-40', '40-50', '50-60', '>60']
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)

In [None]:
# Parse 'extraction' and 'date' as datetimes; values that cannot be parsed become NaT.
# Each column is replaced outright rather than written through df.loc[:, cols]:
# a .loc assignment writes into the existing object-dtype columns and, on
# pandas >= 2.0, leaves their dtype as object, which breaks the .dt accessors
# used when the date helper columns are created.
for col in ['extraction', 'date']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [None]:
# Re-inspect dtypes to confirm that 'extraction' and 'date' are now datetime columns.
df.info()

In [None]:
# Derive calendar helper columns: month and weekday names come from 'date',
# the hour of day comes from the 'extraction' timestamp.
txn_date = df['date'].dt
df['month'] = txn_date.month_name()
df['day'] = txn_date.day_name()
df['hour'] = df['extraction'].dt.hour
df.head()

In [None]:
# Cast card_present_flag to pandas' nullable integer dtype ('Int64').
df['card_present_flag'] = df['card_present_flag'].astype('Int64')
df.head()

### Analyze Categorical variables

In [None]:
cols = ['card_present_flag', 'status', 'txn_description' , 'movement' , 'gender', 'merchant_state']
grid_width = 2  # subplots per row

# 3x2 grid with one bar chart per categorical column, titled with the column name
fig = make_subplots(
    rows=3,
    cols=grid_width,
    subplot_titles=tuple(cols),
    horizontal_spacing=0.2,
    vertical_spacing=0.2,
)

# Fill the grid row by row: bar height is the frequency, bar label the rounded percentage
for position, column in enumerate(cols):
    frequency = df[column].value_counts()
    percentages = (df[column].value_counts(normalize=True) * 100).apply(lambda x: round(x)).tolist()
    fig.add_trace(
        go.Bar(
            x=frequency.index,
            y=frequency,
            name=column,
            textposition='auto',
            text=[str(pct) + '%' for pct in percentages],
        ),
        row=position // grid_width + 1,
        col=position % grid_width + 1,
    )

fig.update_layout(
    title=dict(text="Analyze Categorical variables (Frequency / Percentage)", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
    height=920,
    margin=dict(l=80, r=80, t=150, b=80),
)
fig.show()

__Observations__

- Most of the transactions (__80.26%__) have been done via cards (credit / Debit Card).
- Almost __64.08%__ transactions were authorized and rest were posted.
- __92.67%__ transactions are of type debit. Rest transactions are credit. 
- It looks like the majority of the transactions use the "SALES-POS" & "POS" transaction modes.
- __Males__ tend to do more transactions as compared to __females__.
- __NSW , VIC , QLD__ are most busy merchant states.
- __ACT & TAS__ are least busy.

In [None]:
# Total transaction amount per transaction description, rounded to whole units.
# 'amount' is selected BEFORE aggregating: summing the whole frame does needless
# work on unrelated columns and raises on pandas >= 2.0, where non-numeric
# (e.g. datetime) columns are no longer silently dropped from the reduction.
df0_grp = df.groupby(by='txn_description')[['amount']].sum().reset_index()
df0_grp['amount'] = df0_grp['amount'].apply(round)

fig = px.treemap(
    df0_grp,
    path=['txn_description'],
    values='amount',
    color='amount',
)
fig.update_layout(
    title=dict(text="Total Amount  by Transaction Description", x=0.5, y=0.95),
    margin=dict(l=10, r=10, t=70, b=10),
)
# Show both the category name and its total inside each tile
fig.data[0].textinfo = 'label+value'
# Unset the trace's colour-axis reference (presumably to hide the continuous colour bar)
fig.update_traces(marker_coloraxis=None)
fig.show()

__Insights :__

- __Pay/Salary__ is the major contributor of __bank txn amount__ which is expected as salary transaction amount is usually very high as compared to normal __debit__ transactions.

In [None]:
# Total transaction amount per merchant suburb.
# 'amount' is selected BEFORE aggregating: summing the whole frame does needless
# work on unrelated columns and raises on pandas >= 2.0, where non-numeric
# (e.g. datetime) columns are no longer silently dropped from the reduction.
df_grp0 = df.groupby(by='merchant_suburb')[['amount']].sum().reset_index()

fig = px.treemap(
    df_grp0,
    path=['merchant_suburb'],
    values='amount',
    color='amount',
)
fig.update_layout(
    title=dict(text="Total Txn Amount by Suburb", x=0.5, y=0.95),
    margin=dict(l=10, r=10, t=50, b=10),
    showlegend=False,
)
# Show both the suburb name and its total inside each tile
fig.data[0].textinfo = 'label+value'
# Unset the trace's colour-axis reference (presumably to hide the continuous colour bar)
fig.update_traces(marker_coloraxis=None)
fig.show()

__Observation:__

- __Sydney, Melbourne, South Brisbane, Mascot and Mount Gambier__ are the leading contributors to transaction amount.

### Analyzing Debit Transactions

In [None]:
# Subset of rows whose movement is 'debit'
df1 = df.loc[df['movement'].eq('debit')]
df1.shape  # 11160 debit transactions

In [None]:
cols = ['card_present_flag', 'status', 'txn_description' , 'movement' , 'gender', 'merchant_state']
grid_width = 2  # subplots per row

# 3x2 grid with one bar chart per categorical column, titled with the column name
fig = make_subplots(
    rows=3,
    cols=grid_width,
    subplot_titles=tuple(cols),
    horizontal_spacing=0.2,
    vertical_spacing=0.2,
)

# Fill the grid row by row. The per-category totals are computed ONCE per column:
# the previous version re-ran the groupby for x, y and text, and once more for
# every bar inside the label comprehension. 'amount' is selected before summing
# so non-numeric columns are never aggregated (whole-frame sum raises on pandas >= 2.0).
for position, column in enumerate(cols):
    totals = df1.groupby(by=column)['amount'].sum()
    grand_total = sum(totals.values)
    fig.add_trace(
        go.Bar(
            x=totals.index,
            y=totals.values.round(2),
            name=column,
            textposition='auto',
            # Share of the overall debit amount, as a rounded percentage
            text=[str(round((value / grand_total) * 100)) + '%' for value in totals.values],
        ),
        row=position // grid_width + 1,
        col=position % grid_width + 1,
    )

fig.update_layout(
    title=dict(text="Analyze Categorical variables (Total Txn Amount/Percentage)", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
    height=920,
    margin=dict(l=80, r=80, t=150, b=80),
)
fig.show()

__Observation:__   
- Around __80%__ of the amount was transacted via cards.
-   The __Payment__ mode of transaction contributes most to the __txn amount__.
-   The __NSW & VIC__ merchant states contributed more than half of the overall transaction amount.

In [None]:
# Debit amount per (merchant_state, gender).
# 'amount' is selected before summing so unrelated/non-numeric columns are never
# aggregated (whole-frame sum raises on pandas >= 2.0 because of datetime columns).
df_grp = df1.groupby(by=['merchant_state', 'gender'])[['amount']].sum().reset_index()

# States ordered by total debit amount, largest first; used as the x-axis order
order = df1.groupby(by='merchant_state')['amount'].sum().sort_values(ascending=False).index
df_grp['merchant_state'] = pd.Categorical(df_grp['merchant_state'], order)
# Re-grouping on the categorical key sorts the rows by that category order
df_grp = df_grp.groupby(by=['merchant_state', 'gender']).sum().reset_index()

fig = px.bar(
    data_frame=df_grp,
    x='merchant_state',
    y='amount',
    color='gender',
    barmode='group',
    # Bar labels in thousands, e.g. '12.34k'
    text=df_grp['amount'].apply(lambda x: str(round(x / 1000, 2)) + 'k'),
)
fig.update_traces(textposition='outside')
fig.update_xaxes(title='Merchant State')
fig.update_yaxes(title='Transaction Amount')
fig.update_layout(
    title=dict(text="Transaction Amount in Merchant State by Gender", x=0.5, y=0.95),
    title_font_size=20,
)
fig.show()

__Insights__ : Overall males carry out more transactions as compared to females but in three states __(QLD,WA,SA)__ females are leading. 

In [None]:
# --- Debit transaction COUNT per weekday (bars ordered by count, highest first) ---
# value_counts is computed once and reused. No data_frame is passed because every
# argument is an explicit list built from df1; the previous data_frame=df was
# unused and misleading.
day_counts = df1['day'].value_counts()
fig = px.bar(
    x=day_counts.index.tolist(),
    y=day_counts.tolist(),
    color=day_counts.tolist(),
    text=day_counts.tolist(),
)
fig.update_traces(textposition='outside', marker_coloraxis=None)
fig.update_xaxes(title='Day')
fig.update_yaxes(title='Transaction count')
fig.update_layout(
    title=dict(text="Transaction flow by each day", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
    height=450,
)
fig.show()


# --- Debit transaction AMOUNT per weekday (bars ordered by amount, highest first) ---
# 'amount' is selected before summing so unrelated/non-numeric columns are never
# aggregated (whole-frame sum raises on pandas >= 2.0 because of datetime columns).
day_amount = df1.groupby(by='day')[['amount']].sum().sort_values('amount', ascending=False)
fig1 = px.bar(
    data_frame=day_amount,
    # Bar labels in thousands, e.g. '12.34k'
    text=day_amount['amount'].apply(lambda x: str(round(x / 1000, 2)) + 'k'),
)
fig1.update_traces(textposition='outside')
fig1.update_xaxes(title='Day')
fig1.update_yaxes(title='Transaction Amount')
fig1.update_layout(
    title=dict(text="Transaction amount by each day", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
    height=450,
)
fig1.show()

__Observation:__
- The transaction count is lower at the start of the week but starts to pick up from Wednesday through Saturday.
- Even though the transaction count is comparatively low on __Saturday__, it still ranks __2nd__ in terms of transaction amount, which indicates bigger transactions on __Saturday__.

In [None]:
# Debit amount per (weekday, gender).
# 'amount' is selected before summing so unrelated/non-numeric columns are never
# aggregated (whole-frame sum raises on pandas >= 2.0 because of datetime columns).
df1_grp = df1.groupby(by=['day', 'gender'])[['amount']].sum().reset_index()

# Put weekdays in calendar order; re-grouping on the categorical key sorts the rows accordingly
order = ['Monday','Tuesday', 'Wednesday','Thursday','Friday','Saturday','Sunday']
df1_grp['day'] = pd.Categorical(df1_grp['day'], order)
df1_grp = df1_grp.groupby(by=['day', 'gender']).sum().reset_index()

fig = px.bar(
    data_frame=df1_grp,
    x='day',
    y='amount',
    color='gender',
    barmode='group',
    # Bar labels in thousands, e.g. '12.34k'
    text=df1_grp['amount'].apply(lambda x: str(round(x / 1000, 2)) + 'k'),
)
fig.update_traces(textposition='outside')
fig.update_xaxes(title='Day')
fig.update_yaxes(title='Transaction Amount')
fig.update_layout(
    title=dict(text="Transaction Amount per day by Gender", x=0.5, y=0.95),
    title_font_size=20,
)
fig.show()

__Insights__

- Males spent most on __Saturday__ which may be due to dinner date,family dinner or some other weekend plans.
- Females are spending most on __Wednesday & Friday__.

In [None]:
# Transaction count per month (all movements); bars follow value_counts order.
month_counts = df['month'].value_counts()
month_names = month_counts.index.tolist()
month_totals = month_counts.tolist()

fig = px.bar(
    data_frame=df,
    x=month_names,
    y=month_totals,
    color=month_totals,
    text=month_totals,
)
fig.update_traces(textposition='outside')
fig.update_xaxes(title='Month')
fig.update_yaxes(title='Transaction Count')
fig.update_layout(
    title=dict(text="Transaction flow by each month", x=0.5, y=0.95),
    title_font_size=20,
    width=700,
    height=450,
)
fig.show()

- __Insights__ : As per the above bar graph there is a steady increase in the number of transaction by each passing Month which is a good sign

In [None]:
# Ten customers with the highest total transaction amount.
# Computed once and reused for bar height, colour and label (previously the same
# groupby ran three times). 'amount' is selected before summing so unrelated/
# non-numeric columns are never aggregated (whole-frame sum raises on pandas >= 2.0).
top_customers = (
    df.groupby(by='customer_id')['amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
fig = px.bar(
    top_customers,
    color=top_customers,
    text=top_customers.round(),
)
fig.update_traces(textposition='outside', marker_coloraxis=None)
fig.update_xaxes(title='Customer ID')
fig.update_yaxes(title='Transaction Amount')
fig.update_layout(
    title=dict(text="Top 10 customers by Transaction Amount", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
    height=500,
)
fig.show()

In [None]:
# Number of debit transactions per age group.
age_counts = df1['age_group'].value_counts()
fig = px.bar(
    age_counts,
    color=age_counts,
    text=age_counts.tolist(),
)
fig.update_traces(textposition='outside', marker_coloraxis=None)
fig.update_xaxes(title='Age Group')
fig.update_yaxes(title='Transaction Count')
fig.update_layout(
    title=dict(text="Transactions by Age Group", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
    height=450,
)
fig.show()

__Observation__
- Most transactions have been carried out by Age Groups - __"20-30" & "30-40".__
- Company should think of providing some attractive offers for __"50-60" & ">60"__ age groups considering the transaction volume of these groups.

In [None]:
# Debit amount per (age_group, gender).
# 'amount' is selected before summing so unrelated/non-numeric columns are never
# aggregated (whole-frame sum raises on pandas >= 2.0 because of datetime columns).
df2_grp = df1.groupby(by=['age_group', 'gender'])['amount'].sum().reset_index()

fig = px.bar(
    data_frame=df2_grp,
    x='age_group',
    y='amount',
    color='gender',
    barmode='group',
    # Bar labels in thousands, e.g. '12.34k'
    text=df2_grp['amount'].apply(lambda x: str(round(x / 1000, 2)) + 'k'),
)
fig.update_traces(textposition='outside')
fig.update_layout(
    title=dict(text="Transaction Amount by Age Group & Gender", x=0.5, y=0.95),
    title_font_size=20,
)
fig.show()

__Observation:__

- Males in the age group of __20-30__ are contributing most to the Total Txn amount.
- In Age group __'<20'__, Females are ahead of males in terms of Total txn amount

In [None]:
# Per-date mean debit amount and number of debit transactions, indexed by date.
# A single named aggregation replaces two whole-frame reductions (mean and count)
# followed by a merge; the whole-frame mean raises on pandas >= 2.0 because of the
# non-numeric columns. Dict unpacking is used because one output name contains a space.
df3_grp = df1.groupby(by='date').agg(**{
    'Amount': ('amount', 'mean'),
    'Transaction Count': ('transaction_id', 'count'),
})

# Wide-form line chart: one line per column, date on the x-axis
fig = px.line(df3_grp)
fig.update_xaxes(title='Date')
fig.update_layout(
    title=dict(text="Average Amount VS Txn Count over time", x=0.5, y=0.95),
    title_font_size=20,
)
fig.show()

__Observation:__
- The average transaction amount on __7th August & Oct 21st__  was very high approx __100 AUD__.

- Large number of transactions took place on __17th August & 28th September.__

In [None]:
# Total debit amount per date.
# 'amount' is selected before summing so unrelated/non-numeric columns are never
# aggregated (whole-frame sum raises on pandas >= 2.0 because of datetime columns).
daily_amount = df1.groupby(by='date')[['amount']].sum()

fig = px.line(daily_amount)
fig.update_traces(line=dict(color="#8cba51", width=3.5))
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Transaction Amount')
fig.update_layout(
    title=dict(text="Total Txn Amount over time", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
)
fig.show()

__Insights__: Total transaction amount almost touched __14k AUD__ on __21st Oct__. It looks like some big transactions were made that day, as the transaction count on 21st Oct is not that high.

In [None]:
# Total debit amount per hour of day.
# Computed once and reused for the line and its point labels. 'amount' is selected
# before summing so unrelated/non-numeric columns are never aggregated
# (whole-frame sum raises on pandas >= 2.0 because of datetime columns).
hourly_amount = df1.groupby(by='hour')[['amount']].sum()

fig = px.line(
    hourly_amount,
    # Point labels in whole thousands, e.g. '47k'
    text=hourly_amount['amount'].apply(lambda x: str(round(x / 1000)) + 'k').values,
)
fig.update_traces(line=dict(color="#f58634", width=5))
fig.update_xaxes(title='Hour')
fig.update_yaxes(title='Transaction Amount')
fig.update_layout(
    title=dict(text="Total Txn Amount hourly", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
)
fig.update_traces(textposition='middle right', fillcolor='red')
fig.show()

__Insights__: 

- Total transaction amount generated at __9:00 AM - 9:59 AM__ is approx __47k__ which is highest throughout the day.
- Between __12:00 AM - 7:00 AM__ we have least transaction amount because of off hours.

In [None]:
# Debit transaction count and total amount per (hour, month, gender).
# 'amount' is selected before aggregating: running count/sum over the whole frame
# wastes work on unrelated columns and raises on pandas >= 2.0 for datetime columns.
df4_grp = df1.groupby(by=['hour', 'month', 'gender'])['amount'].agg(['count', 'sum']).reset_index()
df4_grp.columns = ['hour', 'month' ,'gender','Transaction Count', 'Total Txn Amount']

# Hourly transaction count, one facet per month, one line per gender
fig1 = px.line(
    data_frame=df4_grp,
    x='hour',
    y='Transaction Count',
    color='gender',
    facet_col='month',
)
fig1.update_xaxes(title='Hour')
fig1.update_layout(
    title=dict(text="Hourly Transaction count by Month ", x=0.5, y=0.95),
    title_font_size=20,
    margin=dict(l=80, r=80, t=100, b=80),
)
fig1.show()


# Hourly transaction amount, one facet per month, one line per gender
fig2 = px.line(
    data_frame=df4_grp,
    x='hour',
    y='Total Txn Amount',
    color='gender',
    facet_col='month',
)
fig2.update_xaxes(title='Hour')
fig2.update_layout(
    title=dict(text="Hourly Transaction Amount by Month ", x=0.5, y=0.95),
    title_font_size=20,
    margin=dict(l=80, r=80, t=100, b=80),
)
fig2.show()

__Insights:__

- In __September & October__, even though the transaction count for females is higher at __9:00 AM__, the __TXN amount__ is still lower. It seems that females make comparatively small transactions at the start of the day.

- In __October__ at __2:00 PM__ transaction amount by __females__ is almost double as compared to males.

In [None]:
# Debit transaction count and total amount per (hour, weekday, gender).
# 'amount' is selected before aggregating: running count/sum over the whole frame
# wastes work on unrelated columns and raises on pandas >= 2.0 for datetime columns.
df4_grp = df1.groupby(by=['hour', 'day', 'gender'])['amount'].agg(['count', 'sum']).reset_index()
df4_grp.columns = ['hour', 'day' ,'gender','Transaction Count', 'Total Txn Amount']

# Hourly transaction count, one facet per weekday, one line per gender
fig1 = px.line(
    data_frame=df4_grp,
    x='hour',
    y='Transaction Count',
    color='gender',
    facet_col='day',
)
fig1.update_xaxes(title='Hour')
fig1.update_layout(
    title=dict(text="Hourly Transaction count by Day ", x=0.5, y=0.95),
    title_font_size=20,
    margin=dict(l=80, r=80, t=100, b=80),
)
fig1.show()


# Hourly transaction amount, one facet per weekday, one line per gender
fig2 = px.line(
    data_frame=df4_grp,
    x='hour',
    y='Total Txn Amount',
    color='gender',
    facet_col='day',
)
fig2.update_xaxes(title='Hour')
fig2.update_layout(
    title=dict(text="Hourly Transaction Amount by Day ", x=0.5, y=0.95),
    title_font_size=20,
    margin=dict(l=80, r=80, t=100, b=80),
)
fig2.show()

__Insights:__

- On __Saturday__ between __2:00 PM - 3:00 PM__ transaction amount by __males__ is almost __6 times__ higher than __females__. However on __Sunday__ at the same time the trend is completely in the opposite direction. 

### Analysing Credit Transactions

In [None]:
# Subset of rows whose movement is 'credit'
df2 = df.loc[df['movement'].eq('credit')]
df2.shape  # 883 Credit transactions

In [None]:
# Ten customers with the highest mean balance across their credit transactions.
# Computed once and reused for bar height, label and colour (previously the same
# groupby ran three times). 'balance' is selected before averaging: a whole-frame
# mean raises on pandas >= 2.0 because of the non-numeric columns.
top_balances = (
    df2.groupby(by='customer_id')['balance']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
fig = px.bar(
    top_balances,
    # Bar labels in thousands, e.g. '12.34k'
    text=top_balances.apply(lambda x: str(round(x / 1000, 2)) + 'k'),
    color=top_balances,
)
fig.update_traces(textposition='outside')
fig.update_layout(
    title=dict(text="Top Valuable Customers by AVG Balance", x=0.5, y=0.95),
    title_font_size=20,
    showlegend=False,
    height=500,
)
fig.update_traces(marker_coloraxis=None)
fig.show()

In [None]:
# Monthly summary table: mean/total transaction amount and mean/total balance.
# 'month' becomes an ordered-by-calendar categorical so the groups come out as
# August, September, October; any other month value would become NaN.
order = ['August','September','October']
df['month'] = pd.Categorical(df['month'], order)

# The target column is selected BEFORE aggregating: a whole-frame mean/sum wastes
# work on unrelated columns and raises on pandas >= 2.0 for non-numeric columns.
g1 = df.groupby(by='month')['amount'].agg(['mean', 'sum'])
g1.columns = ['Avg Amount', 'Total Amount']
g1 = g1.round().astype(int).reset_index()

g2 = df.groupby(by='month')['balance'].agg(['mean', 'sum'])
g2.columns = ['Avg Balance', 'Total Balance']
g2 = g2.round().astype(int).reset_index()

# One row per month with all four rounded metrics side by side
month = g1.merge(g2, on='month')
month

In [None]:
# Switch the default plotly template to the light theme, then render the monthly
# summary as a figure-factory table with a 13pt font in every cell.
pio.templates.default = "plotly_white"
fig = ff.create_table(month)
for cell_annotation in fig.layout.annotations:
    cell_annotation.font.size = 13
fig.show()

__Insights:__

- There is a 7% increase in Avg transaction amount from August to October.
- 71% increase in AVG balance maintained by the customers.
- 77% increase in total balance over these 3 months.

<!DOCTYPE html>
<html>
<body>
<div align="center">
<h3>Prepared by Asif Bhat</h3>

<h3>Follow Me on - <a href="https://www.linkedin.com/in/asif-bhat/">LinkedIn</a>&nbsp; <a href="https://mobile.twitter.com/_asifbhat_">Twitter</a>&nbsp; <a href="https://www.instagram.com/datasciencescoop/?hl=en">Instagram</a>&nbsp; <a href="https://www.facebook.com/datasciencescoop/">Facebook</a></h3>
</div>
</body>
</html>