In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import geopandas as gpd
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import math

# Input data files are available in the read-only '../input/' directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using 'Save & Run All' 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the transaction CSV into the `data` DataFrame used by the rest of the notebook.
data = pd.read_csv(filepath_or_buffer='../input/anz-synthesised-transaction-dataset/anz.csv')

In [None]:
# Number of rows and columns in the loaded data
data.shape

In [None]:
# Preview the first rows of the loaded data.
data.head()

# Data Cleaning

In [None]:
# Report how many distinct (non-null) values each column holds.
for name, distinct in data.nunique().items():
    print(distinct, "unique items in", name)

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# For every column, print how many of its entries are equal to 0.
for name in data.columns:
    zero_count = data[name].eq(0).sum()
    print(zero_count, ':', name)

In [None]:
# Split the space-separated `long_lat` string once and store its two parts
# as separate `longitude` (first part) and `latitude` (second part) columns.
coords = data['long_lat'].str.split(' ', expand=True)
data['longitude'] = coords[0]
data['latitude'] = coords[1]

In [None]:
# Print the dtype of every column, one per line.
for name, dtype in data.dtypes.items():
    print(dtype, ':', name)

In [None]:
# Convert the coordinate columns (still strings after the split) to floats
# and parse the transaction date into a real datetime column.
# Bracket assignment is used instead of attribute assignment so the intent
# (overwriting an existing column) is explicit.
data['latitude'] = data['latitude'].astype(float)
data['longitude'] = data['longitude'].astype(float)

# `infer_datetime_format` is deprecated since pandas 2.0, where a consistent
# format is inferred by default, so the argument is dropped.
# NOTE(review): pass an explicit `format=` if the CSV date layout is known.
data['date'] = pd.to_datetime(data['date'])

In [None]:
# Remove columns that are not used in the analysis below.
# `long_lat` has already been split into `longitude` / `latitude`.
empty = ['merchant_id', 'bpay_biller_code', 'merchant_code', 'long_lat',  'transaction_id']

# Rebind instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
data = data.drop(columns=empty, errors='ignore')

# Univariate and Bivariate Analysis

In [None]:
# Draw one frequency bar chart per column: how often each distinct value occurs.
col = ['status', 'card_present_flag', 'txn_description', 'first_name', 'gender', 'merchant_state', 'movement', 'age']

for c in col:
    # Count once and reuse for both axes.
    counts = data[c].value_counts()
    plt.figure(figsize=(13,6))
    sns.barplot(x=counts.index, y=counts.values)
    plt.xticks(rotation=90)
    # The title must be a single string; passing a list renders the list repr.
    plt.title(f'Graphical Representation of column {c}', fontsize = 30, color='black')
    # Label the x axis with the column actually plotted (it was always 'Date').
    plt.xlabel(c, fontsize = 20)
    plt.ylabel('Frequency', fontsize =20)
    plt.show()

## Salary & Spending Analysis

In [None]:
# Compare the summed amount of all credit transactions ("salary")
# with the summed amount of all debit transactions ("spending").
total_salary = data.loc[data.movement == "credit", "amount"].sum()
total_spending = data.loc[data.movement == "debit", "amount"].sum()
print("Total Salary:", total_salary)
print("Total Spending:", total_spending)

# Pie chart of the two totals; the salary wedge is pulled out slightly.
pie = [total_salary, total_spending]
label = ['Total Salary', 'Total Spending']
explode = (0.1, 0)
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(
    pie,
    labels=label,
    explode=explode,
    autopct="%.1f%%",
    startangle=90,
)
ax.legend(frameon=True, bbox_to_anchor=(1, 0.9))
plt.show()

In [None]:
# Same credit-vs-debit comparison, restricted to rows whose gender is 'M'.
is_male = data.gender == 'M'
total_salary_male = data.loc[(data.movement == "credit") & is_male, "amount"].sum()
total_spending_male = data.loc[(data.movement == "debit") & is_male, "amount"].sum()
print('Total Male Salary:', total_salary_male)
print('Total Male Spending:', total_spending_male)

pie_male = [total_salary_male, total_spending_male]
label = ['Total Male Salary', 'Total Male Spending']
explode = (0.1, 0)  # offset of the first wedge

plt.figure(figsize=(12, 8))
plt.pie(
    pie_male,
    labels=label,
    explode=explode,
    autopct="%.1f%%",
    startangle=90,
)
plt.legend(frameon=True, bbox_to_anchor=(1, 0.9))
plt.show()

In [None]:
# Same credit-vs-debit comparison, restricted to rows whose gender is 'F'.
is_female = data.gender == 'F'
total_salary_female = data.loc[(data.movement == "credit") & is_female, "amount"].sum()
total_spending_female = data.loc[(data.movement == "debit") & is_female, "amount"].sum()
print('Total Female Salary:', total_salary_female)
print('Total Female Spending:', total_spending_female)

pie_female = [total_salary_female, total_spending_female]
label = ['Total Female Salary', 'Total Female Spending']
explode = (0.1, 0)  # offset of the first wedge

plt.figure(figsize=(12, 8))
plt.pie(
    pie_female,
    labels=label,
    explode=explode,
    autopct="%.1f%%",
    startangle=90,
)
plt.legend(frameon=True, bbox_to_anchor=(1, 0.9))
plt.show()

In [None]:
# Derive weekday and month columns from the transaction date.
# The numeric prefix makes the weekday labels sort in calendar order.
day_labels = {0: '1_Monday', 1: '2_Tuesday', 2: '3_Wednesday', 3: '4_Thursday',
              4: '5_Friday', 5: '6_Saturday', 6: '7_Sunday'}
# Assign the mapped result directly: an in-place replace on a column selection
# is chained assignment, which pandas warns about and which does not modify
# the frame under copy-on-write.
data['day'] = data.date.dt.dayofweek.map(day_labels)
data['month'] = data.date.dt.month

# Split the transactions by movement; later cells reuse both frames.
salary_data = data[data.movement == "credit"] # Salary Data
spending_data = data[data.movement == "debit"] # spending Data

# Total amount spent per weekday. The sort result is kept (it used to be
# discarded) so the bar order is explicit rather than relying on groupby.
daily_amount_spent = (
    spending_data.groupby('day')['amount']
    .sum()
    .reset_index()
    .sort_values(by = 'day')
)

fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(x = daily_amount_spent.day, y = daily_amount_spent.amount, ax= ax)
plt.title('Total Expenses Day-Wise ',fontsize = 30,color='black')
plt.xlabel('Days', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.xticks(rotation = 45)
plt.show()


In [None]:
# Average debit amount per weekday.
# The sorted frame is kept (the sort result used to be discarded), so the
# bars follow the prefixed weekday labels explicitly.
daily_amount_spent = (
    spending_data.groupby('day')['amount']
    .mean()
    .reset_index()
    .sort_values(by = 'day')
)

fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(x = daily_amount_spent.day, y = daily_amount_spent.amount, ax= ax)
plt.title('Average Expenditure Day-Wise',fontsize = 30,color='black')
plt.xlabel('Days', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Mean debit amount for every (weekday, date) pair, drawn as one line per weekday.
daily_amount_spent_line = (
    spending_data
    .groupby(['day', 'date'])['amount']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=daily_amount_spent_line, x='date', y='amount', hue='day', ax=ax)
ax.set_title('3 Months Average Expenses', fontsize=30, color='black')
ax.set_xlabel('Period', fontsize=20)
ax.set_ylabel('Amount in Dollar', fontsize=20)
plt.show()

In [None]:

# Total debit amount per calendar month.
# The stray `spending_data` expression was removed: it was not the last
# statement of the cell, so it displayed nothing. The sort result is now kept.
monthly_amount_spent = (
    spending_data.groupby(['month'])['amount']
    .sum()
    .reset_index()
    .sort_values(by = 'month')
)

fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(x = monthly_amount_spent.month, y = monthly_amount_spent.amount, ax= ax)
plt.title('Total Expenses Month-wise',fontsize = 30,color='black')
plt.xlabel('August, September, October', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.xticks(rotation = 45)
plt.show()


In [None]:
# Total credit (salary) amount per calendar month.
# The frame keeps its original name for compatibility, although it holds
# salary totals here, not spending.
monthly_amount_spent = (
    salary_data.groupby(['month'])['amount']
    .sum()
    .reset_index()
    .sort_values(by = 'month')
)

fig, ax = plt.subplots(figsize=(15,5))
sns.barplot(x = monthly_amount_spent.month, y = monthly_amount_spent.amount, ax= ax)
# This chart is built from salary_data, so the title must say salary; it was
# copied from the expenses cell and still read 'Total Expenses Month-wise'.
plt.title('Total Salary Month-wise',fontsize = 30,color='black')
plt.xlabel('August, September, October', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Line chart of the summed debit amount on each date.
daily_amount_spent = spending_data.groupby('date')['amount'].sum()

fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(x=daily_amount_spent.index, y=daily_amount_spent.values, ax=ax)
ax.set_title('Total daily expenditure of 100 customers', fontsize=30, color='black')
ax.set_xlabel('Date', fontsize=20)
ax.set_ylabel('Amount in Dollar', fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Line chart of the mean debit amount on each date, over all customers.

daily_amount_spent = spending_data.groupby('date')['amount'].mean()

plt.figure(figsize=(15,10))
sns.lineplot(x = daily_amount_spent.index, y = daily_amount_spent.values)
# The data is not split by gender here, so the title no longer mentions
# males and females.
plt.title('Average daily expenditure of 100 customers',fontsize = 30,color='black')
plt.xlabel('Date', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Summed debit amount per date and gender, drawn as one line per gender.
daily_amount_spent_gender = (
    spending_data
    .groupby(['date', 'gender'], as_index=False)['amount']
    .sum()
)

fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(data=daily_amount_spent_gender, x='date', y='amount', hue='gender', ax=ax)
ax.set_title('Total daily expenditure of 100 males and female customers', fontsize=30, color='black')
ax.set_xlabel('Date', fontsize=20)
ax.set_ylabel('Amount in Dollar', fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Mean debit amount per date and gender, drawn as one line per gender
# using a two-colour 'rocket_r' palette.
daily_amount_spent_gender = (
    spending_data
    .groupby(['date', 'gender'], as_index=False)['amount']
    .mean()
)
palette = sns.color_palette('rocket_r', 2)

fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(
    data=daily_amount_spent_gender,
    x='date',
    y='amount',
    hue='gender',
    palette=palette,
    ax=ax,
)
ax.set_title('Average daily expenditure of 100 males and female customers', fontsize=30, color='black')
ax.set_xlabel('Date', fontsize=20)
ax.set_ylabel('Amount in Dollar', fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Summed transaction amount (credits and debits together) per date and gender.

daily_amount= data.groupby(['date','gender'], as_index = False)['amount'].sum()
palette = sns.color_palette('rocket_r', 2)
plt.figure(figsize=(15,5))
sns.lineplot(x = 'date', y = 'amount', hue = 'gender', data = daily_amount, palette = palette  )
# The aggregation above is a sum, so the title says "Total", not "Average".
plt.title('Total amount of all customers including salary and expenses',fontsize = 30,color='black')
plt.xlabel('Date', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Grouped bar chart: total debit amount per merchant state, split by gender.

# Build the masks from spending_data itself. A mask taken from `data` has a
# different index, which forces pandas to reindex it before filtering.
data_M = spending_data[spending_data.gender == 'M']
data_F = spending_data[spending_data.gender == 'F']
total_by_state_M = data_M.groupby('merchant_state')['amount'].sum()
total_by_state_F = data_F.groupby('merchant_state')['amount'].sum()

# Align both genders on one shared list of states, so bars line up even if a
# state appears for only one gender. A missing state is drawn as a
# zero-height bar.
states = total_by_state_M.index.union(total_by_state_F.index)
daily_amount_spent_M = (
    total_by_state_M.reindex(states, fill_value=0)
    .rename_axis('merchant_state')
    .reset_index()
)
daily_amount_spent_F = (
    total_by_state_F.reindex(states, fill_value=0)
    .rename_axis('merchant_state')
    .reset_index()
)

width = 0.25
x = np.arange(len(states))
fig, ax = plt.subplots(figsize=(15,5))
ax.bar(x + 0.15, daily_amount_spent_M['amount'], width=width, label = 'Male', color = 'wheat')
ax.bar(x - 0.15, daily_amount_spent_F['amount'], width=width, label = 'Female', color = 'orange')
plt.title('Total expenses of 100 customers in each state',fontsize = 30,color='black')
# Tick labels come from the data instead of a hard-coded list of eight states.
plt.xticks(x, list(states), rotation = 45)
plt.xlabel('States', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.legend()
plt.show()

In [None]:
# Grouped bar chart: mean debit amount per merchant state, split by gender.

# Build the masks from spending_data itself. A mask taken from `data` has a
# different index, which forces pandas to reindex it before filtering.
data_M_spending = spending_data[spending_data.gender == 'M']
data_F_spending = spending_data[spending_data.gender == 'F']
mean_by_state_M = data_M_spending.groupby('merchant_state')['amount'].mean()
mean_by_state_F = data_F_spending.groupby('merchant_state')['amount'].mean()

# Align both genders on one shared list of states, so bars line up even if a
# state appears for only one gender. A missing state is drawn as a
# zero-height (i.e. empty) bar.
states = mean_by_state_M.index.union(mean_by_state_F.index)
daily_amount_spent_M = (
    mean_by_state_M.reindex(states, fill_value=0)
    .rename_axis('merchant_state')
    .reset_index()
)
daily_amount_spent_F = (
    mean_by_state_F.reindex(states, fill_value=0)
    .rename_axis('merchant_state')
    .reset_index()
)

width = 0.25
x = np.arange(len(states))
fig, ax = plt.subplots(figsize=(15,5))
ax.bar(x + 0.15, daily_amount_spent_M['amount'], width=width, label = 'Male', color = 'springgreen')
ax.bar(x - 0.15, daily_amount_spent_F['amount'], width=width, label = 'Female', color = 'darkgreen')
plt.title('Average Expenses Of Customers In Each State',fontsize = 30,color='black')
# Tick labels come from the data instead of a hard-coded list of eight states.
plt.xticks(x, list(states), rotation = 45)
plt.xlabel('States', fontsize = 20)
plt.ylabel('Amount in Dollar', fontsize =20)
plt.legend()
plt.show()

In [None]:
# Summed debit amount per date and merchant state, computed separately for
# male (data3) and female (data4) customers.
# Relies on data_M / data_F, the per-gender debit frames defined earlier in
# the notebook.
data3 = data_M.groupby(['date','merchant_state'], as_index = False)['amount'].sum()
data4 = data_F.groupby(['date','merchant_state'], as_index = False)['amount'].sum()  # not plotted in this cell

fig, ax = plt.subplots(figsize=(15,5))
sns.lineplot(x = 'date', y = 'amount', hue = 'merchant_state', data = data3, ax = ax)
# Only the male totals (data3) are drawn, so the title says so instead of
# claiming an average over all customers.
plt.title("Total Daily Expenditure Of Male Customers In Each State",fontsize = 30,color='black')
plt.xlabel("Date", fontsize = 20)
plt.ylabel("Amount In Dollar", fontsize =20)
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Rank merchant suburbs by total debit amount and chart the 20 largest.
transaction_by_suburb = (
    spending_data
    .groupby('merchant_suburb')["amount"]
    .sum()
    .reset_index()
    .sort_values(by="amount", ascending=False)
)
top_suburbs = transaction_by_suburb[:20]

plt.figure(figsize=(14, 5))
sns.barplot(x=top_suburbs.merchant_suburb, y=top_suburbs.amount, color="orange")
plt.xlabel("Places", fontsize=20)
plt.ylabel("Amount in Dollars", fontsize=20)
plt.title("Highest Spending places in Australia", fontsize=30)
plt.xticks(rotation=90)
plt.show()

In [None]:
# People Who Spend The Most: total debit amount per account, top 20 charted.

transactions_by_people1 = spending_data.groupby(['account', 'first_name'])['amount'].sum().sort_values( ascending = False).reset_index()
print(transactions_by_people1.head(5))

top_spenders = transactions_by_people1.head(20)
# The totals are per account, but first names can repeat across accounts.
# Plotting by first name alone makes seaborn merge same-named customers into
# one averaged bar, so each bar is labelled with name and account instead.
spender_labels = top_spenders.first_name + ' (' + top_spenders.account.astype(str) + ')'

plt.figure(figsize=(14,5))
sns.barplot(x= spender_labels, y= top_spenders.amount, color = 'orange')
plt.xlabel('First Name', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
plt.title('People who spent most in Australia in 3 months', fontsize=30)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Highest Salary Earners: total credit amount per account, top 20 charted.

transactions_by_people1 = salary_data.groupby(['account', 'first_name'])['amount'].sum().sort_values( ascending = False).reset_index()
print(transactions_by_people1.head(5))

top_earners = transactions_by_people1.head(20)
# The totals are per account, but first names can repeat across accounts.
# Plotting by first name alone makes seaborn merge same-named customers into
# one averaged bar, so each bar is labelled with name and account instead.
earner_labels = top_earners.first_name + ' (' + top_earners.account.astype(str) + ')'

plt.figure(figsize=(14,5))
sns.barplot(x= earner_labels, y= top_earners.amount, color = 'orange')
plt.xlabel('First Name', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
# Title typo fixed: 'Higest' -> 'Highest'.
plt.title('Highest Salary Earners In 3 Months', fontsize=30)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Mean debit amount for every age, ordered from highest to lowest mean.
transactions_by_age_group = (
    spending_data
    .groupby(['age'], as_index=False)['amount']
    .mean()
    .sort_values(by='amount', ascending=False)
)

plt.figure(figsize=(14, 5))
sns.barplot(data=transactions_by_age_group, x='age', y='amount', color='yellow')
plt.xlabel('Age', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
plt.title('Average Expenditure By Age', fontsize=30)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Mean credit (salary) amount for every age, ordered from highest to lowest mean.
salary_by_age_group = (
    salary_data
    .groupby(['age'], as_index=False)['amount']
    .mean()
    .sort_values(by='amount', ascending=False)
)

plt.figure(figsize=(14, 5))
sns.barplot(data=salary_by_age_group, x='age', y='amount', color='lime')
plt.xlabel('Age', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
plt.title('Average Salary By Age', fontsize=30)
plt.xticks(rotation=45)
plt.show()

## Wealth Analysis

In [None]:
# Top 10 wealthiest people, ranked by average monthly bank balance.
# Step 1: mean balance per (month, account); step 2: mean of those monthly
# means per account.

wealthy_people = data.groupby(['month', 'account', 'first_name'])['balance'].mean().reset_index().sort_values(by='balance', ascending = False)
wealthy_people1 = wealthy_people.groupby(['account', 'first_name'])['balance'].mean().sort_values(ascending = False).reset_index()
print(wealthy_people1.head(10))

top_wealthy = wealthy_people1.head(10)
# The ranking is per account, but first names can repeat across accounts.
# Plotting by first name alone makes seaborn merge same-named customers into
# one averaged bar, so each bar is labelled with name and account instead.
wealthy_labels = top_wealthy.first_name + ' (' + top_wealthy.account.astype(str) + ')'

plt.figure(figsize=(14,5))
sns.barplot(x= wealthy_labels, y= top_wealthy.balance, color = 'green')
plt.xlabel('Names', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
plt.title('Top 10 Wealthiest People And Their Avg Monthly Bank Balance', fontsize=30)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Median balance per (merchant state, month), then the mean of those monthly
# medians per state, ordered from highest to lowest.
wealthy_states = (
    data
    .groupby(['merchant_state', 'month'])['balance']
    .median()
    .reset_index()
)
wealthy_states1 = (
    wealthy_states
    .groupby(['merchant_state'])['balance']
    .mean()
    .reset_index()
    .sort_values(by='balance', ascending=False)
)
print(wealthy_states1)

top_states = wealthy_states1[:10]
plt.figure(figsize=(14, 5))
sns.barplot(x=top_states.merchant_state, y=top_states.balance)
plt.xlabel('Places', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
plt.title('Customers Median Balance Amount In Each State', fontsize=30)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Median balance for every age.
# The chart is titled "Median", but the aggregation used to be a mean, which
# made this cell a duplicate of the "Average Wealth" chart that follows it.
wealthy_age_group = data.groupby(['age'], as_index = False)['balance'].median()

plt.figure(figsize=(14,5))
sns.barplot(x= wealthy_age_group.age, y= wealthy_age_group.balance)
plt.xlabel('Age', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
plt.title('Median Wealth of different age group', fontsize=30)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Mean balance per (first_name, age) pair, highest first; show the top six rows.
# NOTE(review): grouping is by first name and age, not by account, so two
# customers sharing both would be combined; confirm that is acceptable.
age_of_wealthy_people = (
    data
    .groupby(["first_name", "age"], as_index=False)
    .balance
    .mean()
    .sort_values(by="balance", ascending=False)
)
age_of_wealthy_people.head(6)

In [None]:
# Mean balance for every age, ordered from highest to lowest mean.
wealthy_age_group = (
    data
    .groupby(['age'], as_index=False)['balance']
    .mean()
    .sort_values(by='balance', ascending=False)
)

plt.figure(figsize=(14, 5))
sns.barplot(data=wealthy_age_group, x='age', y='balance')
plt.xlabel('Age', fontsize=20)
plt.ylabel('Amount in Dollars', fontsize=20)
plt.title('Average Wealth of different age group', fontsize=30)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Percentage distribution of wealth: share of accounts whose average monthly
# balance falls into each $5000-wide band.

wealthy_people = data.groupby(['month', 'account', 'first_name'])['balance'].mean().reset_index().sort_values(by='balance', ascending = False)
wealthy_people1 = wealthy_people.groupby(['account', 'first_name'])['balance'].mean().sort_values(ascending = False).reset_index()

bin_width = 5000
n_bins = 14
total_accounts = len(wealthy_people1)

for i in range(n_bins):
    # Half-open bands [start, end) that share their edges. The original strict
    # `>`/`<` test with bands 1-5000, 5001-10000, ... silently dropped balances
    # on an edge or between 5000 and 5001, 10000 and 10001, and so on.
    start = i * bin_width
    end = start + bin_width
    in_band = wealthy_people1["balance"].ge(start) & wealthy_people1["balance"].lt(end)
    y = in_band.sum() / total_accounts * 100
    print(format(y,".2f"),"% people have $", start, "to", end, "in their bank account")
    print("_____________")

# Report everyone above the last band too, so accounts with a non-negative
# balance are all counted somewhere.
upper_limit = n_bins * bin_width
above = wealthy_people1["balance"].ge(upper_limit).sum() / total_accounts * 100
print(format(above,".2f"),"% people have $", upper_limit, "or more in their bank account")

In [None]:
# Mean bank balance per date and gender, one line per gender.

x = data.groupby(['date', 'gender'], as_index = False).balance.mean()
palette = sns.color_palette('mako_r', 2)
plt.subplots(figsize=(15,5))
# seaborn >= 0.12 accepts x/y only as keyword arguments; passing them
# positionally raises a TypeError there.
sns.lineplot(data = x, x = 'date', y = 'balance', hue = 'gender', style = 'gender', palette= palette)
plt.xlabel('Dates', fontsize=20, color = 'midnightblue')
plt.ylabel('Bank Balance in Dollars', fontsize=20, color = 'midnightblue')
plt.title('Average bank balance of all males and females from 10-Aug to 30-Oct', fontsize=22, color = 'midnightblue')
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Median bank balance per date and gender, one line per gender.
# (The comment used to say "Mean", but the aggregation is a median.)

x = data.groupby(['date', 'gender'], as_index = False).balance.median()
palette = sns.color_palette('mako_r', 2)
plt.subplots(figsize=(15,5))
# seaborn >= 0.12 accepts x/y only as keyword arguments; passing them
# positionally raises a TypeError there.
sns.lineplot(data = x, x = 'date', y = 'balance', hue = 'gender', style = 'gender', palette= palette)
plt.xlabel('Dates', fontsize=20, color = 'midnightblue')
plt.ylabel('Bank Balance in Dollars', fontsize=20, color = 'midnightblue')
plt.title('Median bank balance of all males and females from 10-Aug to 30-Oct', fontsize=22, color = 'midnightblue')
plt.xticks(rotation = 45)
plt.show()

## Map of 100 Wealthiest Customers

In [None]:
# Map of the 100 (first_name, latitude, longitude) groups with the largest
# summed balance, one marker per group.
# NOTE(review): the ranking sums `balance` over all of a group's rows, while
# the wealth charts use the mean balance; confirm which measure is intended.
m_1 = folium.Map(location=[-28.865143, 135.209900], tiles= 'OpenStreetMap' , zoom_start=4.2)
geo_data_wealthy = (
    data.groupby(['first_name', 'latitude', 'longitude'], as_index = False)
    .balance.sum()
    .sort_values(by='balance', ascending=False)
    .head(100)
)
# Pass the CRS as an authority string; the {'init': 'epsg:4326'} dict form is
# deprecated.
geo_data = gpd.GeoDataFrame(
    geo_data_wealthy,
    geometry=gpd.points_from_xy(geo_data_wealthy.longitude, geo_data_wealthy.latitude),
    crs='EPSG:4326',
)

# Add the markers to the cluster and the cluster to the map. The cluster used
# to be created but never used, so every marker went straight onto the map.
mc = MarkerCluster()
for idx, row in geo_data.iterrows():
    Marker([row['latitude'], row['longitude']]).add_to(mc)
mc.add_to(m_1)
m_1

In [None]:
# Heat map built from the latitude/longitude of every row in `data`.
m_5 = folium.Map(
    location=[-28.865143, 135.209900],
    tiles='OpenStreetMap',
    zoom_start=4.2,
)
heat_points = data[['latitude', 'longitude']]
HeatMap(data=heat_points, radius=10).add_to(m_5)
m_5