In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing libraries
import matplotlib.pyplot as plt

In [None]:
# Load the ANZ synthesised transaction CSV from the Kaggle input directory.
df=pd.read_csv("/kaggle/input/anz-synthesised-transaction-dataset/anz.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Frequency of each distinct bpay_biller_code value.
df['bpay_biller_code'].value_counts()

In [None]:
# Remove the bpay_biller_code and merchant_code columns in place.
df.drop(columns=['bpay_biller_code', 'merchant_code'], inplace=True)

In [None]:
# Number of missing values in each column.
df.isna().sum()

Here we can see the exact number of null values in the merchant-related columns.

Let's check whether the missing values occur in the same rows.

In [None]:
import missingno as msno

In [None]:
# Draw missingno's nullity matrix for the whole frame.
msno.matrix(df)

We can see that the missing values occur in the same rows.

Let's confirm by selecting only the columns that contain nulls and comparing their patterns.

In [None]:
# Nullity matrix restricted to the columns being compared for a shared missing-value pattern.
msno.matrix(
    df[[
        'card_present_flag',
        'merchant_id',
        'merchant_suburb',
        'merchant_state',
        'merchant_long_lat',
    ]]
)

We can now confirm that the missing values fall in the same rows. 
Let's drop those rows.

In [None]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

Let's recheck the whole dataframe

In [None]:
# Redraw the nullity matrix after the rows with missing values were dropped.
msno.matrix(df)

In [None]:
# Report how many rows and columns remain in df.
print(f"we are left with {df.shape[0]} rows and {df.shape[1]} columns")

let's check for each column

# Status

In [None]:
# Frequency of each distinct status value.
df['status'].value_counts()

In [None]:
# Line plot of the status column against the row index.
plt.plot(df['status'])

Let's remove this column: it has only one value, so it adds no information (only noise) to a predictive analysis.

In [None]:
# Remove the status column in place.
df.drop(columns=['status'], inplace=True)

# card_present_flag

In [None]:
# Frequency of each distinct card_present_flag value.
df['card_present_flag'].value_counts()

In [None]:
# Histogram of card_present_flag.
plt.hist(df['card_present_flag'])

card_present_flag is quite imbalanced.

# account

In [None]:
# Frequency of each distinct account value.
df['account'].value_counts()

These are probably account numbers and won't add much value to the predictions.

let's drop this column too

In [None]:
# Remove the account column in place.
df.drop(columns=['account'], inplace=True)

# currency

In [None]:
# Frequency of each distinct currency value.
df['currency'].value_counts()

Dropping this column too, because it contains only a single value.

In [None]:
# Remove the currency column in place.
df.drop(columns=['currency'], inplace=True)

# long_lat

In [None]:
# Frequency of each distinct long_lat value.
df['long_lat'].value_counts()

1. Let's visualise these coordinates
2. Separate latitude and longitude for visualisation
3. Convert str to float
4. Plot on a Basemap

In [None]:
# Split each long_lat string at its first space into a two-column frame.
co_ordinates = df["long_lat"].str.split(" ", n=1, expand=True)

# The part before the space becomes the longitude column (still a string here).
df["longitude"] = co_ordinates[0]

# The part after the space becomes the latitude column (still a string here).
df["latitude"] = co_ordinates[1]

In [None]:
# Cast both coordinate columns from strings to floats in a single astype call.
df = df.astype({'latitude': float, 'longitude': float})

In [None]:
from mpl_toolkits.basemap import Basemap

# Whole-globe map ('mill' projection) spanning latitudes -90..90 and longitudes -180..180.
fig = plt.figure(figsize=(12, 9))
m = Basemap(
    projection='mill',
    llcrnrlat=-90,
    urcrnrlat=90,
    llcrnrlon=-180,
    urcrnrlon=180,
    resolution='c',
)
m.drawcoastlines()

# Grid lines: parallels every 10 degrees, meridians every 30 degrees.
m.drawparallels(np.arange(-90, 90, 10), labels=[True, False, False, False])
m.drawmeridians(np.arange(-180, 180, 30), labels=[0, 0, 0, 1])

# Scatter the longitude/latitude columns; latlon=True passes them as geographic coordinates.
sites_lat_y = df['latitude'].tolist()
sites_lon_x = df['longitude'].tolist()
m.scatter(sites_lon_x, sites_lat_y, latlon=True)

plt.title('Basemap', fontsize=20)
plt.show()

In [None]:
# Remove the original combined long_lat column in place.
df.drop(columns=['long_lat'], inplace=True)

# txn_description

In [None]:
# Frequency of each distinct txn_description value.
df['txn_description'].value_counts()

In [None]:
# Histogram of txn_description.
plt.hist(df['txn_description'])

1. The column data is balanced and evenly distributed
2. Let's convert these strings into numeric values

In [None]:
# Encode txn_description in place: "POS" becomes 1 and "SALES-POS" becomes 0.
cleanup_txn = {
    "txn_description": {
        "POS": 1,
        "SALES-POS": 0,
    },
}
df.replace(to_replace=cleanup_txn, inplace=True)

# merchant_id

In [None]:
# Frequency of each distinct merchant_id value.
df['merchant_id'].value_counts()

This data is not useful for our analysis.

In [None]:
# Remove the merchant_id column in place.
df.drop(columns=['merchant_id'], inplace=True)

# first_name

In [None]:
# Frequency of each distinct first_name value.
df['first_name'].value_counts()

# balance

In [None]:
# Frequency of each distinct balance value.
df['balance'].value_counts()

In [None]:
# Histogram of balance before any filtering.
plt.hist(df['balance'])

we can see some outliers here

In [None]:
# Keep only rows whose balance is below 100000 and take an explicit copy.
# Without .copy() the result is a slice of the previous frame, so the column
# assignments and in-place replace/drop calls in later cells raise
# SettingWithCopyWarning (in pandas without copy-on-write).
# NOTE(review): the 100000 cut-off appears to be chosen by eye from the histogram — confirm.
df = df[df['balance'] < 100000].copy()

In [None]:
# Histogram of balance after the outlier filter.
plt.hist(df['balance'])

# date 

In [None]:
# Frequency of each distinct date value.
df['date'].value_counts()

In [None]:
# Inspect the Python type of the first remaining date value.
# Positional access (.iloc) is used because rows were removed earlier, so the
# index label 0 is not guaranteed to exist any more and df.date[0] could
# raise a KeyError.
type(df['date'].iloc[0])

converting string to date type 

Splitting the date into month and year only.

In [None]:

# Parse the date column into datetimes, then derive the year and month columns from it.
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

In [None]:
# Frequency of each distinct year value.
df['year'].value_counts()

Since all the transactions are from 2018, the year column carries no information, so we remove it.

In [None]:
# Remove the date and year columns in place; month is kept.
df.drop(columns=['date', 'year'], inplace=True)

In [None]:
# Frequency of each distinct month value.
df['month'].value_counts()

We can see that the transactions happened only during August, September and October.

# gender

In [None]:
# Frequency of each distinct gender value.
df['gender'].value_counts()

good proportion.

let's convert this into numeric form

In [None]:
# Encode gender in place: "M" becomes 1 and "F" becomes 0.
cleanup_gender = {
    "gender": {
        "M": 1,
        "F": 0,
    },
}
df.replace(to_replace=cleanup_gender, inplace=True)

# age

In [None]:
# Histogram of age.
plt.hist(df['age'])

There are some outlier-like values, but they may be meaningful. 

let's not remove them.

# merchant_suburb

In [None]:
# Number of distinct merchant_suburb values.
df['merchant_suburb'].nunique()

Encoding so many unique values would increase the number of columns and add noise.

Let's not make any changes to this column.

# merchant_state

In [None]:
# Frequency of each distinct merchant_state value.
df['merchant_state'].value_counts()

In [None]:
# Frequency of each distinct extraction value.
df['extraction'].value_counts()

The `extraction` values are probably timestamps.

Let's remove the column for now.

In [None]:
# Remove the extraction column in place.
df.drop(columns=['extraction'], inplace=True)

# amount

In [None]:
# Histogram of amount.
plt.hist(df['amount'])

In [None]:
# Largest value in the amount column.
df['amount'].max()

This looks like an outlier.

In [None]:
# Summary statistics of amount, reporting the 80th, 90th and 99th percentiles.
perc = [0.80, 0.90, 0.99]
df['amount'].describe(percentiles=perc)

In [None]:
# Keep only rows whose amount is below 378 and take an explicit copy.
# Without .copy() the result is a slice of the previous frame, so the in-place
# drop calls in later cells raise SettingWithCopyWarning (in pandas without
# copy-on-write).
# NOTE(review): 378 looks like it was read off the percentile summary — confirm.
df = df[df['amount'] < 378].copy()

# transaction_id

In [None]:
# Frequency of each distinct transaction_id value.
df['transaction_id'].value_counts()

In [None]:
# Remove the transaction_id column in place.
df.drop(columns=['transaction_id'], inplace=True)

# country

In [None]:
# Frequency of each distinct country value.
df['country'].value_counts()

In [None]:
# Remove the country column in place.
df.drop(columns=['country'], inplace=True)

# customer_id

In [None]:
# Frequency of each distinct customer_id value.
df['customer_id'].value_counts()

In [None]:
# Remove the customer_id column in place.
df.drop(columns=['customer_id'], inplace=True)

In [None]:
# Columns still present at this point.
df.columns

# merchant_long_lat

In [None]:
# Frequency of each distinct merchant_long_lat value.
df['merchant_long_lat'].value_counts()

In [None]:
# Remove the merchant_long_lat column in place.
df.drop(columns=['merchant_long_lat'], inplace=True)

# movement

In [None]:
# Frequency of each distinct movement value.
df['movement'].value_counts()

Single class, so we drop it.

In [None]:
# Remove the movement column in place.
df.drop(columns=['movement'], inplace=True)

# final data

In [None]:
# Display the cleaned frame (Jupyter truncates the rendering of large frames).
df