In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.geocoders import AzureMaps
from geopy.geocoders import Bing
from geopy.geocoders import HereV7
from geopy.distance import geodesic
import time
import numpy as np
from matplotlib import pyplot as plt
import joblib
import seaborn as sns

from sklearn import datasets
from sklearn.model_selection import train_test_split

from diffprivlib.models import RandomForestClassifier as DPRandomForestClassifier
from diffprivlib.models import PCA as DPPCA
from diffprivlib.models import standard_scaler as DPStandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [2]:
# Load Asset Location Data
asset_location_data = pd.read_excel('david_au_customer_data.xlsx')


In [3]:
# Take Random Subsample of Full Dataset with 50000 samples for Further Experimentation 
asset_location_data = asset_location_data.sample(n=50000,random_state=50)

In [4]:
asset_location_data['Known Fraud'] = [0 if elem is np.nan else 1 for elem in asset_location_data['Known Fraud']]

In [5]:
classifier_target = ['Known Fraud']
classifier_input = ['Customer Type','Business Unit','Equipment Type','Scorecard Type']

In [6]:
# Remove Samples with nan values
asset_location_data = asset_location_data[classifier_target+classifier_input].dropna()

In [7]:
asset_location_data[classifier_input] = asset_location_data[classifier_input].astype(str)

In [8]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

asset_location_data[classifier_input] = enc.fit_transform(asset_location_data[classifier_input])

In [9]:
asset_location_data

Unnamed: 0,Known Fraud,Customer Type,Business Unit,Equipment Type,Scorecard Type
110348,0,13.0,17.0,1875.0,0.0
95162,0,17.0,94.0,711.0,6.0
111590,0,21.0,12.0,2150.0,0.0
110948,0,13.0,14.0,2150.0,0.0
108325,0,19.0,95.0,468.0,6.0
...,...,...,...,...,...
36911,0,18.0,17.0,1041.0,0.0
31176,0,17.0,60.0,2221.0,6.0
46580,0,17.0,90.0,468.0,6.0
55859,0,17.0,34.0,2309.0,0.0


In [10]:
asset_location_data[asset_location_data['Known Fraud'] == 1]

Unnamed: 0,Known Fraud,Customer Type,Business Unit,Equipment Type,Scorecard Type
96635,1,17.0,24.0,707.0,0.0
92396,1,17.0,89.0,1524.0,6.0
23873,1,17.0,89.0,1524.0,6.0
109786,1,17.0,24.0,1156.0,6.0
52459,1,17.0,89.0,468.0,6.0
37386,1,21.0,24.0,707.0,0.0
67130,1,17.0,24.0,707.0,6.0
59346,1,21.0,24.0,2035.0,6.0
91414,1,21.0,24.0,1856.0,6.0
46779,1,17.0,24.0,707.0,0.0


In [119]:
from sklearn.utils import resample

# Separate majority and minority classes
asset_location_data_majority = asset_location_data[asset_location_data['Known Fraud'] == 0]
asset_location_data_minority = asset_location_data[asset_location_data['Known Fraud'] == 1]

asset_location_data_minority_upsampled = resample(asset_location_data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=39371,    # to match majority class
                                 random_state=50) # reproducible results
 
# Combine majority class with upsampled minority class
asset_location_data_upsampled = pd.concat([asset_location_data_majority, asset_location_data_minority_upsampled])

asset_location_data = asset_location_data_upsampled

In [120]:
X_train, X_test, y_train, y_test = train_test_split(asset_location_data[classifier_input],asset_location_data[classifier_target], test_size=0.2,random_state=50)

In [130]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=10,criterion="gini",max_depth=5)

model.fit(X_train,y_train.values)


  model.fit(X_train,y_train.values)


RandomForestClassifier(max_depth=5, n_estimators=10)

In [131]:
model.score(X_test,y_test)

0.9319956822655406

In [21]:
# asset_location_data['Customer Type'].unique()
# asset_location_data['Business Unit'].unique()
# asset_location_data['Equipment Type'].unique()
# asset_location_data['Scorecard Type'].unique()

In [15]:
# unique_asset_addresses = asset_location_data['Equipment Address'].unique()
# unique_asset_addresses =  pd.Series(unique_asset_addresses).sample(n=500,random_state=42).values
# unique_asset_addresses.shape


In [4]:
asset_location_data

Unnamed: 0,Application Number,Business Unit,Vendor Name,Dealer Name,Contract Number,Customer Number,Customer Name,Customer Type,Business Address,Registered Business Address,Equipment Address,Accounts Receivable Address,Related Party Address(es),Equipment Type,Equipment Cost,Scorecard Type,Scorecard Score,PD at time of application,BOOK_DT,Known Fraud
0,203161-09,104002.0,LINDE FINANCIAL SERVICES,LINDE MATERIALS HANDLING P/L,029-0125857-014,125857,AUSNET ASSET SERVICES PTY LTD,Company,,,24 MALONEY Drive WODONGA VIC 3690 AUSTRALIA,,,FORKLIFT Materials Handling,96984.59,COM,604.0,,2019-02-28,
1,205433-00,104002.0,LINDE FINANCIAL SERVICES,LINDE MATERIALS HANDLING P/L,029-0095303-002,95303,REMAPAK PTY LTD,Company,,,215 NORTH ROCKS Road NORTH ROCKS NSW 2151 AU...,,,FORKLIFT Materials Handling,70197.02,COM,524.0,,2017-12-30,
2,220631-00,104007.0,GOUGH FINANCE,GOUGH MATERIALS HANDLING HO,056-0139319-002,139319,MARKET GARDENERS LIMITED,Company,,,801 GREAT SOUTH Road MT WELLINGTON 0000 NE...,,,FORKLIFT (NZ) Agri,46175.00,CRE,0.0,,2017-05-30,
3,216609-00,104020.0,POWERLIFT FINANCIAL SERVICES,POWERLIFT HO,169-0135521-002,135521,BOWDEN PRINTING PTY. LTD.,Company,,,26 HINDMARSH Avenue WELLAND SA 5007 AUSTRALIA,,,FORKLIFT Materials Handling,24580.00,AU COMPANY,0.0,,2017-04-13,
4,224445-00,104020.0,POWERLIFT FINANCIAL SERVICES,POWERLIFT HO,169-0131942-001,131942,HURRICANE WIRE PRODUCTS (AUST) PTY LTD,Company,,,6 GEONIC Street WOODRIDGE QLD 4114 AUSTRALIA,,,FORKLIFT Materials Handling,33094.00,COM,556.0,,2017-07-27,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114945,200-2019-200-001,104000.0,CROWN EQUIPMENT LEASING,CROWN HEAD OFFICE,018-0456891-005,456891,VENROSS PTY LTD,Private Company (Pty Ltd),,,22 BUTLER Boulevard ADELAIDE AIRPORT SA 5950...,,,FORKLIFT Materials Handling,23403.00,Commercial,341.0,,2021-05-07,
114946,,,TOSHIBA (AUSTRALIA) PTY LTD,TOSHIBA AUSTRALIA,178-0010066-010,10066,METCASH TRADING LIMITED,,,,111 MAGNESIUM Drive CRESTMEAD QLD 4132 AUSTR...,,,PHOTOCOPIERS Office Equipment,3436.00,,,,2018-09-28,
114947,400558-00,104002.0,LINDE FINANCIAL SERVICES,LINDE MATERIALS HANDLING P/L,029-0068569-100,68569,MYER PTY LTD,Company,,,297 DIAGONAL Road OAKLANDS PARK SA 5046 AUST...,,,FORKLIFT Materials Handling,16375.34,CRE,0.0,,2021-04-30,
114948,401213-00,108004.0,COLOURWORKS AUSTRALIA PTY LTD,COLOURWORKS HEAD OFFICE,194-0181267-004,181267,LIFESTYLE SOLUTIONS (AUST) LTD,Company,,,33 FERN Street ISLINGTON NSW 2296 AUSTRALIA,,,PHOTOCOPIER Office Equipment,6209.52,COMMERCIAL,542.0,,2017-06-21,
