In [160]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [122]:
dataset = pd.read_csv("fraud_oracle.csv")

dataset['Age'] = dataset['Age'].replace('0', np.nan)
dataset['DayOfWeekClaimed'] = dataset['DayOfWeekClaimed'].replace('0', np.nan)
dataset['MonthClaimed'] = dataset['MonthClaimed'].replace('0', np.nan)
dataset = dataset.dropna()
i = 0
for col in dataset.columns:
    print(i, col, dataset[col].unique())
    i+=1
print(dataset.isnull().values.any())

0 Month ['Dec' 'Jan' 'Oct' 'Jun' 'Feb' 'Nov' 'Apr' 'Mar' 'Aug' 'Jul' 'May' 'Sep']
1 WeekOfMonth [5 3 2 4 1]
2 DayOfWeek ['Wednesday' 'Friday' 'Saturday' 'Monday' 'Tuesday' 'Sunday' 'Thursday']
3 Make ['Honda' 'Toyota' 'Ford' 'Mazda' 'Chevrolet' 'Pontiac' 'Accura' 'Dodge'
 'Mercury' 'Jaguar' 'Nisson' 'VW' 'Saab' 'Saturn' 'Porche' 'BMW' 'Mecedes'
 'Ferrari' 'Lexus']
4 AccidentArea ['Urban' 'Rural']
5 DayOfWeekClaimed ['Tuesday' 'Monday' 'Thursday' 'Friday' 'Wednesday' 'Saturday' 'Sunday']
6 MonthClaimed ['Jan' 'Nov' 'Jul' 'Feb' 'Mar' 'Dec' 'Apr' 'Aug' 'May' 'Jun' 'Sep' 'Oct']
7 WeekOfMonthClaimed [1 4 2 3 5]
8 Sex ['Female' 'Male']
9 MaritalStatus ['Single' 'Married' 'Widow' 'Divorced']
10 Age [21 34 47 65 27 20 36  0 30 42 71 52 28 61 38 41 32 40 63 31 45 60 39 55
 35 44 72 29 37 59 49 50 26 48 64 33 74 23 25 56 16 68 18 51 22 53 46 43
 57 54 69 67 19 78 77 75 80 58 73 24 76 62 79 70 17 66]
11 Fault ['Policy Holder' 'Third Party']
12 PolicyType ['Sport - Liability' 'Sport - Collision' '

In [123]:
X = dataset.loc[:, dataset.columns != 'FraudFound_P']
print(X)

      Month  WeekOfMonth  DayOfWeek     Make AccidentArea DayOfWeekClaimed  \
0       Dec            5  Wednesday    Honda        Urban          Tuesday   
1       Jan            3  Wednesday    Honda        Urban           Monday   
2       Oct            5     Friday    Honda        Urban         Thursday   
3       Jun            2   Saturday   Toyota        Rural           Friday   
4       Jan            5     Monday    Honda        Urban          Tuesday   
...     ...          ...        ...      ...          ...              ...   
15415   Nov            4     Friday   Toyota        Urban          Tuesday   
15416   Nov            5   Thursday  Pontiac        Urban           Friday   
15417   Nov            5   Thursday   Toyota        Rural           Friday   
15418   Dec            1     Monday   Toyota        Urban         Thursday   
15419   Dec            2  Wednesday   Toyota        Urban         Thursday   

      MonthClaimed  WeekOfMonthClaimed     Sex MaritalStatus  .

In [124]:
y = dataset.iloc[:, [15]]
print(y)

       FraudFound_P
0                 0
1                 0
2                 0
3                 0
4                 0
...             ...
15415             1
15416             0
15417             1
15418             0
15419             1

[15419 rows x 1 columns]


<h3>Binary Encoding</h3>

In [125]:
X_new = dataset[['AccidentArea', 'Sex', 'PoliceReportFiled', 'WitnessPresent', 'Fault', 'AgentType', 'WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'RepNumber', 'Deductible', 'DriverRating']].copy()
print(X_new.AccidentArea.unique())
print(X_new.Sex.unique())
print(X_new.PoliceReportFiled.unique())
print(X_new.WitnessPresent.unique())
X_new['AccidentArea'] = X_new['AccidentArea'].replace({'Urban':1, 'Rural':0})
X_new['Sex'] = X_new['Sex'].replace({'Male':1, 'Female':0})
X_new['PoliceReportFiled'] = X_new['PoliceReportFiled'].replace({'Yes':1, 'No':0})
X_new['WitnessPresent'] = X_new['WitnessPresent'].replace({'Yes':1, 'No':0})
X_new['Fault'] = X_new['Fault'].replace({'Policy Holder':1, 'Third Party': 0})
X_new['AgentType'] = X_new['AgentType'].replace({'External':1, 'Internal':0})

['Urban' 'Rural']
['Female' 'Male']
['No' 'Yes']
['No' 'Yes']


<h3>Ordinal Encoding</h3>

In [126]:
label_encoder_month = LabelEncoder()
label_encoder_day = LabelEncoder()
X_new['Month']= label_encoder_month.fit_transform(X['Month'])
X_new['MonthClaimed'] = label_encoder_month.fit_transform(X['MonthClaimed'])
X_new['DayOfWeek'] = label_encoder_day.fit_transform(X['DayOfWeek'])
X_new['DayOfWeekClaimed'] = label_encoder_day.fit_transform(X['DayOfWeekClaimed'])
print(X_new)

       AccidentArea  Sex  PoliceReportFiled  WitnessPresent  Fault  AgentType  \
0                 1    0                  0               0      1          1   
1                 1    1                  1               0      1          1   
2                 1    1                  0               0      1          1   
3                 0    1                  1               0      0          1   
4                 1    0                  0               0      0          1   
...             ...  ...                ...             ...    ...        ...   
15415             1    1                  0               0      1          1   
15416             1    1                  0               0      1          1   
15417             0    1                  0               0      1          1   
15418             1    0                  0               0      0          1   
15419             1    1                  0               0      1          1   

       WeekOfMonth  WeekOfM

<h3>Nominal Encoding</h3>

<h6>Mapping</h6>

In [127]:
make_mapping = {'Honda':0, 'Toyota':1, 'Ford':2, 'Mazda':3, 'Chevrolet':4, 'Pontiac':5, 'Accura':6, 'Dodge':7, 'Mercury':8, 
                'Jaguar':9, 'Nisson':10, 'VW':11, 'Saab':12, 'Saturn':13, 'Porche':14, 'BMW':15, 'Mecedes':16, 'Ferrari':17, 
                'Lexus':18}
marital_status_mapping = {'Single':0, 'Married':1, 'Widow':2, 'Divorced':3}
policy_type_mapping = {'Sport - Liability':0, 'Sport - Collision':1, 'Sedan - Liability':2, 'Utility - All Perils':3,
                       'Sedan - All Perils':4, 'Sedan - Collision':5, 'Utility - Collision':6, 'Utility - Liability':7,
                       'Sport - All Perils':8}
vehicle_category_mapping = {'Sport':0, 'Utility':1, 'Sedan':2}
vehicle_price_mapping = {'more than 69000':5, '20000 to 29000':1, '30000 to 39000':2, 'less than 20000':0,
                         '40000 to 59000':3, '60000 to 69000':4}
days_policy_accident_mapping = {'none':0, '1 to 7':1, '8 to 15':2, '15 to 30':3, 'more than 30':4}
days_policy_claim_mapping = {'none':0, '8 to 15':1, '15 to 30':2, 'more than 30':3}
past_number_of_claims_mapping = {'none':0, '1':1, '2 to 4':2, 'more than 4':3}
age_of_vehicle_mapping = {'new':0 , '2 years':1, '3 years':2, '4 years':3, '5 years':4, '6 years':5, '7 years':6, 
                          'more than 7':7}
age_of_policy_holder_mapping = {'16 to 17':0, '18 to 20':1, '21 to 25':2, '26 to 30':3, '31 to 35':4, '36 to 40':5, 
                                '41 to 50':6, '51 to 65':7, 'over 65':8}
number_of_suppliments_mapping = {'none':0,  '1 to 2':1,  '3 to 5':2, 'more than 5':3}
address_change_claim_mapping = {'no change':0, 'under 6 months':1, '1 year':2,  '2 to 3 years':3, '4 to 8 years':4}
number_of_cars_mapping = {'1 vehicle':0, '2 vehicles':1, '3 to 4':2, '5 to 8':3, 'more than 8':4}
year_mapping = {1994:0, 1995:1, 1996:2}
base_policy_mapping = {'Liability':0, 'Collision':1, 'All Perils':1}

In [128]:
X_new['Make'] = X.Make.map(make_mapping)
X_new['MaritalStatus'] = X.MaritalStatus.map(marital_status_mapping)
X_new['PolicyType'] = X.PolicyType.map(policy_type_mapping)
X_new['VehicleCategory'] = X.VehicleCategory.map(vehicle_category_mapping)
X_new['VehiclePrice'] = X.VehiclePrice.map(vehicle_price_mapping)
X_new['Days_Policy_Accident'] = X.Days_Policy_Accident.map(days_policy_accident_mapping)
X_new['Days_Policy_Claim'] = X.Days_Policy_Claim.map(days_policy_claim_mapping)
X_new['PastNumberOfClaims'] = X.PastNumberOfClaims.map(past_number_of_claims_mapping)
X_new['AgeOfVehicle'] = X.AgeOfVehicle.map(age_of_vehicle_mapping)
X_new['AgeOfPolicyHolder'] = X.AgeOfPolicyHolder.map(age_of_policy_holder_mapping)
X_new['NumberOfSuppliments'] = X.NumberOfSuppliments.map(number_of_suppliments_mapping)
X_new['AddressChange_Claim'] = X.AddressChange_Claim.map(address_change_claim_mapping)
X_new['NumberOfCars'] = X.NumberOfCars.map(number_of_cars_mapping)
X_new['Year'] = X.Year.map(year_mapping)
X_new['BasePolicy'] = X.BasePolicy.map(base_policy_mapping)
print(len(X_new.columns), len(X.columns))

31 32


In [192]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(12335, 31) (3084, 31) (12335, 1) (3084, 1)


In [193]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [194]:
ann = Sequential()
ann.add(Dense(units=31, activation='relu'))
ann.add(Dense(units=62, activation='relu'))
ann.add(Dense(units=124, activation='relu'))
ann.add(Dense(units=62, activation='relu'))
ann.add(Dense(units=31, activation='relu'))
ann.add(Dense(units=15, activation='relu'))
ann.add(Dense(units=1, activation='sigmoid'))

In [195]:
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [196]:
ann.fit(X_train, y_train, batch_size = 32, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x25f95bc4eb0>

In [197]:
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)), 1))

[[0 0]
 [0 1]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [198]:
cm = confusion_matrix(y_test.values, y_pred)
print(cm)
accuracy_score(y_test.values, y_pred)

[[2811   83]
 [ 179   11]]


0.9150453955901426