In [2]:
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

In [3]:
# Read the raw insurance-claims table from its CSV file.
claims_csv_path = 'data/insurance_claims.csv'
data = pd.read_csv(claims_csv_path)

In [10]:
# Peek at the first rows, then report the frame's dimensions.
# NOTE(review): only the last expression of a cell is rendered, so the
# head(5) result is computed but not shown; the cell output is just the shape.
data.head(5)
data.shape

(1000, 39)

In [11]:
# The CSV marks unknown values with a literal '?'; turn those into NaN so
# pandas' missing-value tooling can see them.
data = data.replace(to_replace='?', value=np.nan)

In [12]:
# Columns excluded from modelling; they are removed from `data` in the next cell.
dropable_columns = [
    'policy_annual_premium',
    'policy_number',
    'policy_bind_date',
    'policy_state',
    'policy_csl',
    'insured_zip',
    'incident_location',
    'incident_date',
    'incident_state',
    'incident_city',
    'auto_make',
    'auto_model',
    'auto_year',
]

In [13]:
# Remove every column listed in `dropable_columns`.
# The axis is given by keyword (`columns=`): the positional form
# `drop(labels, 1)` was deprecated in pandas 1.3 and raises TypeError from
# pandas 2.0 onwards. Rebinding instead of `inplace=True` keeps the cell a
# plain expression with no hidden mutation.
data = data.drop(columns=dropable_columns)

In [14]:
# Sanity check: the column count should have fallen by len(dropable_columns).
data.shape

(1000, 26)

In [15]:
# Number of missing entries in each column.
data.isna().sum()

months_as_customer               0
age                              0
policy_deductable                0
umbrella_limit                   0
insured_sex                      0
insured_education_level          0
insured_occupation               0
insured_hobbies                  0
insured_relationship             0
capital-gains                    0
capital-loss                     0
incident_type                    0
collision_type                 178
incident_severity                0
authorities_contacted            0
incident_hour_of_the_day         0
number_of_vehicles_involved      0
property_damage                360
bodily_injuries                  0
witnesses                        0
police_report_available        343
total_claim_amount               0
injury_claim                     0
property_claim                   0
vehicle_claim                    0
fraud_reported                   0
dtype: int64

In [16]:
# The first row of DataFrame.mode() holds each column's most frequent value;
# keep it as a {column: value} mapping for imputation.
most_frequent_values = data.mode().iloc[0]
data_mode = dict(most_frequent_values)

In [17]:
# Impute every missing cell with the most frequent value of its own column.
data = data.fillna(value=data_mode)

In [18]:
# Object-dtype (categorical) columns, copied so that encoding them later
# does not write back into `data`.
object_columns = data.select_dtypes(include='object')
cat_df = object_columns.copy()

In [20]:
# int64 (numeric) columns, copied so they are independent of `data`.
integer_columns = data.select_dtypes(include='int64')
num_df = integer_columns.copy()

In [24]:
# List the categorical columns that will be label-encoded next
# (the target column is among them).
cat_df.columns

Index(['insured_sex', 'insured_education_level', 'insured_occupation',
       'insured_hobbies', 'insured_relationship', 'incident_type',
       'collision_type', 'incident_severity', 'authorities_contacted',
       'property_damage', 'police_report_available', 'fraud_reported'],
      dtype='object')

In [26]:
# One LabelEncoder per categorical column, kept by column name so the same
# label -> integer mapping can be replayed on new data.
labelencoder_dict = {column: LabelEncoder() for column in cat_df.columns}

# Fit each encoder on its column and overwrite the column with integer codes.
for column, encoder in labelencoder_dict.items():
    cat_df[column] = encoder.fit_transform(cat_df[column])

In [81]:
# Persist the fitted per-column encoders so inference code can reuse the
# exact label -> integer mapping learned during training.
# The context manager guarantees the file is flushed and closed; the bare
# open() call used previously leaked the handle.
with open('lableencoderdict.pkl', 'wb') as encoder_file:
    pickle.dump(labelencoder_dict, encoder_file)

In [31]:
# Join the numeric columns and the encoded categorical columns side by side.
final_df = pd.concat(objs=[num_df, cat_df], axis='columns')

In [34]:
# Split the modelling frame into features (X) and the target label (y).
# `columns=` replaces the positional axis argument `drop([...], 1)`, which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
target_value = 'fraud_reported'
X = final_df.drop(columns=[target_value])
y = final_df[target_value]

In [35]:
# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2020,
)

In [36]:
# Random forest with 100 trees.
# random_state pins the bootstrap sampling and per-split feature selection so
# the fitted model (and every metric below) is reproducible after a kernel
# restart; 2020 is the same seed already used for the train/test split.
clf = RandomForestClassifier(n_estimators=100, random_state=2020)

In [37]:
# Train on the training split, then label the held-out rows.
fitted_clf = clf.fit(train_X, train_y)
pred_y = fitted_clf.predict(test_X)

In [38]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=test_y, y_pred=pred_y)

0.7733333333333333

In [39]:
import pickle

In [40]:
# Save the trained classifier to disk.
# The context manager guarantees the file is flushed and closed; the bare
# open() call used previously leaked the handle.
with open('model_encodings_saved.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [41]:
# Reload the classifier from disk to confirm the pickle round-trips.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load artifacts produced by this notebook or another trusted source.
# The context manager closes the file handle that the bare open() call leaked.
with open('model_encodings_saved.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [42]:
# Sample claim whose categorical fields are already given as integer codes.
# Keys follow the training column order (numeric columns first, then the
# categorical ones); that order matters whenever the values are handed to the
# model positionally.
json_input = {
  "months_as_customer": 328,
  "age": 29,
  "policy_deductable": 1000,
  "umbrella_limit": 5000000,
  "capital-gains": 35100,
  "capital-loss": 0,
  "incident_hour_of_the_day": 5,
  "number_of_vehicles_involved": 1,
  "bodily_injuries": 2,
  "witnesses": 0,
  "total_claim_amount": 65000,
  "injury_claim": 1300,
  "property_claim": 650,
  "vehicle_claim": 5200,
  "insured_sex": 1,
  "insured_education_level": 4,
  "insured_occupation": 6,
  "insured_hobbies": 2,
  "insured_relationship": 4,
  "incident_type": 2,
  "collision_type": 2,
  "incident_severity": 1,
  "authorities_contacted": 4,
  "property_damage": 1,
  "police_report_available": 1
  }

In [43]:
# Build a one-row frame and select its columns in the exact order of the
# training matrix instead of trusting the dict's insertion order to match.
# A missing key now raises KeyError rather than silently shifting features,
# and the model receives the same feature names it was fitted with.
json_input_row = pd.DataFrame([json_input])[list(train_X.columns)]
model.predict(json_input_row)

array([0])

In [82]:
# Class probabilities for the sample claim.
# The one-row frame is ordered by the training columns explicitly, so the
# result does not depend on the dict's insertion order; a missing key raises
# KeyError instead of silently shifting features.
json_input_row = pd.DataFrame([json_input])[list(train_X.columns)]
model.predict_proba(json_input_row)

array([[0.89, 0.11]])

In [49]:
# Sample claim in raw form: numeric fields as integers, categorical fields as
# their original string labels. The string fields must be run through the
# fitted LabelEncoders (see `labelencoder_dict`) before the model can score it.
# Key order mirrors the training column order (numeric first, then categorical).
json_encode = {
  "months_as_customer": 328,
  "age": 29,
  "policy_deductable": 1000,
  "umbrella_limit": 5000000,
  "capital-gains": 35100,
  "capital-loss": 0,
  "incident_hour_of_the_day": 5,
  "number_of_vehicles_involved": 1,
  "bodily_injuries": 2,
  "witnesses": 0,
  "total_claim_amount": 65000,
  "injury_claim": 1300,
  "property_claim": 650,
  "vehicle_claim": 5200,
  "insured_sex": "MALE",
  "insured_education_level": "MD",
  "insured_occupation": "sales",
  "insured_hobbies": "reading",
  "insured_relationship": "unmarried",
  "incident_type": "Vehicle Theft",
  "collision_type": "Side Collision",
  "incident_severity": "Major Damage",
  "authorities_contacted": "Police",
  "property_damage": "YES",
  "police_report_available": "YES"
  }

In [54]:
# Wrap the raw payload in a single-row frame whose row label is 'i'.
encode_df = pd.DataFrame(data=json_encode, index=['i'])

In [63]:
# String-valued (object-dtype) fields of the payload, copied for encoding.
payload_object_columns = encode_df.select_dtypes(include='object')
test_cat_df = payload_object_columns.copy()

In [64]:
# int64 fields of the payload; these are passed to the model unchanged.
payload_integer_columns = encode_df.select_dtypes(include='int64')
test_num_df = payload_integer_columns.copy()

In [66]:
# Replace each string label with the integer code assigned by the encoder
# that was fitted on the same column of the training data.
for column in test_cat_df.columns:
    test_cat_df[column] = labelencoder_dict[column].transform(test_cat_df[column])

In [70]:
# Recombine numeric and encoded categorical fields, then select the columns
# in the exact order of the training matrix. Without the explicit selection
# the feature order depended on the payload's key order, and a reordered or
# incomplete payload would have been scored with shifted features; now a
# missing column raises KeyError instead.
final_test_df = pd.concat([test_num_df, test_cat_df], axis=1)[list(train_X.columns)]

In [71]:
# Display the fully encoded single-row payload for inspection.
final_test_df

Unnamed: 0,months_as_customer,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,...,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,incident_type,collision_type,incident_severity,authorities_contacted,property_damage,police_report_available
i,328,29,1000,5000000,35100,0,5,1,2,0,...,4,11,15,4,3,2,0,4,1,1


In [76]:
# Convert the encoded frame into a plain nested list (one inner list per row).
encoded_matrix = final_test_df.to_numpy()
df_list = encoded_matrix.tolist()

In [79]:
# Confirm the converted payload is a built-in list.
type(df_list)

list

In [78]:
# Predicted class for the encoded payload.
model.predict(X=df_list)

array([0])

In [83]:
# Per-class probabilities for the encoded payload.
model.predict_proba(X=df_list)

array([[0.67, 0.33]])

In [None]:
def getEncoded(test_data, labelencoder_dict):
    """Encode every column of a 2-D array with its own fitted encoder.

    Parameters
    ----------
    test_data : 2-D array
        Must expose ``.shape`` and support ``test_data[:, i]`` column slicing
        (e.g. a NumPy array).
    labelencoder_dict : mapping or sequence
        Looked up by integer column position: ``labelencoder_dict[i]`` must
        return an object with a ``transform`` method for column ``i``.
        NOTE(review): the ``labelencoder_dict`` built earlier in this notebook
        is keyed by column *name*, so it cannot be passed here unchanged.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(n_rows, n_columns)`` holding the encoded columns in
        their original order, or ``None`` when ``test_data`` has no columns.

    Raises
    ------
    KeyError / IndexError
        If no encoder is registered for a column position.
    """
    n_rows, n_columns = test_data.shape[0], test_data.shape[1]

    encoded_columns = []
    for i in range(n_columns):
        label_encoder = labelencoder_dict[i]
        feature = np.asarray(label_encoder.transform(test_data[:, i]))
        # Force a column vector so the pieces can be joined along axis 1.
        encoded_columns.append(feature.reshape(n_rows, 1))

    if not encoded_columns:
        # Preserve the original contract for zero-column input.
        return None

    # Single concatenate instead of re-allocating the result on every column.
    return np.concatenate(encoded_columns, axis=1)