<a href="https://colab.research.google.com/github/ranjith13119/Insurance-Fraud-Classification/blob/main/Insurance_Fraud_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import os
# os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/Kaggle"
# %cd /content/drive/MyDrive/Insurance Fraud Detection
# !kaggle datasets download -d buntyshah/auto-insurance-claims-data
# #unzipping the zip files and deleting the zip files
# !unzip \*.zip  && rm *.zip

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split



In [4]:
class Preprocessor:
  """Cleaning, encoding, scaling and resampling steps for the insurance-claims table.

  Each method returns its result and also stores it on the instance.
  When a step fails inside its ``try`` block, the error is re-raised as ``Exception``
  with a message naming the step; the original error stays reachable via ``__cause__``.
  """

  # Numeric columns standardised by ``scale_numerical_feature``.
  NUMERICAL_COLUMNS = ['months_as_customer', 'policy_deductable', 'umbrella_limit',
                       'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
                       'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
                       'injury_claim', 'property_claim', 'vehicle_claim']

  # Categorical columns replaced by fixed numeric codes instead of one-hot dummies.
  # Values that are not listed in a mapping become NaN (behaviour of ``Series.map``).
  ORDINAL_MAPPINGS = {
    'policy_csl': {'100/300': 1, '250/500': 2.5, '500/1000': 5},
    'insured_education_level': {'JD': 1, 'High School': 2, 'College': 3, 'Masters': 4,
                                'Associate': 5, 'MD': 6, 'PhD': 7},
    'incident_severity': {'Trivial Damage': 1, 'Minor Damage': 2, 'Major Damage': 3, 'Total Loss': 4},
    'insured_sex': {'FEMALE': 0, 'MALE': 1},
    'property_damage': {'NO': 0, 'YES': 1},
    'police_report_available': {'NO': 0, 'YES': 1},
  }

  # Target column; only present in training data.
  LABEL_COLUMN = 'fraud_reported'
  LABEL_MAPPING = {'N': 0, 'Y': 1}

  def __init__(self, df):
    """Keep a reference to the raw frame ``df`` as ``self.data``."""
    self.data = df

  def remove_unwanted_spaces(self):
    """Return a copy of ``self.data`` with leading/trailing whitespace stripped from object columns."""
    try:
      self.df_without_space = self.data.apply(
        lambda column: column.str.strip() if column.dtype == 'object' else column)
      return self.df_without_space
    except Exception as err:
      raise Exception("Preprocessor.remove_unwanted_spaces failed: {}".format(err)) from err

  def remove_columns(self, data, columns):
    """Return ``data`` without ``columns``; the input frame is not modified.

    Raises:
      Exception: if the drop fails, e.g. a requested column does not exist.
    """
    self.data = data
    self.remove_col = columns
    try:
      self.useful_data = self.data.drop(self.remove_col, axis=1)
      return self.useful_data
    except Exception as err:
      raise Exception("Preprocessor.remove_columns failed: {}".format(err)) from err

  def separate_label_feature(self, data, label_column_name):
    """Split ``data`` into ``(X, y)`` where ``y`` is the ``label_column_name`` column."""
    try:
      self.X = data.drop(labels=label_column_name, axis=1)
      self.y = data[label_column_name]
      return self.X, self.y
    except Exception as err:
      raise Exception("Preprocessor.separate_label_feature failed: {}".format(err)) from err

  def isnull_present(self, data):
    """Report which columns of ``data`` contain missing values.

    Returns:
      tuple: ``(null_present, cols_with_missing_values)`` - a bool and the column
      names (in column order) that have at least one null.
    """
    self.null_present = False
    self.cols_with_missing_values = []
    self.cols = data.columns
    try:
      self.null_counts = data.isnull().sum()
      # Pair names and counts positionally; integer lookups on the label-indexed
      # Series (null_counts[i]) are not supported by recent pandas releases.
      self.cols_with_missing_values = [
        name for name, count in zip(self.cols, self.null_counts.tolist()) if count > 0]
      self.null_present = len(self.cols_with_missing_values) > 0
      return self.null_present, self.cols_with_missing_values
    except Exception as err:
      raise Exception("Preprocessor.isnull_present failed: {}".format(err)) from err

  def impute_missing_values(self, data, cols_with_missing_values):
    """Fill the listed columns with ``CategoricalImputer`` and return ``data``.

    The columns are overwritten on the frame that was passed in.
    """
    self.cols_with_missing_values = cols_with_missing_values
    self.data = data
    try:
      self.Imputer = CategoricalImputer()
      for col in self.cols_with_missing_values:
        self.data[col] = self.Imputer.fit_transform(self.data[col])
      return self.data
    except Exception as err:
      raise Exception("Preprocessor.impute_missing_values failed: {}".format(err)) from err

  def scale_numerical_feature(self, data, fit=True):
    """Standardise ``NUMERICAL_COLUMNS`` and return them followed by the remaining columns.

    Args:
      data: frame containing every column in ``NUMERICAL_COLUMNS``.
      fit: when True (default) a new ``StandardScaler`` is fitted on ``data``.
        Pass False to reuse the scaler fitted by an earlier call, e.g. for a
        held-out split that must be scaled with the training statistics.

    Returns:
      A new frame; the input frame is left untouched.
    """
    self.data = data
    self.num_df = self.data[self.NUMERICAL_COLUMNS]
    try:
      if fit:
        self.scaler = StandardScaler().fit(self.num_df)
      elif not hasattr(self, 'scaler'):
        raise ValueError("no fitted scaler available; call with fit=True first")
      self.scaled_data = self.scaler.transform(self.num_df)
      self.scaled_num_df = pd.DataFrame(data=self.scaled_data, columns=self.num_df.columns, index=self.data.index)
      # Build a new frame rather than dropping in place, so a caller's slice is not mutated.
      remaining = self.data.drop(columns=self.scaled_num_df.columns)
      self.data = pd.concat([self.scaled_num_df, remaining], axis=1)
      return self.data
    except Exception as err:
      raise Exception("Preprocessor.scale_numerical_feature failed: {}".format(err)) from err

  def handle_imbalanced_dataset(self, x, y):
    """Resample ``(x, y)`` with ``SMOTETomek`` using a sampling strategy of 0.75."""
    try:
      # Keyword argument and fit_resample: newer imbalanced-learn releases reject the
      # positional form and no longer provide fit_sample.
      self.os = SMOTETomek(sampling_strategy=0.75)
      self.x_sampled, self.y_sampled = self.os.fit_resample(x, y)
      return self.x_sampled, self.y_sampled
    except Exception as err:
      raise Exception("Preprocessor.handle_imbalanced_dataset failed: {}".format(err)) from err

  def encode_catrgorical_coulmns(self, data):
    """Encode every object-dtype column of ``data`` numerically.

    Columns in ``ORDINAL_MAPPINGS`` get their fixed codes, the label column (when
    present) is mapped to 0/1, and all other object columns are one-hot encoded
    with the first level dropped. The encoded columns come first in the result,
    followed by the non-object columns. The input frame is not modified.
    """
    self.data = data
    try:
      object_columns = self.data.select_dtypes(include=['object']).columns
      self.cat_df = self.data[object_columns].copy()
      for col, mapping in self.ORDINAL_MAPPINGS.items():
        self.cat_df[col] = self.cat_df[col].map(mapping)
      # The label only exists during training; skip it when encoding unlabeled data.
      if self.LABEL_COLUMN in self.cat_df.columns:
        self.cat_df[self.LABEL_COLUMN] = self.cat_df[self.LABEL_COLUMN].map(self.LABEL_MAPPING)
      self.cols_to_drop = list(self.ORDINAL_MAPPINGS) + [self.LABEL_COLUMN]

      one_hot_columns = [col for col in self.cat_df.columns if col not in self.cols_to_drop]
      if one_hot_columns:
        self.cat_df = pd.get_dummies(self.cat_df, columns=one_hot_columns, drop_first=True)
      self.data = pd.concat([self.cat_df, self.data.drop(columns=object_columns)], axis=1)
      return self.data
    except Exception as err:
      raise Exception("Preprocessor.encode_catrgorical_coulmns failed: {}".format(err)) from err


In [21]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier 
from sklearn.metrics  import roc_auc_score,accuracy_score, confusion_matrix, classification_report

In [28]:
class Model_Finder:
  """Tunes SVM, XGBoost and random-forest classifiers and returns the best scorer.

  The ``get_best_params_for_*`` methods grid-search one model family each and return
  a classifier refitted on the full training data with the winning parameters.
  """

  def __init__(self):
    self.sv_classifier = SVC()
    self.xgb = XGBClassifier(objective='binary:logistic', n_jobs=-1)
    self.RF = RandomForestClassifier()

  def _score_predictions(self, test_y, predictions):
    """Score hard class predictions: ROC AUC, or accuracy when ``test_y`` has one class.

    ``roc_auc_score`` raises when only one label is present, hence the fallback.
    """
    if len(np.unique(test_y)) == 1:
      return accuracy_score(test_y, predictions)
    return roc_auc_score(test_y, predictions)

  def get_best_params_for_xgboost(self, train_x, train_y):
    """Grid-search ``n_estimators`` / ``max_depth`` for XGBoost and return the refitted model."""
    try:
      # "criterion" is a scikit-learn tree option, not an XGBoost parameter, so it
      # is not part of the grid.
      self.param_grid_xgboost = {"n_estimators": [100, 130], "max_depth": [8, 9]}
      self.grid = GridSearchCV(XGBClassifier(objective='binary:logistic'), self.param_grid_xgboost, verbose=3, cv=10)
      self.grid.fit(train_x, train_y)
      self.max_depth = self.grid.best_params_['max_depth']
      self.n_estimators = self.grid.best_params_['n_estimators']
      self.xgb = XGBClassifier(objective='binary:logistic', max_depth=self.max_depth,
                               n_estimators=self.n_estimators, n_jobs=-1)
      self.xgb.fit(train_x, train_y)
      return self.xgb
    except Exception as e:
      raise Exception("Model_Finder.get_best_params_for_xgboost failed: {}".format(e)) from e

  def get_best_params_for_svm(self, train_x, train_y):
    """Grid-search ``kernel`` / ``C`` for an SVC and return the refitted model."""
    try:
      # random_state is not searched: SVC ignores it unless probability=True, so
      # tuning it only multiplies the number of fits.
      self.param_grid = {"kernel": ['rbf', 'sigmoid'], "C": [0.1, 0.5, 1.0]}
      self.grid = GridSearchCV(estimator=self.sv_classifier, param_grid=self.param_grid, cv=10, verbose=3)
      self.grid.fit(train_x, train_y)
      self.kernel = self.grid.best_params_['kernel']
      self.C = self.grid.best_params_['C']
      self.random_state = 0
      self.sv_classifier = SVC(kernel=self.kernel, C=self.C, random_state=self.random_state)
      self.sv_classifier.fit(train_x, train_y)
      return self.sv_classifier
    except Exception as e:
      raise Exception("Model_Finder.get_best_params_for_svm failed: {}".format(e)) from e

  def get_best_params_for_RF(self, train_x, train_y):
    """Grid-search a random forest and return the model refitted with the best parameters."""
    try:
      # Number of trees in random forest
      n_estimators = [int(x) for x in np.linspace(start=100, stop=700, num=5)]
      # Number of features to consider at every split
      # ('auto' is not accepted by newer scikit-learn and was equivalent to 'sqrt' for classifiers)
      max_features = ['sqrt', 'log2']
      # Maximum number of levels in tree
      max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
      # Minimum number of samples required to split a node
      min_samples_split = [2, 5, 10, 15, 100]
      # Minimum number of samples required at each leaf node
      min_samples_leaf = [1, 2, 5, 10]
      self.random_grid = {'n_estimators': n_estimators,
                          'max_features': max_features,
                          'max_depth': max_depth,
                          'min_samples_split': min_samples_split,
                          'min_samples_leaf': min_samples_leaf}
      self.grid = GridSearchCV(estimator=self.RF, param_grid=self.random_grid, cv=5, verbose=3)
      self.grid.fit(train_x, train_y)
      self.n_estimators = self.grid.best_params_["n_estimators"]
      self.max_features = self.grid.best_params_["max_features"]
      self.max_depth = self.grid.best_params_["max_depth"]
      self.min_samples_split = self.grid.best_params_["min_samples_split"]
      self.min_samples_leaf = self.grid.best_params_["min_samples_leaf"]
      self.rf_classifier = RandomForestClassifier(n_estimators=self.n_estimators, max_features=self.max_features,
                                                  max_depth=self.max_depth,
                                                  min_samples_split=self.min_samples_split,
                                                  min_samples_leaf=self.min_samples_leaf)
      self.rf_classifier.fit(train_x, train_y)
      return self.rf_classifier
    except Exception as e:
      raise Exception("Model_Finder.get_best_params_for_RF failed: {}".format(e)) from e

  def get_best_imbalance_model(self, train_x, test_x, train_y, test_y):
    """Fit a ``BalancedBaggingClassifier`` around ``self.RF`` and score it on the test split.

    Returns:
      tuple: ``('BalancedClassifier', fitted_model, score)``.
    """
    # The base estimator is passed positionally because its keyword name differs
    # between imbalanced-learn releases (base_estimator vs. estimator).
    self.rf_balanced_classifier = BalancedBaggingClassifier(self.RF,
                                                            sampling_strategy='auto',
                                                            replacement=False,
                                                            random_state=0)
    self.rf_balanced_classifier.fit(train_x, train_y)
    self.prediction_RF_bal = self.rf_balanced_classifier.predict(test_x)
    self.rf_balance_score = self._score_predictions(test_y, self.prediction_RF_bal)
    return 'BalancedClassifier', self.rf_balanced_classifier, self.rf_balance_score

  def get_best_model(self, train_x, test_x, train_y, test_y):
    """Tune XGBoost, SVM and random forest, score each on the test split, return the best.

    Returns:
      tuple: ``(name, fitted_model, score)`` with ``name`` one of ``'SVM'``,
      ``'XGBoost'`` or ``'Random Forest'``. Ties are resolved in that order.
    """
    try:
      self.xgboost = self.get_best_params_for_xgboost(train_x, train_y)
      self.prediction_xgboost = self.xgboost.predict(test_x)
      self.xgboost_score = self._score_predictions(test_y, self.prediction_xgboost)

      self.svm = self.get_best_params_for_svm(train_x, train_y)
      self.prediction_svm = self.svm.predict(test_x)
      self.svm_score = self._score_predictions(test_y, self.prediction_svm)

      self.RF = self.get_best_params_for_RF(train_x, train_y)
      # Score the forest with its own predictions (not the SVM's).
      self.prediction_RF = self.RF.predict(test_x)
      self.rf_score = self._score_predictions(test_y, self.prediction_RF)

      print("Prediction completed")

      # Compare the three models and return the highest scorer.
      good_score = max([self.svm_score, self.xgboost_score, self.rf_score])
      if self.svm_score == good_score:
        return 'SVM', self.svm, self.svm_score
      elif self.xgboost_score == good_score:
        return 'XGBoost', self.xgboost, self.xgboost_score
      else:
        return 'Random Forest', self.RF, self.rf_score
    except Exception as e:
      raise Exception("Model_Finder.get_best_model failed: {}".format(e)) from e

In [7]:
import pickle
from collections import Counter

In [31]:
class TrainModel:
  """End-to-end training run: preprocess, split, scale, resample, select and persist a model."""

  def __init__(self, data):
    """Keep the raw claims frame ``data`` for ``trainingModel``."""
    self.data = data

  def _scale_like_training(self, preprocessor, features):
    """Standardise ``features`` with the scaler ``preprocessor`` fitted on the training split.

    Uses the ``scaler`` and ``num_df`` attributes that ``scale_numerical_feature`` stores,
    and returns the scaled numeric columns followed by the remaining columns - the same
    layout ``scale_numerical_feature`` produces.
    """
    numeric_columns = list(preprocessor.num_df.columns)
    scaled = pd.DataFrame(preprocessor.scaler.transform(features[numeric_columns]),
                          columns=numeric_columns, index=features.index)
    return pd.concat([scaled, features.drop(columns=numeric_columns)], axis=1)

  def trainingModel(self, model_dir='/content/drive/MyDrive/Insurance Fraud Detection'):
    """Train all candidate models, pickle the best one and print its test metrics.

    Args:
      model_dir: directory the winning model is written to as ``best_model_<name>.sav``.

    Raises:
      Exception: if any stage fails; the original error is chained as ``__cause__``.
    """
    try:
      preprocessor = Preprocessor(self.data)
      data = preprocessor.remove_columns(self.data, columns = ['policy_number','_c39','policy_bind_date','policy_state','insured_zip','incident_location','incident_date','incident_state','incident_city','insured_hobbies','auto_make','auto_model','auto_year','age','total_claim_amount'])
      # "?" is the dataset's missing-value marker.
      data = data.replace("?", np.nan)
      is_null_present, cols_with_missing_values = preprocessor.isnull_present(data)
      if is_null_present:
        data = preprocessor.impute_missing_values(data, cols_with_missing_values)
      data = preprocessor.encode_catrgorical_coulmns(data)
      X, Y = preprocessor.separate_label_feature(data, label_column_name='fraud_reported')
      self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

      # Fit the scaler on the training split only, then apply those statistics to the
      # test split; refitting on the test split would scale the two sets differently.
      self.X_train = preprocessor.scale_numerical_feature(self.X_train)
      self.X_test = self._scale_like_training(preprocessor, self.X_test)

      # Use plain float arrays for every fit/predict call so models are never fitted
      # on a DataFrame and then asked to predict on an ndarray (or vice versa).
      test_matrix = np.asarray(self.X_test, dtype=float)

      print("The number of classes before fit {}".format(Counter(self.y_train)))
      model_finder = Model_Finder()
      balanced_name, balanced_model, balanced_score = model_finder.get_best_imbalance_model(
        np.asarray(self.X_train, dtype=float), test_matrix, self.y_train, self.y_test)

      self.X_train, self.y_train = preprocessor.handle_imbalanced_dataset(self.X_train, self.y_train)
      print("The number of classes after fit {}".format(Counter(self.y_train)))
      best_model_name, best_model, best_score = model_finder.get_best_model(
        np.asarray(self.X_train, dtype=float), test_matrix, self.y_train, self.y_test)
      print("Prediction completed")

      if best_score < balanced_score:
        best_model_name = balanced_name
        best_model = balanced_model
        best_score = balanced_score
      print("best model is {} ".format(best_model_name))

      filename = '{}/best_model_{}.sav'.format(model_dir.rstrip('/'), best_model_name)
      with open(filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
      # Round-trip check of the file just written. Only unpickle files you trust:
      # pickle.load can execute arbitrary code.
      with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

      y_predict = loaded_model.predict(test_matrix)
      print(confusion_matrix(self.y_test, y_predict))
      print(accuracy_score(self.y_test, y_predict))
      print(classification_report(self.y_test, y_predict))
    except Exception as e:
      raise Exception("Model training failed: {}".format(e)) from e


In [None]:
# Load the raw claims table and run the full training pipeline.
# Errors are left to propagate: wrapping them in a blank Exception() would hide
# the real cause and traceback.
data = pd.read_csv("/content/drive/MyDrive/Insurance Fraud Detection/insurance_claims.csv")
trainModelObj = TrainModel(data)
trainModelObj.trainingModel()