# Insurance Claim Prediction

## *Importing libraries*

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE
import warnings
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

## *Loading the Dataset*

In [3]:
df = pd.read_csv('/content/train.csv')

In [4]:
df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9.0,1.0,5.0,8.0,0.0,1.0,1.0,0.0,0.0,1.0
1,9,0,1,1,7,0,0,0,0,1,...,3.0,1.0,1.0,9.0,0.0,1.0,1.0,0.0,1.0,0.0
2,13,0,5,4,9,1,0,0,0,1,...,4.0,2.0,7.0,7.0,0.0,1.0,1.0,0.0,1.0,0.0
3,16,0,0,1,2,0,0,1,0,0,...,2.0,2.0,4.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,0,0,2,0,1,0,1,0,0,...,3.0,1.0,1.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0


## *Basic Check*

In [5]:
df.describe()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,54189.0,54189.0,54189.0,54189.0,54189.0,54189.0,54189.0,54189.0,54189.0,54189.0,...,54188.0,54188.0,54188.0,54188.0,54188.0,54188.0,54188.0,54188.0,54188.0,54188.0
mean,67964.929875,0.036631,1.901807,1.354205,4.41501,0.417188,0.406706,0.392829,0.25579,0.163982,...,5.463479,1.444803,2.874806,7.528364,0.122942,0.631191,0.556212,0.288219,0.350963,0.154241
std,39165.995506,0.187856,1.986736,0.658812,2.694641,0.493398,1.353058,0.488384,0.436308,0.370262,...,2.347443,1.203797,1.68934,2.751524,0.328374,0.482486,0.496835,0.452938,0.477276,0.361183
min,7.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,34090.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,67856.0,0.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,...,5.0,1.0,3.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,101912.0,0.0,3.0,2.0,6.0,1.0,0.0,1.0,1.0,0.0,...,7.0,2.0,4.0,9.0,0.0,1.0,1.0,1.0,1.0,0.0
max,135982.0,1.0,7.0,4.0,11.0,1.0,6.0,1.0,1.0,1.0,...,19.0,9.0,12.0,22.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54189 entries, 0 to 54188
Data columns (total 59 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              54189 non-null  int64  
 1   target          54189 non-null  int64  
 2   ps_ind_01       54189 non-null  int64  
 3   ps_ind_02_cat   54189 non-null  int64  
 4   ps_ind_03       54189 non-null  int64  
 5   ps_ind_04_cat   54189 non-null  int64  
 6   ps_ind_05_cat   54189 non-null  int64  
 7   ps_ind_06_bin   54189 non-null  int64  
 8   ps_ind_07_bin   54189 non-null  int64  
 9   ps_ind_08_bin   54189 non-null  int64  
 10  ps_ind_09_bin   54189 non-null  int64  
 11  ps_ind_10_bin   54189 non-null  int64  
 12  ps_ind_11_bin   54189 non-null  int64  
 13  ps_ind_12_bin   54189 non-null  int64  
 14  ps_ind_13_bin   54189 non-null  int64  
 15  ps_ind_14       54189 non-null  int64  
 16  ps_ind_15       54189 non-null  int64  
 17  ps_ind_16_bin   54189 non-null 

In [7]:
df.isnull().sum()

Unnamed: 0,0
id,0
target,0
ps_ind_01,0
ps_ind_02_cat,0
ps_ind_03,0
ps_ind_04_cat,0
ps_ind_05_cat,0
ps_ind_06_bin,0
ps_ind_07_bin,0
ps_ind_08_bin,0


In [8]:
df.shape

(54189, 59)

In [9]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,52204
1,1985


## *Seperate features and target*

In [10]:
X = df.drop('target', axis=1)
y = df['target']

In [17]:
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [18]:
df.isnull().sum()

Unnamed: 0,0
id,0
target,0
ps_ind_01,0
ps_ind_02_cat,0
ps_ind_03,0
ps_ind_04_cat,0
ps_ind_05_cat,0
ps_ind_06_bin,0
ps_ind_07_bin,0
ps_ind_08_bin,0


In [12]:
y

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
54184,0
54185,0
54186,0
54187,0


## *Feature Scaling*

In [20]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

## *Train-Test split*

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

## *Applying SMOTE on data*

In [22]:
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

- ## Model Training and Evaluation

## *Initialising the model*

## *Training the model*

## *1. Logistic Regression*

In [24]:
## without SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, precision_recall_curve, auc, confusion_matrix
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression - Accuracy: 0.9635541612843698
Confusion Matrix:
 [[10443     0]
 [  395     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     10443
           1       0.00      0.00      0.00       395

    accuracy                           0.96     10838
   macro avg       0.48      0.50      0.49     10838
weighted avg       0.93      0.96      0.95     10838



In [25]:
model_lr = LogisticRegression()
model_lr.fit(X_smote, y_smote)
y_pred_lr = model_lr.predict(X_test)
print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression - Accuracy: 0.5874700129175124
Confusion Matrix:
 [[6145 4298]
 [ 173  222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.59      0.73     10443
           1       0.05      0.56      0.09       395

    accuracy                           0.59     10838
   macro avg       0.51      0.58      0.41     10838
weighted avg       0.94      0.59      0.71     10838



## *2. Decision Tree*

In [26]:
model_dt = DecisionTreeClassifier(class_weight='balanced')
model_dt.fit(X_smote, y_smote)
y_pred_dt = model_dt.predict(X_test)
print("Decision Tree - Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))

Decision Tree - Accuracy: 0.9133603985975273
Confusion Matrix:
 [[9862  581]
 [ 358   37]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95     10443
           1       0.06      0.09      0.07       395

    accuracy                           0.91     10838
   macro avg       0.51      0.52      0.51     10838
weighted avg       0.93      0.91      0.92     10838



## *3. Random Forest*

In [27]:
model_rf = RandomForestClassifier()
model_rf.fit(X_smote, y_smote)
y_pred_rf = model_rf.predict(X_test)
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest - Accuracy: 0.9635541612843698
Confusion Matrix:
 [[10443     0]
 [  395     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     10443
           1       0.00      0.00      0.00       395

    accuracy                           0.96     10838
   macro avg       0.48      0.50      0.49     10838
weighted avg       0.93      0.96      0.95     10838



In [28]:
## *4. Gradient Boosting*

In [29]:
model_gb = GradientBoostingClassifier()
model_gb.fit(X_smote, y_smote)
y_pred_gb = model_gb.predict(X_test)
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))

Random Forest - Accuracy: 0.9635541612843698
Confusion Matrix:
 [[10443     0]
 [  395     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     10443
           1       0.00      0.00      0.00       395

    accuracy                           0.96     10838
   macro avg       0.48      0.50      0.49     10838
weighted avg       0.93      0.96      0.95     10838



In [30]:
model_xgb = XGBClassifier()
model_xgb.fit(X_smote, y_smote)
y_pred_xgb = model_xgb.predict(X_test)
print("XGBoost - Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

XGBoost - Accuracy: 0.9637386971766009
Confusion Matrix:
 [[10442     1]
 [  392     3]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     10443
           1       0.75      0.01      0.02       395

    accuracy                           0.96     10838
   macro avg       0.86      0.50      0.50     10838
weighted avg       0.96      0.96      0.95     10838



In [32]:
# Evaluate Models using Precision-Recall AUC
def evaluate_model_pr_auc(model, X_test, y_test):
    y_pred = model.predict(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    auc_score = auc(recall, precision)
    return auc_score

# Evaluate models using precision-recall curve AUC for both SMOTE and ADASYN
def evaluate_pr_auc(models, X_smote, y_smote):
    pr_auc_scores = {}
    for model_name, model in models.items():
        model.fit(X_smote, y_smote)
        pr_auc_scores[model_name] = evaluate_model_pr_auc(model, X_test, y_test)
    return pr_auc_scores

# Define the models dictionary here
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')  # Include XGBoost
}


pr_auc_smote = evaluate_pr_auc(models, X_smote, y_smote)

# Convert PR AUC scores to DataFrame for better visualization
pr_auc_df_smote = pd.DataFrame(pr_auc_smote, index=["PR AUC"]).T
print("PR AUC with SMOTE:")
print(pr_auc_df_smote)

PR AUC with SMOTE:
                       PR AUC
Logistic Regression  0.313551
Decision Tree        0.094874
Random Forest        0.518223
Gradient Boosting    0.518223
XGBoost              0.396882


- Random Forest and Gradient Boosting are performing relatively well for this imbalanced dataset, while Logistic Regression and XGBoost might require further tuning or alternative techniques like hyperparameter tuning, feature engineering, or class weighting adjustments.

## *Applying hyperparameter tuning on XGBoost*

In [33]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'scale_pos_weight': [(len(y_smote) - sum(y_smote)) / sum(y_smote)]
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_smote, y_smote)
print("Best Parameters:", grid_search.best_params_)
print("Best AUC Score:", grid_search.best_score_)
best_xgb_model = grid_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)
y_pred_proba = best_xgb_model.predict_proba(X_test)[:, 1]
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150, 'scale_pos_weight': 1.0, 'subsample': 0.6}
Best AUC Score: 0.9883470629240154
ROC AUC Score: 0.5987401166307271
Confusion Matrix:
 [[10443     0]
 [  395     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     10443
           1       0.00      0.00      0.00       395

    accuracy                           0.96     10838
   macro avg       0.48      0.50      0.49     10838
weighted avg       0.93      0.96      0.95     10838

