<a href="https://colab.research.google.com/github/sara-gregori/Coding-project/blob/main/Gregori_projectwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The following code is based on a CSV dataset containing 5000 insurance claims. Although the dataset is realistic (532 fraudulent cases out of 5000 total claims), it is not based on real-world data: it was generated by AI for the sole purpose of providing a worked coding example of the model.
The dataset is AI-generated because information about fraud is sensitive and proprietary to insurance companies.

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV


In [None]:
# Read the insurance claims dataset from a CSV file into a DataFrame.
csv_filename = "insurance_fraud_dataset.csv"
df = pd.read_csv(csv_filename)

# Quick inspection: the first rows, followed by the dtype pandas inferred
# for each column.
for preview in (df.head(), df.dtypes):
    print(preview)


   client_id  policy_id  claim_id  claim_amount accident_date  \
0       8270      59036         1  29671.916425    2020-01-01   
1       1860      68531         2  26494.472163    2020-01-02   
2       6390      22940         3  10548.181180    2020-01-03   
3       6191      26932         4  22867.517710    2020-01-04   
4       6734      66827         5  39948.901449    2020-01-05   

  policy_activation_date  num_past_claims  vehicle_age  \
0             2018-01-01                3           11   
1             2018-01-02                1           18   
2             2018-01-03                3           15   
3             2018-01-04                4            3   
4             2018-01-05                1            7   

   fraud_suspicious_text_score  image_fraud_score  connection_score  \
0                     0.416080           0.629813          0.011036   
1                     0.914068           0.218674          0.654662   
2                     0.824163           0.6701

In [None]:
# Parse the two date columns into datetime values.
accident_dates = pd.to_datetime(df['accident_date'])
activation_dates = pd.to_datetime(df['policy_activation_date'])

# Fixed origin used to express each date as a number of elapsed days.
reference_date = pd.to_datetime("2000-01-01")

# Replace the original date columns with numeric features:
#   accident_days          - days from the reference date to the accident
#   policy_activation_days - days from the reference date to policy activation
#   policy_duration        - days between policy activation and the accident
df = (
    df.drop(columns=['accident_date', 'policy_activation_date'])
      .assign(
          accident_days=(accident_dates - reference_date).dt.days,
          policy_activation_days=(activation_dates - reference_date).dt.days,
          policy_duration=(accident_dates - activation_dates).dt.days,
      )
)

# Confirm the resulting column dtypes.
print(df.dtypes)


client_id                        int64
policy_id                        int64
claim_id                         int64
claim_amount                   float64
num_past_claims                  int64
vehicle_age                      int64
fraud_suspicious_text_score    float64
image_fraud_score              float64
connection_score               float64
fraud_label                      int64
accident_days                    int64
policy_activation_days           int64
policy_duration                  int64
dtype: object


In [None]:
# Target (y) is the fraud flag; features (X) are all remaining columns except
# the claim/client/policy identifier columns, which are excluded from the
# model inputs.
y = df['fraud_label']
X = df.drop(columns=['fraud_label', 'claim_id', 'client_id', 'policy_id'])

# 70% training / 30% testing split. stratify=y keeps the class proportions
# the same in both parts; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Check that the class proportions in the training set are as expected.
print("Class distribution in the training set:")
print(y_train.value_counts(normalize=True))


Class distribution in the training set:
fraud_label
0    0.893714
1    0.106286
Name: proportion, dtype: float64


In [None]:
# Hyperparameter values to try (2 * 3 * 3 * 3 * 2 * 2 = 216 combinations).
param_grid = {
    'n_estimators': [50, 70],
    'max_depth': [5, 7, 9],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 5, 10],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Base model. class_weight="balanced" weights the classes inversely to their
# frequency so the rare fraud class is not ignored during training.
rf = RandomForestClassifier(random_state=42, class_weight="balanced")

# Exhaustive grid search with 5-fold cross-validation.
# scoring="f1" is set explicitly: without it GridSearchCV ranks candidates by
# accuracy, which on a target with roughly 10% fraud cases rewards models that
# simply predict "not fraud". F1 on the fraud class (label 1) balances
# precision and recall for the class we actually care about.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search: every candidate is fitted on each fold, then (refit=True by
# default) the best combination is refitted on the whole training set.
grid_search.fit(X_train, y_train)

# Best hyperparameters found and their mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print(f"Best cross-validated F1: {grid_search.best_score_:.4f}")

# Model refitted on the full training set with the best hyperparameters.
best_rf = grid_search.best_estimator_


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best hyperparameters: {'bootstrap': True, 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 70}


In [None]:
# Predict the class of every claim in the test set (1 = fraud, 0 = not fraud)
# and show the first 50 predictions as a quick sanity check.
y_pred = best_rf.predict(X_test)
first_predictions = y_pred[:50]
print(first_predictions)
# In the recorded run, 5 of these first 50 claims are identified as frauds (value 1).

[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0]


In [None]:
# Evaluate the tuned model on the held-out test set.
y_pred = best_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

# Accuracy alone is misleading on an imbalanced target: a model that always
# predicts the most frequent class already reaches the accuracy printed below,
# so the model's accuracy must be read against this baseline.
majority_baseline = y_test.value_counts(normalize=True).max()
print(f"Majority-class baseline accuracy: {majority_baseline:.4f}")

# Confusion matrix: how many claims of each actual class were predicted as
# each class. labels=[0, 1] fixes the row/column order.
print("Confusion matrix (rows = actual, columns = predicted; order = [0, 1]):")
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

# Per-class precision, recall and F1. zero_division=0 avoids a warning if the
# model predicts no samples for one of the classes.
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=["not fraud (0)", "fraud (1)"],
        digits=4,
        zero_division=0,
    )
)

Model accuracy: 0.8573
