In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#use seaborn's dark grid theme for the plots that follow
sns.set_theme(style="darkgrid")

#location of the car insurance claims CSV inside the Kaggle input directory
DATA_PATH = "/kaggle/input/car-insurance-data/Car_Insurance_Claim.csv"
data = pd.read_csv(DATA_PATH)

#preview the first rows of the raw data
data.head()

In [None]:
#show summary statistics for the data
data.describe()
#author's observation: credit score and mileage contain nulls (they are imputed in a later cell)

In [None]:
#count the null values in every column
data.isnull().sum()
#author's observation: as expected, mileage and credit score have almost 10% null values

In [None]:
#impute the missing values of each listed column with that column's own mean
for column in ("CREDIT_SCORE", "ANNUAL_MILEAGE"):
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

#show the summary statistics again after imputation
data.describe()

In [None]:
#histogram of CREDIT_SCORE with a KDE overlay, one panel per OUTCOME value
#(per the author, OUTCOME = 1 means an insurance claim was filed)
sns.displot(data=data, x="CREDIT_SCORE", col="OUTCOME", kde=True)

In [None]:
#histogram of SPEEDING_VIOLATIONS with a KDE overlay, one panel per INCOME category
#NOTE(review): SPEEDING_VIOLATIONS looks like a count column; a KDE over discrete counts can be misleading - confirm
sns.displot(data=data, x="SPEEDING_VIOLATIONS", col="INCOME", kde=True)
#author's observation: the upper class category has more people overall, and has also received more speeding violations

In [None]:
#box plot of CREDIT_SCORE for each INCOME category
sns.catplot(data=data, kind="box", x="INCOME", y="CREDIT_SCORE")

In [None]:
#strip plot of CREDIT_SCORE for each INCOME category, with points colored by OUTCOME
sns.stripplot(data=data, x="INCOME", y="CREDIT_SCORE", hue="OUTCOME", linewidth=1)

In [None]:
#strip plot of CREDIT_SCORE for each EDUCATION category, with points colored by OUTCOME
sns.stripplot(data=data, x="EDUCATION", y="CREDIT_SCORE", hue="OUTCOME", linewidth=1)

In [None]:
#box plot of ANNUAL_MILEAGE for each OUTCOME value (claim filed or not, per the author)
sns.catplot(data=data, kind="box", x="OUTCOME", y="ANNUAL_MILEAGE")

In [None]:
#encode every categorical column as integer codes
from sklearn import preprocessing

#columns to encode, in the order they are processed
CATEGORICAL_COLUMNS = [
    "AGE",
    "GENDER",
    "RACE",
    "DRIVING_EXPERIENCE",
    "EDUCATION",
    "INCOME",
    "VEHICLE_YEAR",
    "VEHICLE_TYPE",
]

#one encoder is re-fitted for each column, so after the loop it only holds the classes of the last column
#NOTE(review): LabelEncoder numbers labels in sorted order, which may not match the natural order of
#ordinal columns such as INCOME or EDUCATION - confirm this is acceptable for the model
le = preprocessing.LabelEncoder()
for column in CATEGORICAL_COLUMNS:
    data[column] = le.fit_transform(data[column])

#clean dataset of NaN or Inf to avoid ValueError when training model later
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    The input frame is left untouched; the surviving rows keep their
    original index labels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. Every column must be convertible to float64.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that contain no missing or infinite value,
        with every column cast to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a DataFrame.
    """
    assert isinstance(df, pd.DataFrame)
    #dropna returns a new frame, so the caller's DataFrame is not mutated
    without_nan = df.dropna()
    #NaN rows are already gone, only +/-inf is left to filter;
    #axis must be passed by keyword: pandas 2.0 no longer accepts it positionally
    has_inf = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan.loc[~has_inf].astype(np.float64)
#remove unusable rows and cast every column to float before modelling
data = clean_dataset(data)

#keep OUTCOME aside as the target class variable, then drop it from the features
#together with ID and POSTAL_CODE, which are not considered useful for now
target = data["OUTCOME"]
data = data.drop(columns=["ID", "POSTAL_CODE", "OUTCOME"])

#scale the remaining numeric features with StandardScaler
#NOTE(review): the scaler is fitted on every row, before the train/test split in the next cell,
#so statistics of the test rows influence the scaling - consider fitting on the training split only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#keep the original row labels: clean_dataset may have dropped rows, and a fresh RangeIndex
#would no longer line up with the index of target
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

data.head()

In [None]:
#hold out 30% of the rows for evaluation, with a fixed random_state so the split is repeatable
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=333,
)

#fit a Gaussian Naive Bayes classifier and predict the held-out rows
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

#report the classification metrics on the test split
from sklearn import metrics

#predicted probability of class 1 (second column of predict_proba), used for ROC AUC
positive_proba = model.predict_proba(X_test)[:, 1]

#metrics that compare the hard predictions against the true labels
for label, score_fn in (
    ("Overall Accuracy:", metrics.accuracy_score),
    ("Recall:", metrics.recall_score),
    ("Precision:", metrics.precision_score),
    ("F1 Score:", metrics.f1_score),
):
    print(label, score_fn(y_test, y_pred))
print("ROC AUC:", metrics.roc_auc_score(y_test, positive_proba))
print(metrics.classification_report(y_test, y_pred))
#author's reading of the output: high recall with low precision indicates the model
#overestimated the number of people who would file an insurance claim

In [None]:
#plot the ROC curve of the fitted model on the test split
#metrics.plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
#(available since scikit-learn 1.0) is its replacement
metrics.RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.show()