In [8]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [9]:
def load_data():
    # Load the dataset
    df = pd.read_csv('fraud_oracle.csv')

    # Remove duplicates and missing values
    df = df.drop_duplicates()
    df = df.dropna()
    return df

In [11]:
def clean_data(df):
    # Encode categorical variables
    le = LabelEncoder()
    cols_to_encode = ['Make', 'AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'AddressChange_Claim', 'BasePolicy']
    df[cols_to_encode] = df[cols_to_encode].apply(le.fit_transform)


# Convert string variables to floats
def convert_to_float(val):
    if isinstance(val, int):
        return float(val)
    elif isinstance(val, float):
        return val
    elif 'more than' in val:
        return float(re.findall(r'\d+', val)[0])
    elif 'to' in val:
        return sum(map(float, re.findall(r'\d+', val))) / 2
    elif 'years' in val:
        return float(re.findall(r'\d+', val)[0])
    else:
        return None

cols_to_convert = ['Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'NumberOfSuppliments', 'NumberOfCars', 'VehiclePrice']
df[cols_to_convert] = df[cols_to_convert].apply(lambda col: col.apply(convert_to_float))
df['Age'] = df['Age'].astype(float)

# Standardize numerical features
scaler = StandardScaler()
cols_to_scale = ['Age', 'Deductible', 'DriverRating', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'NumberOfSuppliments', 'NumberOfCars']
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# Create new features
df['TotalNumberOfClaims'] = df['PastNumberOfClaims'] + df['Days_Policy_Accident']
df['PolicyAge'] = 2023 - df['Year']


def build_models(X, y):
    # Support Vector Machine (SVM) Classifier
    svm_model = SVC(kernel='linear', random_state=42)
    svm_scores = cross_val_score(svm_model, X, y, cv=5)
    print("SVM Accuracy: %0.2f (+/- %0.2f)" % (svm_scores.mean(), svm_scores.std() * 2))


    # Decision Tree Classifier
    dt_model = DecisionTreeClassifier(random_state=42)
    dt_scores = cross_val_score(dt_model, X, y, cv=5)
    print("Decision Tree Accuracy: %0.2f (+/- %0.2f)" % (dt_scores.mean(), dt_scores.std() * 2))

NameError: name 'df' is not defined