## Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the raw CSV into a DataFrame; the path is relative to the notebook's working directory
df = pd.read_csv("data/fraud_oracle.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected
df.head()

In [None]:
# Summary statistics; with default arguments describe() reports only the numeric columns of a mixed-dtype frame
df.describe()

In [None]:
# Column names, non-null counts and dtypes
df.info()

In [None]:
# Print the distinct values found in every column
for name, values in df.items():
    print(name, ":\n", values.unique(), '\n')

In [None]:
# Show the distinct values of the two columns that the next step drops.
# A notebook cell only echoes its last bare expression, so each result is
# printed explicitly; otherwise the PolicyNumber values would be computed and
# then silently discarded.
for column in ('PolicyNumber', 'RepNumber'):
    print(column, ":\n", df[column].unique(), '\n')

In [None]:
# These columns are irrelevant for training the model and may lead to overfitting,
# so remove both of them in a single call
df = df.drop(columns=["PolicyNumber", "RepNumber"])

In [None]:
# Getting rid of non-logical values

# A zero in the "claimed" day/month columns is treated as missing.
# If these columns were read as text (object dtype), the placeholder is the
# string '0' and the integer 0 alone would never match it, so both spellings
# are replaced.
claimed_cols = ['DayOfWeekClaimed', 'MonthClaimed']
df[claimed_cols] = df[claimed_cols].replace([0, '0'], np.nan)

# Textual "nothing" markers become the string '0' so that the numeric
# extraction performed later can pick up a digit from them.
count_cols = ['Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'NumberOfSuppliments']
df[count_cols] = df[count_cols].replace('none', '0')
df['AgeOfVehicle'] = df['AgeOfVehicle'].replace('new', '0')
df['AddressChange_Claim'] = df['AddressChange_Claim'].replace('no change', '0')

In [None]:
# These columns use string values to represent numerical quantities, which does not help with model training.
# They must be in numerical form in order to be used in the model.
# NOTE(review): only the FIRST number in each label is kept, so two labels that
# start with the same number (e.g. "20000 to 29000" and "less than 20000")
# would collapse into one value -- verify against the printed unique values.
pattern = r'(\d+)'
columns = ['NumberOfCars','NumberOfSuppliments','AgeOfPolicyHolder','AgeOfVehicle','PastNumberOfClaims', 'Days_Policy_Claim', 'Days_Policy_Accident', 'VehiclePrice']

for column in columns:
    # Work on the string form so the regex also applies to values that were already numeric
    as_text = df[column].astype(str)

    # Extract the first run of digits from each value (NaN when there is none)
    first_number = as_text.str.extract(pattern, expand=False)

    # Convert to a numeric dtype; anything unparsable becomes NaN
    numeric = pd.to_numeric(first_number, errors='coerce')

    # Values without any digits default to 0. The result is assigned back
    # instead of calling fillna(inplace=True) on df[column]: that chained
    # in-place call acts on a temporary object, which triggers a warning on
    # recent pandas and leaves df unchanged under Copy-on-Write.
    df[column] = numeric.fillna(0)

    

In [None]:
# Re-print the distinct values of every column to inspect the current state of df
for name in df.columns:
    distinct = df[name].unique()
    print(name, ":\n", distinct, '\n')

In [None]:
# Count how many rows fall under each FraudFound_P value (class balance check)
fraud_counts = df['FraudFound_P'].value_counts()
print(fraud_counts)

In [None]:
# Inspect the distinct values present in the Age column
df['Age'].unique()

In [None]:
# Object-dtype columns are treated as the categorical features to plot;
# FraudFound_P is excluded because it is used as the hue
features = [i for i in df.columns if df[i].dtype == object and i != 'FraudFound_P']

# Calculate the number of rows from the number of features instead of
# hard-coding it: a fixed 8x3 grid raises IndexError as soon as there are more
# than 24 features and leaves blank rows when there are fewer.
num_cols = 3
num_rows = max(1, -(-len(features) // num_cols))  # ceiling division, at least one row

# Keep the height per row of the original 15x20 layout (20 / 8 rows = 2.5).
# squeeze=False guarantees a 2-D axes array even for a single row.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 2.5 * num_rows), squeeze=False)
plt.subplots_adjust(wspace=0.4, hspace=0.5)
flat_axes = axes.ravel()

# One countplot per feature, split by the fraud flag
for ax, feature in zip(flat_axes, features):
    sns.countplot(x=feature, data=df, ax=ax, hue='FraudFound_P')
    ax.set_title(f'Countplot of {feature}')
    ax.set_xlabel('')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# Hide any empty subplots left in the last row
for ax in flat_axes[len(features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# For every categorical feature, print each value's share of all rows (in percent)
separator = "=" * 50
for feature in features:
    counts = df[feature].value_counts()
    percentages = (counts / len(df) * 100).round(2)
    print(percentages)
    print(separator)

### Removing Irrelevant Features based on Frequency

In [None]:
# Work on an independent copy of the frame
df = df.copy(deep=True)

# Pass 1: for each categorical feature, drop the ROWS whose value makes up
# 8% or less of the rows that are still present. The features are processed
# in order, so each percentage is computed on the already-filtered frame.
for feature in features:
    share = (df[feature].value_counts() / len(df) * 100).round(2)
    rare_values = share[share <= 8].index
    keep_mask = ~df[feature].isin(rare_values)
    df = df[keep_mask]

# Pass 2: a feature left with at most one distinct value carries no
# information any more, so drop those columns
single_valued = [name for name in features if df[name].nunique() <= 1]
df = df.drop(columns=single_valued)

In [None]:
# Recompute the categorical (object-dtype) features from the current df;
# FraudFound_P is excluded because it is used as the hue
features = [i for i in df.columns if df[i].dtype == object and i != 'FraudFound_P']

# Derive the grid height from the number of features rather than fixing it at
# 8 rows: this avoids an IndexError with more than 24 features and avoids
# blank rows when only a few features are present.
num_cols = 3
num_rows = max(1, -(-len(features) // num_cols))  # ceiling division, at least one row

# Same height per row as the original 15x20 layout (20 / 8 rows = 2.5).
# squeeze=False keeps the axes array 2-D even when there is a single row.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 2.5 * num_rows), squeeze=False)
plt.subplots_adjust(wspace=0.4, hspace=0.5)
flat_axes = axes.ravel()

# One countplot per feature, split by the fraud flag
for ax, feature in zip(flat_axes, features):
    sns.countplot(x=feature, data=df, ax=ax, hue='FraudFound_P')
    ax.set_title(f'Countplot of {feature}')
    ax.set_xlabel('')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# Hide any empty subplots left in the last row
for ax in flat_axes[len(features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()