In [1]:
import os

In [2]:
%pwd

'd:\\vehicle_insurance_fraud_detection\\research'

In [3]:
# Step up one directory level from the notebook's working directory so that the
# relative paths used by later cells (the `src.vifd` package, the config file,
# the artifacts directory) resolve.
# NOTE: this is not idempotent -- every re-run of this cell climbs one more
# level, so run it exactly once per kernel session.
os.chdir("../")

In [4]:
%pwd

'd:\\vehicle_insurance_fraud_detection'

In [35]:
from dataclasses import dataclass
from pathlib import Path 

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are frozen, so the fields cannot be reassigned after construction.
    """

    # Directory the transformation stage writes its CSV outputs into.
    root_dir: Path
    # Location of the CSV file the transformation stage reads as input.
    data_path: Path

In [36]:
from src.vifd.constants import *
from src.vifd.utils.common import read_yaml, create_directories

In [37]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the three YAML files are read with ``read_yaml`` and the
    top-level artifacts directory named in the main config is created.
    """

    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            # NOTE(review): this default is the same file as ``config_filepath``,
            # so ``self.params`` is a second copy of the main config (the log
            # shows the config file being loaded twice). Presumably a dedicated
            # params-file constant was intended -- confirm against
            # ``src.vifd.constants`` before changing it.
            params_filepath=CONFIG_FILE_PATH,
            schema_filepath=SCHEMA_FILE_PATH
    ):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's output directory as a side effect.

        Returns:
            A ``DataTransformationConfig`` whose fields are real ``Path``
            objects, matching the dataclass' declared field types.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        # The YAML loader hands back plain values; convert them so the config
        # object actually carries the ``Path`` instances its annotations promise.
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
        )
    

In [38]:
import os
from src.vifd import logger
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTENC

In [43]:
class DataTransformation:
    """Turn the raw claims CSV into train/test splits, plain and re-balanced.

    The pipeline one-hot encodes the categorical columns, joins them with the
    numerical columns, splits into train/test, standardises the numerical
    columns, and finally oversamples the training split with SMOTENC.
    """

    # Columns treated as continuous features (these get standardised).
    NUMERICAL_COLUMNS = (
        'months_as_customer', 'policy_deductable', 'policy_annual_premium',
        'umbrella_limit', 'number_of_vehicles_involved', 'bodily_injuries',
        'witnesses', 'total_claim_amount', 'injury_claim', 'property_claim',
        'vehicle_claim', 'auto_year',
    )

    # Columns that get one-hot encoded. The raw label column is included, so
    # the encoded frame also carries the target column named below.
    CATEGORICAL_COLUMNS = (
        'policy_state', 'policy_csl', 'incident_type',
        'incident_severity', 'authorities_contacted',
        'incident_state', 'incident_city',
        'police_report_available', 'auto_make', 'auto_model', 'fraud_reported',
    )

    # Name the one-hot encoder gives to the encoded label column.
    TARGET_COLUMN = 'fraud_reported_Y'

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage settings (``data_path`` to read, ``root_dir`` to write)."""
        self.config = config

    def load_data(self):
        """Read the raw dataset from ``config.data_path`` into a DataFrame."""
        return pd.read_csv(self.config.data_path)

    def encode_categorical_columns(self, data):
        """One-hot encode the categorical columns of ``data``.

        Args:
            data: Raw DataFrame containing every column in ``CATEGORICAL_COLUMNS``.

        Returns:
            A dense DataFrame of encoded columns, named by the encoder and
            indexed like ``data``.
        """
        enc = OneHotEncoder(handle_unknown='ignore', drop='first')
        encoded = enc.fit_transform(data[list(self.CATEGORICAL_COLUMNS)]).toarray()
        # Reuse the source index: the result is concatenated column-wise with
        # ``data`` later, and concat aligns on index labels, not on position.
        return pd.DataFrame(encoded, columns=enc.get_feature_names_out(), index=data.index)

    def preprocess_data(self, data, cat_enc_data):
        """Join numerical and encoded columns and drop incomplete rows.

        Args:
            data: Raw DataFrame containing every column in ``NUMERICAL_COLUMNS``.
            cat_enc_data: Encoded categorical columns for the same rows.

        Returns:
            A new DataFrame (numerical columns first) without any row that
            contains a missing value.
        """
        combined = pd.concat([data[list(self.NUMERICAL_COLUMNS)], cat_enc_data], axis=1)
        return combined.dropna()

    def scale_numerical_features(self, X_train, X_test, numerical_columns):
        """Standardise the numerical columns of both splits in place.

        The scaler is fitted on the training split only and the same fitted
        statistics are applied to the test split. Fitting a second time on the
        test split would leak test-set statistics and put the two splits on
        different scales.

        Args:
            X_train: Training features; its numerical columns are overwritten.
            X_test: Test features; its numerical columns are overwritten.
            numerical_columns: Names of the columns to standardise.
        """
        columns = list(numerical_columns)
        scaler = StandardScaler()

        train_scaled = pd.DataFrame(scaler.fit_transform(X_train[columns]),
                                    columns=columns, index=X_train.index)
        test_scaled = pd.DataFrame(scaler.transform(X_test[columns]),
                                   columns=columns, index=X_test.index)

        for col in columns:
            X_train[col] = train_scaled[col]
            X_test[col] = test_scaled[col]

    def train_test_split(self, df):
        """Split ``df`` into stratified 70/30 train and test sets.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where the target is the
            ``TARGET_COLUMN`` column of ``df``.
        """
        features = df.drop(columns=self.TARGET_COLUMN)
        target = df[self.TARGET_COLUMN]
        # Inside a method body this name resolves to the module-level helper
        # imported from scikit-learn, not to this method.
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.3, random_state=42, stratify=target
        )
        return X_train, X_test, y_train, y_test

    def save_data(self, X_train, X_test, y_train, y_test, part_name):
        """Write the four splits as ``<part_name>_<split>.csv`` under ``config.root_dir``."""
        splits = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
        }
        for split_name, frame in splits.items():
            target_path = os.path.join(self.config.root_dir, f"{part_name}_{split_name}.csv")
            frame.to_csv(target_path, index=False)

    def train_test_splitting(self):
        """Run the whole stage and write both the plain and the balanced splits.

        Files are written with the prefixes ``split`` (after scaling) and
        ``balanced_split`` (training split oversampled with SMOTENC; the test
        split is written unchanged). Any exception propagates to the caller.
        """
        data = self.load_data()
        cat_enc_data = self.encode_categorical_columns(data)
        df = self.preprocess_data(data, cat_enc_data)

        X_train, X_test, y_train, y_test = self.train_test_split(df)

        self.scale_numerical_features(X_train, X_test, list(self.NUMERICAL_COLUMNS))

        self.save_data(X_train, X_test, y_train, y_test, "split")

        # Every feature that is not a numerical column is a one-hot indicator.
        # Derive their positions from the column names so SMOTENC treats all of
        # them as categorical, however many levels the data happens to contain,
        # instead of relying on a fixed positional range.
        categorical_positions = [
            position for position, column in enumerate(X_train.columns)
            if column not in self.NUMERICAL_COLUMNS
        ]
        sm = SMOTENC(categorical_features=categorical_positions, random_state=123, sampling_strategy=.6)

        X_train_re, y_train_re = sm.fit_resample(X_train, y_train)

        self.save_data(X_train_re, X_test, y_train_re, y_test, "balanced_split")

        logger.info("Data split and balanced successfully.")


In [44]:
# Run the data-transformation stage: load the settings, build the stage's
# config object, then produce and save the train/test splits.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_splitting()

except Exception:
    # Record the full traceback through the project logger, then let the
    # original exception propagate unchanged.
    logger.exception("Data transformation stage failed.")
    raise

[2024-01-01 03:09:58,019: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-01 03:09:58,019: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-01-01 03:09:58,027: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-01-01 03:09:58,029: INFO: common: created directory at: artifacts]
[2024-01-01 03:09:58,031: INFO: common: created directory at: artifacts/data_transformation]
[2024-01-01 03:09:58,226: INFO: 429273344: Data split and balanced successfully.]


  if self.categorical_features == "auto":
