In [None]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

class HiddenNaiveBayes:
    """Categorical naive Bayes classifier with additive (Laplace) smoothing.

    Every feature is treated as a discrete variable: likelihoods are estimated
    per (feature index, exact value) pair, so a column only contributes useful
    evidence when its values recur between training and prediction.

    Despite the class name, no hidden-parent or feature-dependency terms are
    computed here; the model multiplies independent per-feature likelihoods
    with the class prior.

    Attributes are populated by ``train``:
      * ``classes``: sorted array of distinct labels.
      * ``class_priors``: ``{label: P(label)}``.
      * ``feature_likelihoods``: ``{label: {(feature, value): P(value | label)}}``.
      * ``unseen_likelihoods``: ``{label: {feature: probability}}`` used for
        values that the class never observed for that feature.
    """

    def __init__(self, alpha=1.0):
        """Create an untrained classifier.

        Args:
            alpha: Additive smoothing strength. ``0`` disables smoothing, in
                which case a value never observed for a class gets probability
                zero and rules that class out.

        Raises:
            ValueError: If ``alpha`` is negative.
        """
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.classes = None
        self.class_priors = None
        self.feature_likelihoods = None
        self.unseen_likelihoods = None

    def train(self, X, y):
        """Estimate class priors and per-feature likelihoods.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples labels.
        """
        # Coerce so that the boolean mask ``y == label`` works for plain lists too.
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.class_priors = self.calculate_class_priors(y)
        self.feature_likelihoods = self.calculate_feature_likelihoods(X, y)

    def calculate_class_priors(self, y):
        """Return ``{label: fraction of samples in y carrying that label}``."""
        total_samples = len(y)
        return {
            label: count / total_samples
            for label, count in Counter(y).items()
        }

    def calculate_feature_likelihoods(self, X, y):
        """Return smoothed ``P(value | label)`` tables for every feature.

        For a class with ``n`` samples and a feature with ``k`` distinct
        training values, a value observed ``c`` times in that class gets
        ``(c + alpha) / (n + alpha * (k + 1))``. The extra bin is reserved for
        values that never appeared in training. The probability assigned to
        values the class did not observe is stored in
        ``self.unseen_likelihoods`` as a side effect.

        Requires ``self.classes`` to be set.

        Returns:
            ``{label: {(feature, value): probability}}``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        alpha = self.alpha
        n_features = X.shape[1]

        # Distinct training values per feature, plus one bin for unseen values.
        n_bins = [
            len(set(X[:, feature].tolist())) + 1
            for feature in range(n_features)
        ]

        feature_likelihoods = {}
        unseen_likelihoods = {}
        for label in self.classes:
            relevant_samples = X[y == label]
            n_class = len(relevant_samples)
            table = {}
            floor = {}

            for feature in range(n_features):
                denominator = n_class + alpha * n_bins[feature]
                value_counts = Counter(relevant_samples[:, feature])
                for value, count in value_counts.items():
                    table[feature, value] = (count + alpha) / denominator
                # Guard the 0/0 case (alpha == 0 and a label with no samples).
                floor[feature] = alpha / denominator if denominator else 0.0

            feature_likelihoods[label] = table
            unseen_likelihoods[label] = floor

        self.unseen_likelihoods = unseen_likelihoods
        return feature_likelihoods

    def predict(self, X):
        """Return the most probable label for each sample in ``X``.

        Args:
            X: Iterable of samples, each an iterable of feature values in the
                same column order used for training.

        Returns:
            A list with one predicted label per sample. Ties go to the label
            that comes first in ``self.classes``.

        Raises:
            RuntimeError: If called before ``train``.
        """
        if self.classes is None:
            raise RuntimeError("train() must be called before predict()")

        # The log priors do not depend on the sample, so compute them once.
        log_priors = {
            label: np.log(self.class_priors[label]) for label in self.classes
        }

        predictions = []
        for sample in X:
            scores = {}
            for label in self.classes:
                likelihoods = self.feature_likelihoods[label]
                unseen = self.unseen_likelihoods[label]
                score = log_priors[label]

                for feature, value in enumerate(sample):
                    # A value this class never observed must count against
                    # the class; skipping it would act as probability 1.
                    probability = likelihoods.get(
                        (feature, value), unseen[feature]
                    )
                    if probability > 0:
                        score += np.log(probability)
                    else:
                        score = -np.inf

                scores[label] = score

            predictions.append(max(scores, key=scores.get))

        return predictions

# Load the dataset
df = pd.read_csv('Insurance_claims.csv')
# `d` is an alias of the same frame (not a copy); kept in case other cells
# reference it.
d = df

# Preprocess the dataset: replace each categorical column with integer codes.
# 'incident_date' is deliberately NOT label-encoded: it is parsed as a date
# below, and pd.to_datetime on integer codes would read them as nanosecond
# offsets, making every day difference 0.
categorical_columns = [
    'fraud_reported',
    'policy_state',
    'policy_csl',
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'insured_hobbies',
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'incident_city',
    'incident_location',
    'property_damage',
    'police_report_available',
    'auto_make',
    'auto_model',
]

# One fitted encoder per column so codes can be mapped back to labels later.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Select the relevant features for classification
# NOTE(review): 'insured_hobbies' is encoded above but not selected. The
# original list named 'insured_occupation' twice, presumably a typo for
# 'insured_hobbies'; the duplicate is removed here. Confirm before adding it.
selected_features = ['months_as_customer', 'age', 'policy_number', 'policy_bind_date', 'policy_state', 'policy_csl',
                     'policy_deductable', 'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
                     'insured_education_level', 'insured_occupation', 'insured_relationship',
                     'capital-gains', 'capital-loss', 'incident_date', 'incident_type', 'collision_type',
                     'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 'incident_location',
                     'incident_hour_of_the_day', 'number_of_vehicles_involved', 'property_damage',
                     'bodily_injuries', 'witnesses', 'police_report_available', 'total_claim_amount',
                     'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make', 'auto_model', 'auto_year',
                     'fraud_reported']

# Copy so the column assignment below writes to an independent frame rather
# than a slice of the original (avoids SettingWithCopyWarning).
df = df[selected_features].copy()

# Convert incident_date to a numerical feature: days since the earliest
# incident. Unparseable dates become NaT/NaN instead of raising.
incident_dates = pd.to_datetime(df['incident_date'], errors='coerce')
df['incident_date'] = (incident_dates - incident_dates.min()).dt.days

# Drop the features you want to remove
# ('incident_date' is dropped here too, so the conversion above only matters
# if it is taken off this list.)
features_to_drop = ['insured_zip', 'incident_state', 'incident_city', 'incident_location', 'policy_bind_date',
                    'incident_date', 'insured_occupation', 'policy_number', 'insured_education_level']
df = df.drop(features_to_drop, axis=1)



# Split the dataset into features and labels
# (the label is the last column of df)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

sc = StandardScaler()

# Fit the scaler on the training data only and reuse those statistics for the
# test data. HiddenNaiveBayes looks up exact (feature, value) pairs, so
# refitting on the test set would yield scaled values that never match the
# training tables, and the test set would also leak into preprocessing.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train the Hidden Naive Bayes classifier
hnb = HiddenNaiveBayes()
hnb.train(X_train, y_train)

# Make predictions on the test set
y_pred = hnb.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)






Accuracy: 0.81
Confusion Matrix:
[[162   0]
 [ 38   0]]
