In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load the data
df = pd.read_csv('claims_data.csv')

# Convert target variable to binary
df['insurance_claim'] = (df['insurance_claim'] == 'yes').astype(int)

# Split features and target
X = df.drop(['insurance_claim', 'claim_amount'], axis=1)
y = df['insurance_claim']

# Split the data with stratify to maintain class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

# Identify categorical and numerical features
categorical_features = ['sex', 'smoker', 'region']
numeric_features = ['age', 'bmi', 'steps', 'children']

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Create pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.8778


In [5]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import statsmodels.api as sm

# Fit and transform the training data
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_transformed = preprocessor.transform(X_test)

# Add a constant to the transformed data
X_train_transformed = sm.add_constant(X_train_transformed)
X_test_transformed = sm.add_constant(X_test_transformed)

# Fit the logistic regression model using statsmodels
model = sm.Logit(y_train, X_train_transformed).fit()

# Make predictions on the test set
y_pred = model.predict(X_test_transformed)

# Convert predictions to binary values (0 or 1)
y_pred_binary = (y_pred > 0.5).astype(int)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy: {accuracy:.4f}")

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:")
print(conf_matrix)

# Print classification report
class_report = classification_report(y_test, y_pred_binary)
print("Classification Report:")
print(class_report)

# Print the summary of the model
print(model.summary())


Optimization terminated successfully.
         Current function value: 0.358454
         Iterations 8
Accuracy: 0.8778
Confusion Matrix:
[[152  31]
 [ 23 236]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       183
           1       0.88      0.91      0.90       259

    accuracy                           0.88       442
   macro avg       0.88      0.87      0.87       442
weighted avg       0.88      0.88      0.88       442

                           Logit Regression Results                           
Dep. Variable:        insurance_claim   No. Observations:                  896
Model:                          Logit   Df Residuals:                      886
Method:                           MLE   Df Model:                            9
Date:                Sat, 14 Sep 2024   Pseudo R-squ.:                  0.4718
Time:                        10:21:02   Log-Likelihood:                -321.17
converged:      

In [6]:
# Perform one-hot encoding on the categorical variables
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(X)

# Split features and target
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.33, random_state=42, stratify=y
)

# Create a random forest classifier with 100 trees and a random seed of 101
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=101)

# Fit the random forest model
rf_classifier.fit(X_train_encoded, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Optional: Print feature importances
importances = rf_classifier.feature_importances_
feature_names = X.columns
print("\nFeature Importances:")
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance:.4f}")



NameError: name 'RandomForestClassifier' is not defined

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Perform one-hot encoding on the categorical variables
encoder = OneHotEncoder(sparse=False, drop='first')
X_encoded = encoder.fit_transform(X[categorical_features])

# Combine the numeric features with the encoded categorical features
X_combined = np.hstack([
    StandardScaler().fit_transform(X[numeric_features]),
    X_encoded
])

# Split the data
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.33, random_state=42, stratify=y
)

# Create a random forest classifier with 100 trees and a random seed of 101
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=101)

# Fit the random forest model
rf_classifier.fit(X_train_encoded, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

# Optional: Print feature importances
importances = rf_classifier.feature_importances_

# Get feature names for encoded variables
cat_feature_names = encoder.get_feature_names_out(categorical_features)
feature_names = numeric_features + cat_feature_names.tolist()

print("\nFeature Importances:")
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Extract false negatives and false positives
FN = cm[1, 0]  # False Negatives
FP = cm[0, 1]  # False Positives

print(f"False Negatives (FN): {FN}")
print(f"False Positives (FP): {FP}")
