In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import MinMaxScaler
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# one per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv from the health-insurance-cross-sell-prediction-data input
# dataset and preview its first rows.
old_train_csv = '/kaggle/input/health-insurance-cross-sell-prediction-data/train.csv'
old_df = pd.read_csv(old_train_csv)
old_df.head()

In [None]:
# Read train.csv from the playground-series-s4e7 input dataset and preview
# its first rows.
new_train_csv = '/kaggle/input/playground-series-s4e7/train.csv'
new_df = pd.read_csv(new_train_csv)
new_df.head()

In [None]:
# Stack both training frames row-wise (new_df rows first, then old_df) and
# renumber the rows 0..n-1 so the combined index has no duplicates.
frames_to_stack = [new_df, old_df]
df = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# Remove the 'id' column from the combined frame.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time is a no-op instead
# of raising KeyError because 'id' has already been removed.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# One-hot encode the columns pandas selects by default for get_dummies;
# drop_first=True omits the first level of each encoded column.
df_encoded = df.pipe(pd.get_dummies, drop_first=True)
df_encoded.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Columns rescaled with min-max scaling; all other columns are left untouched.
numerical_features = ['Age', 'Annual_Premium', 'Vintage', 'Region_Code', 'Policy_Sales_Channel']

# Fit and apply in two explicit steps (equivalent to fit_transform).
# NOTE(review): the scaler is fit on every row of df_encoded, i.e. before the
# train/test split performed later inside train_and_evaluate_model.
scaler = MinMaxScaler()
scaler.fit(df_encoded[numerical_features])
df_encoded[numerical_features] = scaler.transform(df_encoded[numerical_features])
df_encoded.head()


In [None]:
# Absolute pairwise correlations between every column of the encoded frame
# (sign is discarded, so values lie in [0, 1]).
corr_matrix = df_encoded.corr().abs()

# Annotated heatmap of the full matrix; each cell shows the value to 2 decimals.
plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Boolean mask that is True strictly above the main diagonal (k=1), so each
# column pair is kept once and self-correlations are excluded.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
# Entries outside the mask become NaN.
upper = corr_matrix.where(upper_mask)

In [None]:
# Absolute-correlation cutoff above which a feature is treated as redundant.
threshold = 0.9

# The correlation matrix was computed over all columns, including the label.
# Exclude 'Response' both as a row and as a column so that (a) the label itself
# can never land in to_drop and (b) no feature is discarded merely because it
# correlates strongly with the label. errors='ignore' keeps this safe if the
# label is absent from the matrix.
target_column = 'Response'
feature_corr = upper.drop(index=target_column, columns=target_column, errors='ignore')

# A column is dropped when it exceeds the cutoff against any earlier column
# (only the upper triangle is populated; NaN comparisons evaluate to False).
to_drop = [column for column in feature_corr.columns if (feature_corr[column] > threshold).any()]
print(to_drop)

In [None]:
# Model-ready frame: the encoded frame without the columns listed in to_drop.
df_final = df_encoded.drop(to_drop, axis='columns')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
def train_and_evaluate_model(data, target, model, test_size=0.2, random_state=42):
    """Fit ``model`` on a random train split and plot its ROC curve on the held-out split.

    Parameters
    ----------
    data
        Feature table, passed to ``train_test_split`` as X.
    target
        Labels aligned row-for-row with ``data``, passed as y.
    model
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place, so the caller's object is trained after this call.
    test_size
        Forwarded to ``train_test_split``; defaults to 0.2.
    random_state
        Seed forwarded to ``train_test_split``; defaults to 42.

    Returns
    -------
    float
        ROC AUC computed on the held-out split, so callers can compare models
        numerically instead of reading the value off the plot legend.
    """
    # Hold out part of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the second entry of the
    # model's class list.
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # ROC points for the plot and the scalar AUC summarising them.
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    auc_score = roc_auc_score(y_test, y_pred_prob)

    plt.figure()
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {auc_score:.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    return auc_score

In [None]:
# Separate the label column from the remaining feature columns.
label_column = 'Response'
data = df_final.drop(columns=[label_column])
target = df_final[label_column]

In [None]:
from catboost import CatBoostClassifier
# CatBoost settings: 100 boosting iterations, learning rate 0.1, tree depth 6,
# with training log output silenced (verbose=0).
catboost_params = dict(iterations=100, learning_rate=0.1, depth=6, verbose=0)
catboost_model = CatBoostClassifier(**catboost_params)
train_and_evaluate_model(data, target, catboost_model)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression using the liblinear solver with a fixed seed.
logistic_regression_model = LogisticRegression(
    random_state=42,
    solver='liblinear',
)
train_and_evaluate_model(data=data, target=target, model=logistic_regression_model)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with library-default settings and a fixed seed.
tree_params = {'random_state': 42}
decision_tree_model = DecisionTreeClassifier(**tree_params)
train_and_evaluate_model(data, target, model=decision_tree_model)

In [None]:
from sklearn.svm import SVC
# Kernel SVC fit time grows at least quadratically with the number of rows
# (see the scikit-learn SVC documentation), and probability=True adds an
# internal cross-validation on top. Fitting it on the full concatenated frame
# can stall "Run All", so cap the rows handed to this model. When the frame is
# already at or below the cap, the inputs are passed through unchanged.
SVM_MAX_ROWS = 20_000

svm_model = SVC(probability=True, random_state=42)

if len(data) > SVM_MAX_ROWS:
    # Seeded random subsample; labels are realigned through the shared index.
    svm_data = data.sample(n=SVM_MAX_ROWS, random_state=42)
    svm_target = target.loc[svm_data.index]
else:
    svm_data, svm_target = data, target

train_and_evaluate_model(svm_data, svm_target, svm_model)

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees evaluated with log-loss during training, fixed seed.
# `use_label_encoder` is intentionally not passed: the flag is deprecated and
# unused in current XGBoost releases, where supplying it only emits warnings.
xgboost_model = XGBClassifier(eval_metric='logloss', random_state=42)
train_and_evaluate_model(data, target, xgboost_model)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees with a fixed seed.
forest_params = {'n_estimators': 100, 'random_state': 42}
random_forest_model = RandomForestClassifier(**forest_params)
train_and_evaluate_model(data, target, model=random_forest_model)