In [2]:
pip install faker

Collecting faker
  Downloading faker-37.3.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.3.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.9 MB 1.7 MB/s eta 0:00:02
   --------- ------------------------------ 0.5/1.9 MB 3.7 MB/s eta 0:00:01
   ------------------ --------------------- 0.9/1.9 MB 5.8 MB/s eta 0:00:01
   ---------------------------- ----------- 1.4/1.9 MB 6.2 MB/s eta 0:00:01
   ---------------------------------------  1.9/1.9 MB 7.2 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 6.9 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.3.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker()
np.random.seed(42)

def generate_dataset(n_rows=80000):
    data = []
    policy_types = ['Health', 'Life', 'Auto', 'Home', 'Travel']
    occupations = ['Engineer', 'Doctor', 'Teacher', 'Sales', 'Manager', 'Clerk', 'Self-Employed']
    education_levels = ['High School', 'Bachelors', 'Masters', 'PhD']
    channels = ['Online', 'Agent', 'Branch']
    locations = ['Urban', 'Suburban', 'Rural']
    genders = ['Male', 'Female', 'Other']
    
    for i in range(n_rows):
        customer_id = f"CUST{100000 + i}"
        age = np.random.randint(18, 75)
        gender = random.choice(genders)
        marital_status = random.choice(['Single', 'Married', 'Divorced'])
        income = np.random.randint(20000, 200000)
        occupation = random.choice(occupations)
        education = random.choice(education_levels)
        location = random.choice(locations)

        policy_id = f"POL{500000 + i}"
        policy_type = random.choice(policy_types)
        policy_term = random.choice([1, 5, 10, 15, 20])
        premium_amount = round(np.random.uniform(200, 3000), 2)
        coverage_amount = premium_amount * random.randint(50, 150)
        purchase_date = fake.date_between(start_date='-10y', end_date='today')
        previous_policies_count = np.random.poisson(lam=1)

        last_claim_date = fake.date_between(start_date=purchase_date, end_date='today') if random.random() < 0.4 else None
        num_claims = np.random.poisson(lam=1) if last_claim_date else 0
        customer_tenure = round(np.random.uniform(0.5, 15), 1)
        channel_of_purchase = random.choice(channels)
        satisfaction_score = np.random.randint(1, 6)
        contact_frequency = np.random.poisson(lam=2)

        # Intelligent Recommendation Engine (simplified logic)
        recommended = np.random.rand() < 0.6  # 60% of customers receive a recommendation
        recommended_product_type = random.choice(policy_types) if recommended else None
        conversion = recommended and (np.random.rand() < 0.35)  # 35% conversion rate if recommended

        data.append([
            customer_id, age, gender, marital_status, income, occupation, education, location,
            policy_id, policy_type, policy_term, premium_amount, coverage_amount,
            purchase_date, previous_policies_count,
            last_claim_date, num_claims, customer_tenure, channel_of_purchase,
            satisfaction_score, contact_frequency,
            recommended, recommended_product_type, conversion
        ])
    
    columns = [
        "customer_id", "age", "gender", "marital_status", "income", "occupation", "education", "location",
        "policy_id", "policy_type", "policy_term", "premium_amount", "coverage_amount",
        "purchase_date", "previous_policies_count",
        "last_claim_date", "num_claims", "customer_tenure", "channel_of_purchase",
        "satisfaction_score", "contact_frequency",
        "product_recommended", "recommended_product_type", "conversion"
    ]
    
    return pd.DataFrame(data, columns=columns)

# Generate dataset
df = generate_dataset()
df.to_csv("insurance_customer_data.csv", index=False)
print("Dataset created and saved as 'insurance_customer_data.csv'.")


Dataset created and saved as 'insurance_customer_data.csv'.


In [4]:
Data = pd.read_csv('insurance_customer_data.csv')
Data

Unnamed: 0,customer_id,age,gender,marital_status,income,occupation,education,location,policy_id,policy_type,...,previous_policies_count,last_claim_date,num_claims,customer_tenure,channel_of_purchase,satisfaction_score,contact_frequency,product_recommended,recommended_product_type,conversion
0,CUST100000,56,Male,Single,166867,Clerk,Masters,Rural,POL500000,Home,...,2,,0,2.8,Agent,3,2,False,,False
1,CUST100001,70,Male,Married,20769,Sales,Bachelors,Rural,POL500001,Auto,...,1,2023-08-27,3,0.8,Branch,3,1,True,Home,False
2,CUST100002,64,Female,Married,186845,Doctor,PhD,Urban,POL500002,Life,...,1,,0,1.2,Agent,3,2,False,,False
3,CUST100003,31,Female,Divorced,194073,Clerk,High School,Suburban,POL500003,Auto,...,0,,0,10.4,Branch,4,0,True,Auto,True
4,CUST100004,31,Male,Divorced,55920,Engineer,Bachelors,Suburban,POL500004,Life,...,1,,0,8.0,Online,2,1,False,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,CUST179995,45,Female,Single,47811,Teacher,PhD,Urban,POL579995,Home,...,1,,0,9.6,Agent,5,1,True,Travel,False
79996,CUST179996,39,Other,Single,61968,Manager,Masters,Rural,POL579996,Life,...,1,2018-02-03,0,10.0,Agent,3,1,False,,False
79997,CUST179997,56,Male,Married,145338,Doctor,Bachelors,Rural,POL579997,Life,...,1,2022-03-13,0,12.7,Branch,3,2,True,Health,True
79998,CUST179998,70,Male,Married,117639,Manager,High School,Suburban,POL579998,Life,...,2,,0,1.1,Agent,2,1,True,Travel,False


In [7]:
Data.to_excel('Insurance_data.xlsx')

In [8]:
Data1 = pd.read_excel('Insurance_data.xlsx')
pd.set_option('display.max_columns', 24)

In [9]:
Data1

Unnamed: 0.1,Unnamed: 0,customer_id,age,gender,marital_status,income,occupation,education,location,policy_id,policy_type,policy_term,...,coverage_amount,purchase_date,previous_policies_count,last_claim_date,num_claims,customer_tenure,channel_of_purchase,satisfaction_score,contact_frequency,product_recommended,recommended_product_type,conversion
0,0,CUST100000,56,Male,Single,166867,Clerk,Masters,Rural,POL500000,Home,1,...,320544.00,2023-11-24,2,,0,2.8,Agent,3,2,False,,False
1,1,CUST100001,70,Male,Married,20769,Sales,Bachelors,Rural,POL500001,Auto,10,...,122188.00,2022-12-05,1,2023-08-27,3,0.8,Branch,3,1,True,Home,False
2,2,CUST100002,64,Female,Married,186845,Doctor,PhD,Urban,POL500002,Life,20,...,83499.90,2022-11-21,1,,0,1.2,Agent,3,2,False,,False
3,3,CUST100003,31,Female,Divorced,194073,Clerk,High School,Suburban,POL500003,Auto,1,...,70545.64,2024-02-22,0,,0,10.4,Branch,4,0,True,Auto,True
4,4,CUST100004,31,Male,Divorced,55920,Engineer,Bachelors,Suburban,POL500004,Life,10,...,102628.38,2017-10-23,1,,0,8.0,Online,2,1,False,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,79995,CUST179995,45,Female,Single,47811,Teacher,PhD,Urban,POL579995,Home,20,...,237802.32,2023-10-02,1,,0,9.6,Agent,5,1,True,Travel,False
79996,79996,CUST179996,39,Other,Single,61968,Manager,Masters,Rural,POL579996,Life,10,...,174901.68,2016-03-04,1,2018-02-03,0,10.0,Agent,3,1,False,,False
79997,79997,CUST179997,56,Male,Married,145338,Doctor,Bachelors,Rural,POL579997,Life,15,...,311731.20,2018-06-03,1,2022-03-13,0,12.7,Branch,3,2,True,Health,True
79998,79998,CUST179998,70,Male,Married,117639,Manager,High School,Suburban,POL579998,Life,20,...,152075.00,2018-07-17,2,,0,1.1,Agent,2,1,True,Travel,False


In [10]:
Data1 = Data1.drop(columns=['Unnamed: 0'], errors='ignore')

In [11]:
Data1.reset_index(drop=True, inplace=True)

In [12]:
Data1.isnull().sum()

customer_id                     0
age                             0
gender                          0
marital_status                  0
income                          0
occupation                      0
education                       0
location                        0
policy_id                       0
policy_type                     0
policy_term                     0
premium_amount                  0
coverage_amount                 0
purchase_date                   0
previous_policies_count         0
last_claim_date             48049
num_claims                      0
customer_tenure                 0
channel_of_purchase             0
satisfaction_score              0
contact_frequency               0
product_recommended             0
recommended_product_type    31917
conversion                      0
dtype: int64

In [13]:
Data1.shape

(80000, 24)

In [17]:
Data1 = pd.read_excel('C:/Users/dis895\Downloads/Insurance_data.xlsx',index_col=False )

In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_excel("insurance_data.xlsx")

# Drop ID columns and text fields that are not predictive
df.drop(columns=['customer_id', 'policy_id', 'purchase_date', 'last_claim_date'], inplace=True)

# Target variable (optional)
target = 'conversion'

# List of categorical and numeric features
categorical_features = [
    'gender', 'marital_status', 'occupation', 'education',
    'location', 'policy_type', 'channel_of_purchase', 'recommended_product_type'
]

numeric_features = [
    'age', 'income', 'policy_term', 'premium_amount', 'coverage_amount',
    'previous_policies_count', 'num_claims', 'customer_tenure',
    'satisfaction_score', 'contact_frequency'
]

# Separate target from features
X = df.drop(columns=[target])
y = df[target]

# Preprocessing for numeric features: fill missing with median
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Preprocessing for categorical features: fill missing with 'missing', then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Combine preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Apply transformations
X_processed = preprocessor.fit_transform(X)

# Optionally convert to DataFrame (if you want column names later)
# You’ll need to retrieve one-hot encoded column names manually if needed
X_processed_df = pd.DataFrame(X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed)

# Now X_processed_df is ready for modeling
print("Processed dataset shape:", X_processed_df.shape)


Processed dataset shape: (80000, 36)


In [21]:
X_processed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,...,24,25,26,27,28,29,30,31,32,33,34,35
0,56.0,166867.0,1.0,2862.00,320544.00,2.0,0.0,2.8,3.0,2.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,70.0,20769.0,10.0,2221.60,122188.00,1.0,3.0,0.8,3.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,64.0,186845.0,20.0,759.09,83499.90,1.0,0.0,1.2,3.0,2.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,31.0,194073.0,1.0,1052.92,70545.64,0.0,0.0,10.4,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.0,55920.0,10.0,924.58,102628.38,1.0,0.0,8.0,2.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,45.0,47811.0,20.0,1887.32,237802.32,1.0,0.0,9.6,5.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79996,39.0,61968.0,10.0,2159.28,174901.68,1.0,0.0,10.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
79997,56.0,145338.0,15.0,2534.40,311731.20,1.0,0.0,12.7,3.0,2.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
79998,70.0,117639.0,20.0,1520.75,152075.00,2.0,0.0,1.1,2.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
