In [9]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Seed both the stdlib and the legacy NumPy global RNGs; the same value is also
# passed explicitly as random_state to the train/test split below.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Input dataset and the two split files this notebook writes, all relative to
# the current working directory.
CSV_PATH = "Kaggle.csv"
TRAIN_PATH = "train_insurance.csv"
TEST_PATH = "test_insurance.csv"

# --- Load dataset (semicolon delimited) ---
# The raw frame is bound to `df`, which is rebound to the preprocessed frame
# further down.
df = pd.read_csv(CSV_PATH, sep=";", encoding="utf-8")

# --- Preprocess ---
def preprocess(df):
    """Return a copy of ``df`` with normalized headers and a ``TEXT_FOR_NLP`` column.

    Column names are stripped of surrounding whitespace and uppercased. The
    known policy attribute columns that are present are then rendered as
    strings and joined, in a fixed order, into one lowercase space-separated
    string per row.

    Missing values (NaN/None) are left out of the joined text. Rendering them
    with ``astype(str)`` alone would inject a literal ``"nan"`` token into the
    text (or leave a non-string NaN that ``str.join`` cannot handle, depending
    on the pandas version).

    Args:
        df: Input DataFrame. It is not modified; a copy is returned.

    Returns:
        A new DataFrame with normalized column names and a ``TEXT_FOR_NLP``
        column. The column is the empty string for every row when none of the
        candidate columns exist.
    """
    df = df.copy()

    # Normalize column names (strip spaces, uppercase)
    df.columns = df.columns.str.strip().str.upper()

    candidate_cols = [
        "POLICY TYPE 1", "POLICY TYPE 2", "POLICY TYPE 3",
        "PAYMENT MODE", "POLICY STATUS", "BENEFIT",
        "NON LAPSE GUARANTEED", "PREMIUM", "INITIAL BENEFIT",
        "SEX", "ENTRY AGE"
    ]

    # Keep only the columns that exist; blank out missing cells so they can be
    # dropped from the joined text instead of showing up as "nan".
    text_parts = [
        df[c].astype(str).where(df[c].notna(), "")
        for c in candidate_cols
        if c in df.columns
    ]

    if text_parts:
        # Assigned positionally (plain list), so the frame's index is irrelevant.
        df["TEXT_FOR_NLP"] = [
            " ".join(part for part in parts if part).lower()
            for parts in zip(*text_parts)
        ]
    else:
        df["TEXT_FOR_NLP"] = ""

    return df

# Preprocess dataset
# Rebinds `df` from the raw frame to the preprocessed one (normalized headers
# plus the TEXT_FOR_NLP column).
df = preprocess(df)

# Train/test split
# 80/20 shuffled split, reproducible through RANDOM_SEED. Both parts are
# written without the index column, so the original row labels are not stored
# in the CSV files.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)
train_df.to_csv(TRAIN_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)

print(f"✅ Data prepared: {len(train_df)} train, {len(test_df)} test.")


✅ Data prepared: 148448 train, 37112 test.


In [11]:
# --- Train TF-IDF ---
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

MODEL_PATH = "insurance_vectorizer.pkl"
MATRIX_PATH = "insurance_tfidf_matrix.pkl"

# The training split is read back from disk; the full dataset is reloaded from
# the raw CSV and preprocessed again so this cell does not depend on `df`.
train_df = pd.read_csv(TRAIN_PATH)
full_df = pd.read_csv(CSV_PATH, sep=";", encoding="utf-8")
full_df = preprocess(full_df)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# Vocabulary and IDF weights are learned from the training split only.
# The CSV round trip can turn empty or NA-looking text cells into NaN and
# numeric-looking ones into numbers; the vectorizer only accepts strings, so
# coerce the column before fitting.
vectorizer.fit(train_df["TEXT_FOR_NLP"].fillna("").astype(str))

# Transform full data: the search index covers every row (train and test),
# projected onto the vocabulary learned above.
tfidf_matrix = vectorizer.transform(full_df["TEXT_FOR_NLP"])

# Save model + matrix. The frame is stored next to the matrix so that matrix
# row i can be mapped back to full_df.iloc[i] at query time.
joblib.dump(vectorizer, MODEL_PATH)
joblib.dump({"matrix": tfidf_matrix, "df": full_df}, MATRIX_PATH)

print(f"✅ Model trained and saved to {MODEL_PATH} and {MATRIX_PATH}")


✅ Model trained and saved to insurance_vectorizer.pkl and insurance_tfidf_matrix.pkl


In [13]:
# --- Query function ---
import textwrap
from sklearn.metrics.pairwise import cosine_similarity

# Reload the persisted artifacts so the query function works from the files on
# disk. NOTE: joblib.load unpickles its input, so only load files you trust.
vectorizer = joblib.load(MODEL_PATH)
data = joblib.load(MATRIX_PATH)
tfidf_matrix = data["matrix"]
# Rebinds `df` to the frame stored with the matrix (one row per matrix row).
df = data["df"]

# Normalize df column names to uppercase (safety)
# This mutates the loaded frame's column index in place.
df.columns = df.columns.str.strip().str.upper()

def answer_query(query, top_k=3):
    """Print the ``top_k`` indexed rows most similar to a free-text query.

    The query is lowercased, vectorized with the module-level ``vectorizer``
    and scored by cosine similarity against every row of the module-level
    ``tfidf_matrix``. Matching rows are read by position from the module-level
    ``df``.

    Args:
        query: Free-text search string.
        top_k: Number of best-scoring rows to print (default 3).

    Returns:
        None. The query and one summary per hit are written to stdout.
    """
    scores = cosine_similarity(
        vectorizer.transform([query.lower()]),
        tfidf_matrix,
    ).flatten()
    # Ascending argsort, reversed so the best score comes first, then truncated.
    best_positions = scores.argsort()[::-1][:top_k]

    print(f"\n🔎 Query: {query}")
    for pos in best_positions:
        record = df.iloc[pos]
        # shorten() collapses runs of whitespace and caps the preview length.
        snippet = textwrap.shorten(record["TEXT_FOR_NLP"], width=200, placeholder="...")
        if "PREMIUM" in df.columns:
            premium_val = record["PREMIUM"]
        else:
            premium_val = "N/A"
        age = record.get('ENTRY AGE', '?')
        sex = record.get('SEX', '?')
        status = record.get('POLICY STATUS', '?')

        print(f"\n📌 Policy Row {pos} | Score={scores[pos]:.3f}")
        print(f"Age: {age} | Sex: {sex} | Status: {status} | Premium: {premium_val}")
        print(f"Snippet: {snippet}")

# --- Example query ---
# Smoke test: prints the three best-scoring rows for a sample query to stdout.
answer_query("annual premium policy with inforce status", top_k=3)



🔎 Query: annual premium policy with inforce status

📌 Policy Row 182734 | Score=0.344
Age: 40 | Sex: M | Status: Inforce | Premium:  2 
Snippet: 13 59 a single premium inforce 5,000 no nlg 2 3533.3526 m 40

📌 Policy Row 183279 | Score=0.344
Age: 40 | Sex: M | Status: Inforce | Premium:  2 
Snippet: 13 59 a single premium inforce 6,000 no nlg 2 5252.2296 m 40

📌 Policy Row 182542 | Score=0.344
Age: 40 | Sex: M | Status: Inforce | Premium:  2 
Snippet: 13 59 a single premium inforce 6,000 no nlg 2 3250.7236 m 40
