# Assignment 2: Insurance Data Preprocessing and Classification

This notebook performs the following tasks as part of the assignment:

1. Load and preprocess insurance claim data using **pure Python** (no external libraries).
2. Analyze city-wise statistics to recommend a city for shutdown.
3. Fix and apply a rejection classifier to the `REJECTION_REMARKS` field.

In [2]:
# Step 1: Load CSV manually using basic Python
from datetime import datetime

def load_csv(filepath):
    """Read a CSV file into a header list and a list of data rows.

    Parsing is delegated to the standard-library ``csv`` module so that
    quoted fields containing commas (for example free-text remarks) stay
    in a single column instead of being split apart.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` is the list of cells from
        the first non-blank line; ``rows`` holds one list of cells for each
        remaining non-blank line. Both are empty lists when the file has no
        content.
    """
    import csv  # standard library; local import keeps this cell self-contained

    with open(filepath, "r", newline="") as file:
        # Surrounding whitespace is stripped from each line and blank lines
        # are skipped before the text reaches the CSV parser.
        non_blank_lines = (line.strip() for line in file if line.strip())
        reader = csv.reader(non_blank_lines)
        # next() with a default avoids an exception on an empty file.
        headers = next(reader, [])
        rows = list(reader)
    return headers, rows

SyntaxError: invalid syntax (3348172833.py, line 4)

In [None]:
# Step 2: Clean the data
def _parse_amount(value):
    """Convert a raw CSV cell to float; blank or missing cells count as 0.0."""
    text = value.strip() if isinstance(value, str) else value
    return float(text) if text else 0.0


def clean_data(rows):
    """Convert raw CSV rows into dictionaries with typed, normalised fields.

    Args:
        rows: Iterable of rows, each a sequence of cells ordered as
            claim id, claim date, customer id, claim amount, premium
            collected, paid amount, city and (optionally) rejection remarks.

    Returns:
        A list of dicts with the keys CLAIM_ID, CLAIM_DATE, CUSTOMER_ID,
        CLAIM_AMOUNT, PREMIUM_COLLECTED, PAID_AMOUNT, CITY and
        REJECTION_REMARKS. Amounts are floats (0.0 when blank), CITY is
        upper-cased ("UNKNOWN" when blank) and the remarks are stripped
        ("" when the column is absent). Rows that are too short or hold a
        non-numeric amount are reported with ``print`` and skipped.
    """
    cleaned = []
    for row in rows:
        try:
            # Strip before testing for emptiness so a whitespace-only city
            # falls back to "UNKNOWN" instead of becoming an empty string.
            city = (row[6] or "").strip().upper() or "UNKNOWN"
            cleaned.append({
                "CLAIM_ID": row[0],
                "CLAIM_DATE": row[1],
                "CUSTOMER_ID": row[2],
                "CLAIM_AMOUNT": _parse_amount(row[3]),
                "PREMIUM_COLLECTED": _parse_amount(row[4]),
                "PAID_AMOUNT": _parse_amount(row[5]),
                "CITY": city,
                "REJECTION_REMARKS": row[7].strip() if len(row) > 7 else ""
            })
        except (LookupError, ValueError, TypeError, AttributeError) as e:
            # Malformed row (missing columns, non-numeric amount, wrong cell type).
            print("Skipping row due to error:", e)
    return cleaned

In [None]:
# Step 3: Load and clean data
# Read the raw claims file, then turn every raw row into a cleaned record.
# `data` is the list of cleaned records that the later cells work on.
headers, rows = load_csv(filepath="Insurance_auto_data.csv")
data = clean_data(rows=rows)

## Step 4: City-wise Analysis
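We aggregate the cleaned claims per city to compute the number of claims, the number of rejections (claims with a non-empty `REJECTION_REMARKS`), the total premium collected, the total paid amount, and the loss ratio (paid amount ÷ premium collected). The shutdown recommendation is restricted to Pune, Kolkata, Ranchi and Guwahati, and picks the city with the most rejections, using the loss ratio as a tie-breaker.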

In [None]:
def _loss_ratio(stats):
    """Return paid / premium for one city's totals, or 0 when no premium was collected."""
    return stats["paid"] / stats["premium"] if stats["premium"] > 0 else 0


# Aggregate claim count, paid amount, premium and rejection count per city.
city_stats = {}
for row in data:
    city = row["CITY"]
    if city not in city_stats:
        city_stats[city] = {"claims": 0, "paid": 0.0, "premium": 0.0, "rejections": 0}
    city_stats[city]["claims"] += 1
    city_stats[city]["paid"] += row["PAID_AMOUNT"]
    city_stats[city]["premium"] += row["PREMIUM_COLLECTED"]
    # Any non-empty remark counts as a rejected claim.
    if row["REJECTION_REMARKS"]:
        city_stats[city]["rejections"] += 1

print("City-wise Analysis:")
for city, stats in city_stats.items():
    loss_ratio = _loss_ratio(stats)
    print(f"{city}: Claims={stats['claims']}, Rejections={stats['rejections']}, Premium={stats['premium']:.2f}, Paid={stats['paid']:.2f}, Loss Ratio={loss_ratio:.2f}")

# Recommend city for shutdown (among specific 4)
shutdown_candidates = ['PUNE', 'KOLKATA', 'RANCHI', 'GUWAHATI']
filtered_city_stats = {k: v for k, v in city_stats.items() if k in shutdown_candidates}
# Rank by rejection count first; the loss ratio breaks ties. Highest comes first.
city_ranking = sorted(
    filtered_city_stats.items(),
    key=lambda item: (item[1]["rejections"], _loss_ratio(item[1])),
    reverse=True,
)
if city_ranking:
    recommended_city = city_ranking[0][0]
    print(f"\nRecommended city for shutdown: {recommended_city}")
else:
    # None of the candidate cities occurs in the data, so there is nothing to rank.
    recommended_city = None
    print("\nNo shutdown candidate city found in the data; no recommendation made.")

## Step 5: Rejection Classifier Fix
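We classify each rejection remark with simple rule-based logic: a case-insensitive substring match against the known reasons (`Fake_document`, `Not_Covered`, `Policy_expired`). Blank remarks are labelled `No Remark`, and remarks that match no known reason are labelled `Unknown`.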

In [None]:
# Lookup table for the rule-based classifier: each key is a keyword that
# map_rejection_reason searches for inside a remark (via
# contains_rejection_reason, i.e. case-insensitive substring match) and each
# value is the label returned for it. Entries are tried in insertion order,
# so the first matching keyword wins.
REJECTION_REASONS_MAP = {
    "fake_document": "Fake_document",
    "not_covered": "Not_Covered",
    "policy_expired": "Policy_expired"
}

def contains_rejection_reason(rejection_text, reason):
    """Tell whether ``reason`` occurs in ``rejection_text``, ignoring case.

    Args:
        rejection_text: Remark text to search in.
        reason: Keyword to look for.

    Returns:
        True when ``rejection_text`` is a non-empty string that contains
        ``reason`` case-insensitively; False otherwise, including when the
        comparison itself fails (for example ``reason`` is not a string).
    """
    # Only non-empty strings can contain a reason.
    if not isinstance(rejection_text, str) or not rejection_text:
        return False
    try:
        needle = reason.lower()
        haystack = rejection_text.lower()
        return needle in haystack
    except Exception:
        return False

def map_rejection_reason(rejection_text):
    """Return the label of the first known reason found in the text.

    Keywords from ``REJECTION_REASONS_MAP`` are tried in the table's order;
    "Unknown" is returned when none of them matches.
    """
    # Lazy generator: evaluation stops at the first keyword that matches.
    matching_labels = (
        label
        for keyword, label in REJECTION_REASONS_MAP.items()
        if contains_rejection_reason(rejection_text, keyword)
    )
    return next(matching_labels, "Unknown")

def complex_rejection_classifier(remark_text):
    """Classify one rejection remark into a reason label.

    Args:
        remark_text: Raw REJECTION_REMARKS value of a claim.

    Returns:
        "No Remark" when the value is not a string or is blank; otherwise
        the result of ``map_rejection_reason`` (the label of a known reason,
        or "Unknown"). "Error" is returned if classification raises.
    """
    try:
        if not isinstance(remark_text, str) or not remark_text.strip():
            return "No Remark"
        # The keyword table used by map_rejection_reason is the single source
        # of truth for reasons and labels; no keywords are hard-coded here.
        return map_rejection_reason(remark_text)
    except Exception:
        # Best effort: one bad value must not abort classification of all rows.
        return "Error"

In [None]:
# Step 6: Apply rejection classifier to each row
# Label every cleaned record in place with its rejection class.
for row in data:
    # .get() tolerates a record that lacks the remarks key.
    remark = row.get("REJECTION_REMARKS", "")
    row.update({"REJECTION_CLASS": complex_rejection_classifier(remark)})

# Show the first ten records as a quick sanity check of the labels.
print("\nSample with REJECTION_CLASS:")
for d in data[:10]:
    print(f"{d['CLAIM_ID']} | {d['REJECTION_REMARKS']} => {d['REJECTION_CLASS']}")