In [1]:
# Assignment 2: Insurance Data Preprocessing and Classification
# Note: No external libraries like pandas or numpy are used.

from datetime import datetime

# Load CSV manually
def load_csv(filepath):
    """Read a comma-separated file into a header row and a list of data rows.

    Parsing is delegated to the standard-library ``csv`` module so that a
    quoted field containing commas (for example a free-text remark) stays in
    a single column instead of being split apart.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` is the list of column names
        taken from the first non-blank line; ``rows`` holds one list of
        strings per remaining non-blank line. Both are empty lists when the
        file contains no non-blank lines.
    """
    import csv  # function-local so the block stays self-contained (stdlib only)

    with open(filepath, "r", newline="") as file:
        # Strip surrounding whitespace from every line and drop blank lines
        # before parsing, so trailing newlines/spaces never reach the fields.
        stripped_lines = (line.strip() for line in file)
        records = list(csv.reader(line for line in stripped_lines if line))

    if not records:
        # Empty (or whitespace-only) file: nothing to use as a header.
        return [], []
    return records[0], records[1:]

# Clean data using pure Python
def clean_data(rows):
    """Convert raw CSV rows into a list of typed claim dictionaries.

    Columns are read by position: 0 claim id, 1 claim date, 2 customer id,
    3 claim amount, 4 premium collected, 5 paid amount, 6 city and
    (optionally) 7 rejection remarks.

    Args:
        rows: Iterable of rows, each an indexable sequence of cell values.

    Returns:
        A list with one dict per successfully converted row. Amounts are
        floats (empty or whitespace-only cells become 0.0), the city is
        stripped and upper-cased (empty or whitespace-only becomes
        ``"UNKNOWN"``), and remarks are stripped (``""`` when the column is
        absent). Rows that are too short or hold a non-numeric amount are
        skipped after printing a message.
    """

    def _to_amount(value):
        # Treat whitespace-only cells like empty ones instead of letting
        # float() reject them and drop the whole row.
        text = value.strip() if isinstance(value, str) else value
        return float(text) if text else 0.0

    cleaned = []
    for row in rows:
        try:
            cleaned.append({
                "CLAIM_ID": row[0],
                "CLAIM_DATE": row[1],
                "CUSTOMER_ID": row[2],
                "CLAIM_AMOUNT": _to_amount(row[3]),
                "PREMIUM_COLLECTED": _to_amount(row[4]),
                "PAID_AMOUNT": _to_amount(row[5]),
                # `or "UNKNOWN"` also covers a city that is only whitespace,
                # which would otherwise be stored as an empty string.
                "CITY": (row[6].strip().upper() if row[6] else "") or "UNKNOWN",
                "REJECTION_REMARKS": row[7].strip() if len(row) > 7 else "",
            })
        except (IndexError, ValueError, TypeError, AttributeError) as e:
            # Malformed row (missing columns / bad cell value): report and skip.
            print("Skipping row due to error:", e)
    return cleaned

# Load and clean data
headers, rows = load_csv("Insurance_auto_data.csv")
data = clean_data(rows)


def _loss_ratio(stats):
    """Return paid / premium for one city's totals, or 0 if no premium was collected."""
    return stats["paid"] / stats["premium"] if stats["premium"] > 0 else 0


# --- City Analysis ---
# Accumulate, per city: number of claims, total paid, total premium and the
# number of claims that carry a non-empty rejection remark.
city_stats = {}
for row in data:
    city = row["CITY"]
    stats = city_stats.setdefault(
        city, {"claims": 0, "paid": 0.0, "premium": 0.0, "rejections": 0}
    )
    stats["claims"] += 1
    stats["paid"] += row["PAID_AMOUNT"]
    stats["premium"] += row["PREMIUM_COLLECTED"]
    if row["REJECTION_REMARKS"]:
        stats["rejections"] += 1

# Display city analysis
print("City-wise Analysis:")
for city, stats in city_stats.items():
    loss_ratio = _loss_ratio(stats)
    print(f"{city}: Claims={stats['claims']}, Rejections={stats['rejections']}, "
          f"Premium={stats['premium']:.2f}, Paid={stats['paid']:.2f}, Loss Ratio={loss_ratio:.2f}")

# Rank cities by rejection count first, using loss ratio only to break ties;
# the top-ranked city is the one recommended for shutdown.
city_ranking = sorted(
    city_stats.items(),
    key=lambda item: (item[1]["rejections"], _loss_ratio(item[1])),
    reverse=True,
)
if city_ranking:
    recommended_city = city_ranking[0][0]
    print(f"\nRecommended city for shutdown: {recommended_city}")
else:
    # No cleaned rows at all: indexing the ranking would raise IndexError.
    recommended_city = None
    print("\nNo city data available; no shutdown recommendation.")

# --- Fix the Rejection Classifier ---
# Keyword searched for in a remark (case-insensitively) -> class label reported.
# Insertion order is the order in which keywords are tried.
REJECTION_REASONS_MAP = dict(
    fake_document="Fake_document",
    not_covered="Not_Covered",
    policy_expired="Policy_expired",
)

def contains_rejection_reason(rejection_text, reason):
    """Report whether *reason* occurs in *rejection_text*, ignoring case.

    Args:
        rejection_text: Remark to search; anything that is not a non-empty
            string yields ``False``.
        reason: Keyword to look for; a non-string keyword yields ``False``.

    Returns:
        ``True`` when the lower-cased keyword is a substring of the
        lower-cased remark, otherwise ``False``. Never raises.
    """
    # Explicit type guards replace a blanket `except Exception`, so genuine
    # programming errors are no longer silently swallowed.
    if not isinstance(rejection_text, str) or not isinstance(reason, str):
        return False
    if not rejection_text:
        return False
    return reason.lower() in rejection_text.lower()

def map_rejection_reason(rejection_text):
    """Return the label of the first known keyword found in *rejection_text*.

    Keywords are tried in the iteration order of ``REJECTION_REASONS_MAP``;
    ``"Unknown"`` is returned when none of them matches.
    """
    matching_labels = (
        label
        for keyword, label in REJECTION_REASONS_MAP.items()
        if contains_rejection_reason(rejection_text, keyword)
    )
    # The generator is lazy, so the search stops at the first hit.
    return next(matching_labels, "Unknown")

def complex_rejection_classifier(remark_text):
    """Classify a rejection remark into one of the known rejection classes.

    Args:
        remark_text: The free-text remark attached to a claim.

    Returns:
        ``"No Remark"`` when *remark_text* is not a string or is empty or
        whitespace-only; otherwise the result of ``map_rejection_reason``,
        i.e. the label of the first matching keyword or ``"Unknown"``.
    """
    if not isinstance(remark_text, str) or not remark_text.strip():
        return "No Remark"
    # The keyword table is the single source of truth: delegating to the
    # table-driven lookup avoids repeating each keyword here and keeps this
    # function in step with any keyword added to the table.
    return map_rejection_reason(remark_text)

# Tag every cleaned record with the class derived from its remark text.
for row in data:
    remark = row.get("REJECTION_REMARKS", "")
    row.update(REJECTION_CLASS=complex_rejection_classifier(remark))

# Print the first ten records as "<claim id> | <remark> => <class>".
print("\nSample with REJECTION_CLASS:")
for d in data[:10]:
    print(f"{d['CLAIM_ID']} | {d['REJECTION_REMARKS']} => {d['REJECTION_CLASS']}")


City-wise Analysis:
PUNE: Claims=37, Rejections=3, Premium=369254.79, Paid=1093273.28, Loss Ratio=2.96
GUWAHATI: Claims=24, Rejections=3, Premium=261314.84, Paid=753189.43, Loss Ratio=2.88
RANCHI: Claims=17, Rejections=2, Premium=148858.60, Paid=401142.67, Loss Ratio=2.69
KOLKATA: Claims=16, Rejections=0, Premium=140279.78, Paid=636392.57, Loss Ratio=4.54
UNKNOWN: Claims=6, Rejections=2, Premium=73993.21, Paid=56295.74, Loss Ratio=0.76

Recommended city for shutdown: PUNE

Sample with REJECTION_CLASS:
CLM100021 |  => No Remark
CLM100013 |  => No Remark
CLM100099 |  => No Remark
CLM100044 |  => No Remark
CLM100014 |  => No Remark
CLM100062 |  => No Remark
CLM100010 |  => No Remark
CLM100012 |  => No Remark
CLM100029 |  => No Remark
CLM100053 |  => No Remark
