In [2]:
# notebooks/01_data_preprocessing.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# can hide useful diagnostics while loading data; consider narrowing the filter
# to specific warning categories.
warnings.filterwarnings('ignore')

# Display options: show every column, cap printed rows, widen the console output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 1000)

print("=== INSURANCE DATA PREPROCESSING ===")
print("=" * 50)

# Pipe-delimited raw export, relative to the notebooks/ directory.
file_path = '../data/raw/raw_data.txt'

# Count physical lines in the file (for progress tracking).
# The count includes the header line, so the number of data records is one less.
print("Counting total rows...")
with open(file_path, 'r', encoding='utf-8') as f:
    total_rows = sum(1 for _ in f)
print(f"Total rows in file: {total_rows:,}")

# Load a small sample first to understand the structure before a full read.
print("\nLoading first 1000 rows to understand structure...")
try:
    df_sample = pd.read_csv(file_path, sep='|', nrows=1000)
    print("✅ Loaded with default engine (C)")
except pd.errors.ParserError as e:
    # Only a tokenizing failure is worth retrying with the more tolerant
    # python engine. Any other error (missing file, bad encoding, ...) would
    # fail the same way on retry, so it is allowed to propagate unchanged.
    print(f"Error with default engine: {e}")
    df_sample = pd.read_csv(file_path, sep='|', nrows=1000, engine='python')
    print("✅ Loaded with python engine")

print(f"\nSample data shape: {df_sample.shape}")
print(f"Number of columns: {len(df_sample.columns)}")
print("\nFirst 3 rows:")
print(df_sample.head(3))
print("\nLast 3 rows:")
print(df_sample.tail(3))

=== INSURANCE DATA PREPROCESSING ===
Counting total rows...
Total rows in file: 1,000,099

Loading first 1000 rows to understand structure...
✅ Loaded with default engine (C)

Sample data shape: (1000, 52)
Number of columns: 52

First 3 rows:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered Citizenship          LegalType Title Language                 Bank      AccountType  MaritalStatus         Gender       Country Province  PostalCode MainCrestaZone SubCrestaZone          ItemType    mmcode        VehicleType  RegistrationYear           make  Model  Cylinders  cubiccapacity  kilowatts bodytype  NumberOfDoors VehicleIntroDate  CustomValueEstimate AlarmImmobiliser TrackingDevice  CapitalOutstanding          NewVehicle WrittenOff Rebuilt Converted  CrossBorder  NumberOfVehiclesInFleet  SumInsured TermFrequency  CalculatedPremiumPerTerm         ExcessSelected CoverCategory   CoverType            CoverGroup              Section                          Product Statut

Analyzing Column Structure

In [3]:
print("\n=== COLUMN STRUCTURE ANALYSIS ===")

# Show all column names with indices
print("Column indices and names:")
for i, col in enumerate(df_sample.columns):
    print(f"{i:2}: {col}")

# Canonical column names from the project brief. The raw file uses the same
# names but with inconsistent capitalisation for some of them (e.g. 'mmcode',
# 'make', 'cubiccapacity'), which this cell normalises.
expected_columns = [
    'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
    'IsVATRegistered', 'Citizenship', 'LegalType', 'Title',
    'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender',
    'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone',
    'ItemType', 'Mmcode', 'VehicleType', 'RegistrationYear', 'Make', 'Model',
    'Cylinders', 'Cubiccapacity', 'Kilowatts', 'Bodytype', 'NumberOfDoors',
    'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice',
    'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted',
    'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency',
    'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType',
    'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType',
    'TotalPremium', 'TotalClaims'
]

print(f"\nExpected columns from project brief: {len(expected_columns)}")
print(f"Actual columns in sample data: {len(df_sample.columns)}")

# Match actual columns to canonical names BY NAME (ignoring case and
# surrounding whitespace) rather than by position. Positional assignment would
# silently mislabel data if the file's column order differs from the brief or
# if a column is missing or added somewhere in the middle.
canonical_by_key = {name.lower(): name for name in expected_columns}
rename_map = {}
unmatched_actual = []
for col in df_sample.columns:
    canonical = canonical_by_key.get(str(col).strip().lower())
    # A canonical name is used at most once, so two raw columns that differ
    # only by case can never collapse into duplicate labels.
    if canonical is not None and canonical not in rename_map.values():
        rename_map[col] = canonical
    else:
        unmatched_actual.append(col)

matched_names = set(rename_map.values())
missing_expected = [name for name in expected_columns if name not in matched_names]

if not missing_expected and not unmatched_actual:
    print("\n✅ Column count matches! Applying column names...")
    df_sample.columns = [rename_map[col] for col in df_sample.columns]
    print("\nUpdated column names:")
    print(df_sample.columns.tolist())
else:
    print("\n⚠️ Column mismatch!")
    print(f"Expected: {len(expected_columns)}, Got: {len(df_sample.columns)}")
    print(f"Expected but not found: {missing_expected}")
    print(f"Found but not expected: {unmatched_actual}")

    # Dump the first record so the real layout can be inspected by eye.
    print("\nFirst row with indices:")
    if df_sample.empty:
        print("(sample contains no rows)")
    else:
        for i, val in enumerate(df_sample.iloc[0]):
            print(f"{i:2}: {str(val)[:50]}...")

    # Rename only the columns that were matched by name; anything unrecognised
    # keeps its original label instead of receiving a guessed one.
    print("\nTrying to match columns...")
    df_sample.columns = [rename_map.get(col, col) for col in df_sample.columns]
    print(f"Applied {len(rename_map)} column names")


=== COLUMN STRUCTURE ANALYSIS ===
Column indices and names:
 0: UnderwrittenCoverID
 1: PolicyID
 2: TransactionMonth
 3: IsVATRegistered
 4: Citizenship
 5: LegalType
 6: Title
 7: Language
 8: Bank
 9: AccountType
10: MaritalStatus
11: Gender
12: Country
13: Province
14: PostalCode
15: MainCrestaZone
16: SubCrestaZone
17: ItemType
18: mmcode
19: VehicleType
20: RegistrationYear
21: make
22: Model
23: Cylinders
24: cubiccapacity
25: kilowatts
26: bodytype
27: NumberOfDoors
28: VehicleIntroDate
29: CustomValueEstimate
30: AlarmImmobiliser
31: TrackingDevice
32: CapitalOutstanding
33: NewVehicle
34: WrittenOff
35: Rebuilt
36: Converted
37: CrossBorder
38: NumberOfVehiclesInFleet
39: SumInsured
40: TermFrequency
41: CalculatedPremiumPerTerm
42: ExcessSelected
43: CoverCategory
44: CoverType
45: CoverGroup
46: Section
47: Product
48: StatutoryClass
49: StatutoryRiskType
50: TotalPremium
51: TotalClaims

Expected columns from project brief: 52
Actual columns in sample data: 52

✅ Column c