In [68]:
import pandas as pd

In [69]:
# The path is relative, so it resolves against the notebook's working directory.
train_csv_path = '../data/train.csv'
df = pd.read_csv(train_csv_path)

In [70]:
# Preview the first five rows to eyeball column names and value formats.
df.head(n=5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


## Data Shape

In [71]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(1200000, 21)

### Data Types & Non-Null Counts

In [72]:
# Per-column dtype and non-null count; columns whose count is below the row total contain gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [73]:
# Summary statistics for the numeric columns only (object columns are left out by default).
df.describe()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
count,1200000.0,1181295.0,1155051.0,1090328.0,1125924.0,835971.0,1199994.0,1062118.0,1199999.0,1200000.0
mean,599999.5,41.14556,32745.22,2.009934,25.61391,1.002689,9.569889,592.9244,5.018219,1102.545
std,346410.3,13.53995,32179.51,1.417338,12.20346,0.98284,5.776189,149.9819,2.594331,864.9989
min,0.0,18.0,1.0,0.0,2.012237,0.0,0.0,300.0,1.0,20.0
25%,299999.8,30.0,8001.0,1.0,15.91896,0.0,5.0,468.0,3.0,514.0
50%,599999.5,41.0,23911.0,2.0,24.57865,1.0,10.0,595.0,5.0,872.0
75%,899999.2,53.0,44634.0,3.0,34.52721,2.0,15.0,721.0,7.0,1509.0
max,1199999.0,64.0,149997.0,4.0,58.97591,9.0,19.0,849.0,9.0,4999.0


### Inference

| Column                   | Observations                                     | Potential issue              |
| ------------------------ | ------------------------------------------------ | ---------------------------- |
| **Age**                  | min = 18, max = 64 ✅ realistic                   | none                         |
| **Annual Income**        | mean ≈ 33 K, max ≈ 150 K ✅ plausible             | none                         |
| **Number of Dependents** | 0–4 ✅ logical                                    | none                         |
| **Health Score**         | ≈ 2.0 – 59.0 ✅ bounded, no extreme values        | none (normalize later if the model needs it) |
| **Previous Claims**      | 0–9 ✅ realistic small integer                    | none                         |
| **Vehicle Age**          | 0–19 ✅ acceptable                                | none                         |
| **Credit Score**         | 300–849 ✅ matches real credit range              | none                         |
| **Insurance Duration**   | 1–9 ✅ maybe in years                             | none                         |
| **Premium Amount**       | 20 – 4999 ✅ plausible                            | none                         |


## Data Cleaning & Pre-processing

### Missing Values

In [74]:
# Per-column null counts, largest first, next to their share of all rows.
missing = df.isna().sum().sort_values(ascending=False)
missing_percent = missing.div(len(df)).mul(100)
missing_summary = pd.concat(
    [missing.rename('Missing Values'), missing_percent.rename('Missing %')],
    axis=1,
)
# Only columns that actually contain gaps are listed.
print(missing_summary.loc[missing_summary['Missing Values'] > 0])


                      Missing Values  Missing %
Previous Claims               364029  30.335750
Occupation                    358075  29.839583
Credit Score                  137882  11.490167
Number of Dependents          109672   9.139333
Customer Feedback              77824   6.485333
Health Score                   74076   6.173000
Annual Income                  44949   3.745750
Age                            18705   1.558750
Marital Status                 18529   1.544083
Vehicle Age                        6   0.000500
Insurance Duration                 1   0.000083


In [75]:
# NOTE(review): this repeats the missing-value summary from the cell directly above.
# Nothing modifies df in between, so the printed table is identical; the cell is a
# candidate for removal.
missing = df.isnull().sum().sort_values(ascending=False)
missing_percent = missing / len(df) * 100
missing_summary = pd.DataFrame({'Missing Values': missing, 'Missing %': missing_percent})
has_missing = missing_summary['Missing Values'] > 0
print(missing_summary[has_missing])

                      Missing Values  Missing %
Previous Claims               364029  30.335750
Occupation                    358075  29.839583
Credit Score                  137882  11.490167
Number of Dependents          109672   9.139333
Customer Feedback              77824   6.485333
Health Score                   74076   6.173000
Annual Income                  44949   3.745750
Age                            18705   1.558750
Marital Status                 18529   1.544083
Vehicle Age                        6   0.000500
Insurance Duration                 1   0.000083


### Drop rows with missing Age, Vehicle Age or Insurance Duration

In [76]:
# Rows lacking Age, Vehicle Age or Insurance Duration are dropped rather than imputed;
# together they are about 1.6 % of the data according to the missing-value summary.
required_numeric_cols = ['Age', 'Vehicle Age', 'Insurance Duration']

# read_csv parses these columns as float64 (see df.info()), so text placeholders such
# as '' or 'unknown' cannot occur in them today. to_numeric(errors='coerce') leaves
# float64 data untouched and acts as a guard: if a future export delivers a column as
# strings, any non-numeric entry becomes NaN and is removed by the dropna below.
for required_col in required_numeric_cols:
    df[required_col] = pd.to_numeric(df[required_col], errors='coerce')

# Single pass: drop a row when any required column is NaN, then renumber the index.
df = df.dropna(subset=required_numeric_cols).reset_index(drop=True)


In [77]:
# Count-like columns: a missing entry is treated as zero.
# NOTE(review): Previous Claims is ~30 % missing; confirm that "no value" really means
# "no claims" before relying on this.
zero_fill_cols = ['Number of Dependents', 'Previous Claims']
df[zero_fill_cols] = df[zero_fill_cols].fillna(0)

In [80]:
# Categorical columns: keep missingness as its own 'Unknown' level instead of guessing a label.
cat_fill_unknown = ['Occupation', 'Customer Feedback', 'Marital Status']
df[cat_fill_unknown] = df[cat_fill_unknown].fillna('Unknown')

In [82]:
# Continuous columns: fill each gap with that column's own median.
# NOTE(review): the medians come from the whole frame at this point; if a validation
# split is carved out of df later, confirm that this is acceptable.
median_fill_cols = ['Credit Score', 'Health Score', 'Annual Income']
df[median_fill_cols] = df[median_fill_cols].fillna(df[median_fill_cols].median())


In [83]:
# Post-imputation check: the filtered summary is expected to be empty.
total_rows = len(df)
missing = df.isnull().sum().sort_values(ascending=False)
missing_percent = (missing / total_rows) * 100
missing_summary = pd.DataFrame({'Missing Values': missing, 'Missing %': missing_percent})
print(missing_summary.loc[lambda summary: summary['Missing Values'] > 0])

Empty DataFrame
Columns: [Missing Values, Missing %]
Index: []


In [84]:
# Re-inspect dtypes and non-null counts after the row drops and imputation above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1181288 entries, 0 to 1181287
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1181288 non-null  int64  
 1   Age                   1181288 non-null  float64
 2   Gender                1181288 non-null  object 
 3   Annual Income         1181288 non-null  float64
 4   Marital Status        1181288 non-null  object 
 5   Number of Dependents  1181288 non-null  float64
 6   Education Level       1181288 non-null  object 
 7   Occupation            1181288 non-null  object 
 8   Health Score          1181288 non-null  float64
 9   Location              1181288 non-null  object 
 10  Policy Type           1181288 non-null  object 
 11  Previous Claims       1181288 non-null  float64
 12  Vehicle Age           1181288 non-null  float64
 13  Credit Score          1181288 non-null  float64
 14  Insurance Duration    1181288 non-

In [86]:
TARGET_COL = 'Premium Amount'
# Target series first, then every remaining column as the feature frame.
y_train = df[TARGET_COL]
x_train = df.drop(TARGET_COL, axis='columns')
# NOTE(review): x_train still contains the `id` column — confirm it is excluded
# before a model is fitted.

In [87]:
# Sanity check: the 'Unknown' placeholder should appear alongside the real categories.
pd.unique(df['Marital Status'])

array(['Married', 'Divorced', 'Single', 'Unknown'], dtype=object)