# Car classification



In [80]:
import pandas as pd

# Load the cleaned brand/model CSV and print a quick preview plus the
# column/dtype summary.
rdw_cars = pd.read_csv('/brand_model_peryear_cleaned_final.csv')
print(rdw_cars.head())
rdw_cars.info()

# Normalize the body-type column to lowercase so it can be compared against
# the lowercase keyword lists below.
rdw_cars['inrichting_std'] = rdw_cars['inrichting_std'].str.lower()

# Body types treated as non-passenger cars; rows whose 'inrichting_std'
# equals one of these values exactly are dropped.
exclude_keywords = [
    'open laadvloer', 'speciale groep', 'neerklapbare zijschotten', 'veewagen',
    'voor vervoer voertuigen', 'lijkwagen', 'huifopbouw', 'opleggertrekker',
    'detailhandel/expositiedoel.', 'kipper', 'gecond. met temperatuurreg.',
    'kraanwagen', 'hoogwerker',
]

# Remove excluded vehicle types. The explicit .copy() makes the result an
# independent DataFrame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
rdw_cars = rdw_cars[~rdw_cars['inrichting_std'].isin(exclude_keywords)].copy()

print(f"Remaining cars after exclusion: {len(rdw_cars)}")
print("Remaining unique types:", rdw_cars['inrichting_std'].unique())

# Body-type keywords per class; they are compared against the lowercased
# 'inrichting_std' value of each row.
camper_keywords = [
    'kampeerwagen',
]
suv_keywords = [
    'pick-up truck',
]
mpv_keywords = [
    'mpv',
    'personenbus',
]
sports_keywords = [
    'cabriolet',
    'coupe',
    'sportwagen',
]

# 33rd and 66th percentiles of empty mass over all remaining rows; these are
# the boundaries used for the size-based groups.
empty_mass = rdw_cars['mass_empty_median']
p33_mass = empty_mass.quantile(0.33)
p66_mass = empty_mass.quantile(0.66)

print(f"Global 33rd percentile mass: {p33_mass:.1f}")
print(f"Global 66th percentile mass: {p66_mass:.1f}")

# Car classification function and its thresholds.
#
# Length thresholds are expressed in centimetres because that is the unit of
# the 'length_median' column (the data preview prints 463.0 for the first
# row). The previous millimetre-scale values (4500 / 4400) could never be
# reached, which silently disabled the mass+length SUV rule and the 'Large'
# class.
_SUV_MIN_MASS = 1800          # same unit as 'mass_empty_median' (presumably kg)
_SUV_MIN_LENGTH_CM = 450
_MPV_MIN_SEATS = 6
_SPORTS_MIN_PW_RATIO = 0.12
_LIGHT_MEDIUM_MIN_LENGTH_CM = 450   # light but long cars count as 'Medium'
_LARGE_MIN_LENGTH_CM = 440          # heavy but short cars count as 'Medium'


def classify_car(row):
    """Return the body class for one row of the cars DataFrame.

    Rules are evaluated in order and the first match wins:

    1. 'Camper' - body type is in ``camper_keywords``.
    2. 'SUV'    - body type is in ``suv_keywords``, or the car is both heavy
       and long.
    3. 'MPV'    - body type is in ``mpv_keywords``, or it has 6+ seats.
    4. 'Sports' - body type is in ``sports_keywords``, or the power-to-weight
       ratio is at least 0.12.
    5. Otherwise a size class ('Compact', 'Medium' or 'Large') derived from
       the global mass percentiles ``p33_mass`` / ``p66_mass`` and the length.

    Args:
        row: A mapping-like row (e.g. a pandas Series from
            ``DataFrame.apply(..., axis=1)``) providing 'inrichting_std',
            'mass_empty_median' and 'length_median', and optionally
            'seats_median' and 'pw_ratio_median'.

    Returns:
        One of 'Camper', 'SUV', 'MPV', 'Sports', 'Compact', 'Medium', 'Large'.

    Note:
        Comparisons against NaN are always False, so a row with a missing
        mass skips the mass-based rules and ends up in the final (heaviest)
        branch, where only its length decides between 'Medium' and 'Large'.
    """
    inrichting = str(row['inrichting_std']).lower()
    mass = row['mass_empty_median']
    length = row['length_median']

    # --- Keyword / attribute based classes, in order of precedence ---
    if inrichting in camper_keywords:
        return 'Camper'

    if inrichting in suv_keywords or (
        mass >= _SUV_MIN_MASS and length >= _SUV_MIN_LENGTH_CM
    ):
        return 'SUV'

    if inrichting in mpv_keywords or row.get('seats_median', 0) >= _MPV_MIN_SEATS:
        return 'MPV'

    if (
        inrichting in sports_keywords
        or row.get('pw_ratio_median', 0) >= _SPORTS_MIN_PW_RATIO
    ):
        return 'Sports'

    # --- Size-based classification (Compact / Medium / Large) ---
    if mass < p33_mass:
        # Lightest group: only unusually long cars are promoted to 'Medium'.
        return 'Medium' if length >= _LIGHT_MEDIUM_MIN_LENGTH_CM else 'Compact'

    if mass < p66_mass:
        return 'Medium'

    # Heaviest group: short cars are demoted to 'Medium'.
    return 'Medium' if length < _LARGE_MIN_LENGTH_CM else 'Large'

# Applying the classification (one call of classify_car per row)
rdw_cars['body_class'] = rdw_cars.apply(classify_car, axis=1)

# Review the outputs: number of cars per body class, then the first rows.
# The counts are computed once and kept in class_counts.
class_counts = rdw_cars['body_class'].value_counts()
print(class_counts)
print(rdw_cars.head(10))

# Save the updated DataFrame to a CSV file
rdw_cars.to_csv('rdw_cars_classified.csv', index=False)

# Trigger a browser download when running in Google Colab. Outside Colab the
# google.colab package is not installed; the CSV has already been written to
# disk at that point, so the download step is skipped instead of crashing.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of rdw_cars_classified.csv")
else:
    files.download('rdw_cars_classified.csv')

  brand           model                       fuel_types_primary economy_rate  \
0  FORD            KUGA  Alcohol, Benzine, Diesel, Elektriciteit            A   
1  FORD            PUMA  Alcohol, Benzine, Diesel, Elektriciteit            B   
2  FORD  TRANSIT CUSTOM           Benzine, Diesel, Elektriciteit            A   
3  FORD           FOCUS  Alcohol, Benzine, Diesel, Elektriciteit            A   
4  FORD        EXPLORER                   Benzine, Elektriciteit            A   

   resold_flag   inrichting_std  seats_median  mass_empty_median  \
0            1              MPV           5.0             1744.0   
1            1              MPV           5.0             1225.0   
2            1  gesloten opbouw           3.0             1984.0   
3            1     stationwagen           5.0             1340.0   
4            1              MPV           5.0             1990.0   

   length_median  width_median  ...  count_2024  count_2025  avg_2023  \
0          463.0         188.0 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>