In [540]:
import pathlib
import pandas as pd
import numpy as np
from itertools import combinations
import os
from tqdm import tqdm
import requests

In [2]:
path = pathlib.Path().resolve()

# CT VIN by ZIP Dataset

In [3]:
ct_vin_data = pd.read_csv(path / "ignored-data" / "US_Yale_University_OP0001562727_NV_CT_VIN_Prefix_202212.txt", sep = '|')

In [50]:
ct_vin_data.head()

Unnamed: 0,everREPORT_YEAR_MONTH,MAKE,MODEL,MODEL_YEAR,TRANSACTION_PRICE,ZIP_CODE,VIN_PREFIX,COUNTY_NAME,STATE_ABBRV,VEH_COUNT,YEAR
0,201909,HONDA,CR-V,2019,25750.0,6461,2HKRW6H39K,NEW HAVEN,CT,1,2019
1,201902,HONDA,CIVIC,2019,17522.0,6512,2HGFC2F65K,NEW HAVEN,CT,1,2019
2,201909,KIA,FORTE,2019,,6078,3KPF24AD2K,HARTFORD,CT,1,2019
3,201911,LEXUS,RX,2020,55356.0,6037,2T2YGMDA2L,HARTFORD,CT,1,2019
4,201907,HONDA,CR-V,2019,25769.0,6084,2HKRW6H32K,TOLLAND,CT,1,2019


In [6]:
max_date = max(ct_vin_data["everREPORT_YEAR_MONTH"])
min_date = min(ct_vin_data["everREPORT_YEAR_MONTH"])
print(f"The data goes from {min_date} to {max_date}")

The data goes from 201801 to 202212


In [7]:
len(ct_vin_data)

924477

Count how many cars sold in 2018

In [30]:
ct_vin_data["YEAR"] = ct_vin_data["everREPORT_YEAR_MONTH"].astype(str).str[:4]

In [33]:
total_2018 = sum(ct_vin_data[ct_vin_data["YEAR"]=="2018"]["VEH_COUNT"])
print(total_2018)

265185


This seems reasonable... though perhaps high

**Attempt matching to VIN**

In [299]:
nhtsa_cleaned  = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")

  nhtsa_cleaned  = pd.read_csv(path / "ignored-data" / "NHTSA_cleaned.csv")


This is where the EV drive unit is stored

In [40]:
nhtsa_cleaned["EVDriveUnit"].unique()

array([nan, 'Single Motor', 'Dual Motor'], dtype=object)

And this is where the VINs are stored, with a check digit.

In [41]:
nhtsa_cleaned["VIN"].head()

0    19UYA416*3A
1    19UYA417*3A
2    19UYA424*3A
3    19UYA425*3A
4    19UYA426*3A
Name: VIN, dtype: object

Simplify this

In [45]:
nhtsa_cleaned_small = nhtsa_cleaned.loc[:, nhtsa_cleaned.columns.isin(['VIN', 'EVDriveUnit'])]

In [47]:
nhtsa_cleaned_small["EVDriveUnit"] = nhtsa_cleaned_small["EVDriveUnit"].fillna('non-ev')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nhtsa_cleaned_small["EVDriveUnit"] = nhtsa_cleaned_small["EVDriveUnit"].fillna('non-ev')


Make the nhtsa code ready for matching (we need to drop the final character)
**NOTE: THIS WILL LEAD TO DUPLICATES TECHNICALLY - WE NEED TO KEEP ONLY THE FIRST OCURRENCE OF THAT VIN**

In [129]:
pd.set_option('display.max_columns', None)

In [131]:
# So here the issue was that there was an error with the VIN, that was then corrected.
nhtsa_cleaned[nhtsa_cleaned["VIN"].str[0:10]=="2HKRW6H3*K"]

Unnamed: 0,vin_row_start,ABS,ActiveSafetySysNote,AdaptiveCruiseControl,AdaptiveDrivingBeam,AdaptiveHeadlights,AdditionalErrorText,AirBagLocCurtain,AirBagLocFront,AirBagLocKnee,AirBagLocSeatCushion,AirBagLocSide,AutoReverseSystem,AutomaticPedestrianAlertingSound,AxleConfiguration,Axles,BasePrice,BatteryA,BatteryA_to,BatteryCells,BatteryInfo,BatteryKWh,BatteryKWh_to,BatteryModules,BatteryPacks,BatteryType,BatteryV,BatteryV_to,BedLengthIN,BedType,BlindSpotIntervention,BlindSpotMon,BodyCabType,BodyClass,BrakeSystemDesc,BrakeSystemType,BusFloorConfigType,BusLength,BusType,CAN_AACN,CIB,CashForClunkers,ChargerLevel,ChargerPowerKW,CoolingType,CurbWeightLB,CustomMotorcycleType,DaytimeRunningLight,DestinationMarket,DisplacementCC,DisplacementCI,DisplacementL,Doors,DriveType,DriverAssist,DynamicBrakeSupport,EDR,ESC,EVDriveUnit,ElectrificationLevel,EngineConfiguration,EngineCycles,EngineCylinders,EngineHP,EngineHP_to,EngineKW,EngineManufacturer,EngineModel,EntertainmentSystem,ErrorCode,ErrorText,ForwardCollisionWarning,FuelInjectionType,FuelTypePrimary,FuelTypeSecondary,GCWR,GCWR_to,GVWR,GVWR_to,KeylessIgnition,LaneCenteringAssistance,LaneDepartureWarning,LaneKeepSystem,LowerBeamHeadlampLightSource,Make,MakeID,Manufacturer,ManufacturerId,Model,ModelID,ModelYear,MotorcycleChassisType,MotorcycleSuspensionType,NCSABodyType,NCSAMake,NCSAMapExcApprovedBy,NCSAMapExcApprovedOn,NCSAMappingException,NCSAModel,NCSANote,NonLandUse,Note,OtherBusInfo,OtherEngineInfo,OtherMotorcycleInfo,OtherRestraintSystemInfo,OtherTrailerInfo,ParkAssist,PedestrianAutomaticEmergencyBraking,PlantCity,PlantCompanyName,PlantCountry,PlantState,PossibleValues,Pretensioner,RearAutomaticEmergencyBraking,RearCrossTrafficAlert,RearVisibilitySystem,SAEAutomationLevel,SAEAutomationLevel_to,SeatBeltsAll,SeatRows,Seats,SemiautomaticHeadlampBeamSwitching,Series,Series2,SteeringLocation,SuggestedVIN,TPMS,TopSpeedMPH,TrackWidth,TractionControl,TrailerBodyType,TrailerLength,TrailerType,TransmissionSpeeds,TransmissionStyle,Trim,Trim2,Turbo,VIN,ValveTrainDesign,VehicleDescriptor,VehicleType,WheelBaseLong,WheelBaseShort,WheelBaseType,WheelSizeFront,WheelSizeRear,Wheels,Windows,CAid,EPAid
38636,84501,Standard,,,,,,1st and 2nd Rows,1st Row (Driver and Passenger),,,1st Row (Driver and Passenger),Standard,Standard,,2.0,25750.0,,,,,,,,,,,,,,,,,Sport Utility Vehicle (SUV)/Multi-Purpose Vehi...,,,Not Applicable,,Not Applicable,,,,,,Water,,Not Applicable,Standard,,2400.0,146.456986,2.4,5.0,4WD,,Standard,,Standard,,,In-Line,4.0,4.0,184.0,,137.2088,Honda,K24W9/K24V9,,6,6 - Incomplete VIN,,,Gasoline,,,,"Class 1C: 4,001 - 5,000 lb (1,814 - 2,268 kg)",,Standard,,,,,HONDA,474.0,HONDA,990,CR V,1865.0,2019,Not Applicable,Not Applicable,,,,,,,,,,,Direct Fuel Injection,,"seat belts: front, rear, rear center",,,,ALLISTON,,CANADA,ONTARIO,,,,,Standard,,,Manual,2.0,5.0,Standard,LX,,Left-Hand Drive (LHD),,Direct,,,Standard,Not Applicable,,Not Applicable,,Continuously Variable Transmission (CVT),,,,2HKRW6H3*KH,Dual Overhead Cam (DOHC),2HKRW6H3*KH,MULTIPURPOSE PASSENGER VEHICLE (MPV),,104.7,,17.0,17.0,4.0,,30301.0,11237.0
38637,84501,Standard,,,,,"In the Possible values section, the Numeric va...",1st and 2nd Rows,1st Row (Driver and Passenger),,,1st Row (Driver and Passenger),Standard,Standard,,2.0,25750.0,,,,,,,,,,,,,,,,,Sport Utility Vehicle (SUV)/Multi-Purpose Vehi...,,,Not Applicable,,Not Applicable,,,,,,Water,,Not Applicable,Standard,,2400.0,146.456986,2.4,5.0,4WD,,Standard,,Standard,,,In-Line,4.0,4.0,184.0,,137.2088,Honda,K24W9/K24V9,,4614,"4 - VIN corrected, error in one position only ...",,,Gasoline,,,,"Class 1C: 4,001 - 5,000 lb (1,814 - 2,268 kg)",,Standard,,,,,HONDA,474.0,HONDA,990,CR V,1865.0,2019,Not Applicable,Not Applicable,,,,,,,,,,,Direct Fuel Injection,,"seat belts: front, rear, rear center",,,,,,,,(11:ABCEGHLMSTUXY),,,,Standard,,,Manual,2.0,5.0,Standard,LX,,Left-Hand Drive (LHD),2HKRW6H3*K!,Direct,,,Standard,Not Applicable,,Not Applicable,,Continuously Variable Transmission (CVT),,,,2HKRW6H3*KN,Dual Overhead Cam (DOHC),2HKRW6H3*KN,MULTIPURPOSE PASSENGER VEHICLE (MPV),,104.7,,17.0,17.0,4.0,,30301.0,11237.0


In [143]:
nhtsa_cleaned[nhtsa_cleaned["VIN"].str[0:10]=="W1Y5EDHY*L"]

Unnamed: 0,vin_row_start,ABS,ActiveSafetySysNote,AdaptiveCruiseControl,AdaptiveDrivingBeam,AdaptiveHeadlights,AdditionalErrorText,AirBagLocCurtain,AirBagLocFront,AirBagLocKnee,AirBagLocSeatCushion,AirBagLocSide,AutoReverseSystem,AutomaticPedestrianAlertingSound,AxleConfiguration,Axles,BasePrice,BatteryA,BatteryA_to,BatteryCells,BatteryInfo,BatteryKWh,BatteryKWh_to,BatteryModules,BatteryPacks,BatteryType,BatteryV,BatteryV_to,BedLengthIN,BedType,BlindSpotIntervention,BlindSpotMon,BodyCabType,BodyClass,BrakeSystemDesc,BrakeSystemType,BusFloorConfigType,BusLength,BusType,CAN_AACN,CIB,CashForClunkers,ChargerLevel,ChargerPowerKW,CoolingType,CurbWeightLB,CustomMotorcycleType,DaytimeRunningLight,DestinationMarket,DisplacementCC,DisplacementCI,DisplacementL,Doors,DriveType,DriverAssist,DynamicBrakeSupport,EDR,ESC,EVDriveUnit,ElectrificationLevel,EngineConfiguration,EngineCycles,EngineCylinders,EngineHP,EngineHP_to,EngineKW,EngineManufacturer,EngineModel,EntertainmentSystem,ErrorCode,ErrorText,ForwardCollisionWarning,FuelInjectionType,FuelTypePrimary,FuelTypeSecondary,GCWR,GCWR_to,GVWR,GVWR_to,KeylessIgnition,LaneCenteringAssistance,LaneDepartureWarning,LaneKeepSystem,LowerBeamHeadlampLightSource,Make,MakeID,Manufacturer,ManufacturerId,Model,ModelID,ModelYear,MotorcycleChassisType,MotorcycleSuspensionType,NCSABodyType,NCSAMake,NCSAMapExcApprovedBy,NCSAMapExcApprovedOn,NCSAMappingException,NCSAModel,NCSANote,NonLandUse,Note,OtherBusInfo,OtherEngineInfo,OtherMotorcycleInfo,OtherRestraintSystemInfo,OtherTrailerInfo,ParkAssist,PedestrianAutomaticEmergencyBraking,PlantCity,PlantCompanyName,PlantCountry,PlantState,PossibleValues,Pretensioner,RearAutomaticEmergencyBraking,RearCrossTrafficAlert,RearVisibilitySystem,SAEAutomationLevel,SAEAutomationLevel_to,SeatBeltsAll,SeatRows,Seats,SemiautomaticHeadlampBeamSwitching,Series,Series2,SteeringLocation,SuggestedVIN,TPMS,TopSpeedMPH,TrackWidth,TractionControl,TrailerBodyType,TrailerLength,TrailerType,TransmissionSpeeds,TransmissionStyle,Trim,Trim2,Turbo,VIN,ValveTrainDesign,VehicleDescriptor,VehicleType,WheelBaseLong,WheelBaseShort,WheelBaseType,WheelSizeFront,WheelSizeRear,Wheels,Windows,CAid,EPAid
57012,124601,,,,,,,Driver Seat Only,1st Row (Driver and Passenger),,,1st Row (Driver and Passenger),,,,,,,,,,,,,,,,,,,,,,Cargo Van,,Hydraulic,Not Applicable,,Not Applicable,,,,,,,,Not Applicable,,,3000.0,183.071232,3.0,,2WD,,,,,,,,,,,,,,OM642,,6,6 - Incomplete VIN,,,Diesel,,,,"Class 1D: 5,001 - 6,000 lb (2,268 - 2,722 kg)",,,,,,,MERCEDES-BENZ,449.0,MERCEDES-BENZ,1023,Sprinter,1703.0,2020,Not Applicable,Not Applicable,,,,,,,,,Body Length:7367 mm,,,,,,,,CHARLESTON,,UNITED STATES (USA),SOUTH CAROLINA,,,,,,,,,,,,3500,907 (VS30),,,,,,,Not Applicable,,Not Applicable,,,,,,W1Y5EDHY*LT,,W1Y5EDHY*LT,TRUCK,,170.0,,,,,,31620.0,
57013,124601,,,,,,,Driver Seat Only,1st Row (Driver and Passenger),,,1st Row (Driver and Passenger),,,,,,,,,,,,,,,,,,,,,,Cargo Van,,Hydraulic,Not Applicable,,Not Applicable,,,,,,,,Not Applicable,,,3000.0,183.071232,3.0,,2WD,,,,,,,,,,,,,,OM642,,6,6 - Incomplete VIN,,,Diesel,,,,"Class 1D: 5,001 - 6,000 lb (2,268 - 2,722 kg)",,,,,,,MERCEDES-BENZ,449.0,MERCEDES-BENZ,1023,Sprinter,1703.0,2020,Not Applicable,Not Applicable,,,,,,,,,Body Length:7367 mm,,,,,,,,DÜSSELDORF,,GERMANY,,,,,,,,,,,,,3500,907 (VS30),,,,,,,Not Applicable,,Not Applicable,,,,,,W1Y5EDHY*LP,,W1Y5EDHY*LP,TRUCK,,170.0,,,,,,31620.0,


In [56]:
nhtsa_cleaned_small["VIN"] = nhtsa_cleaned_small["VIN"].str[0:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nhtsa_cleaned_small["VIN"] = nhtsa_cleaned_small["VIN"].str[0:10]


Make the RL Polk data ready for matching (by converting the check character to a "*"

In [64]:
ct_vin_data.head()

Unnamed: 0,everREPORT_YEAR_MONTH,MAKE,MODEL,MODEL_YEAR,TRANSACTION_PRICE,ZIP_CODE,VIN_PREFIX,COUNTY_NAME,STATE_ABBRV,VEH_COUNT,YEAR
0,201909,HONDA,CR-V,2019,25750.0,6461,2HKRW6H39K,NEW HAVEN,CT,1,2019
1,201902,HONDA,CIVIC,2019,17522.0,6512,2HGFC2F65K,NEW HAVEN,CT,1,2019
2,201909,KIA,FORTE,2019,,6078,3KPF24AD2K,HARTFORD,CT,1,2019
3,201911,LEXUS,RX,2020,55356.0,6037,2T2YGMDA2L,HARTFORD,CT,1,2019
4,201907,HONDA,CR-V,2019,25769.0,6084,2HKRW6H32K,TOLLAND,CT,1,2019


In [67]:
ct_vin_data["VIN_PREFIX_NEW"] = ct_vin_data["VIN_PREFIX"].str[0:8]+"*"+ct_vin_data["VIN_PREFIX"].str[9:10]

***
# Identify duplicate VINS in NHTSA dataset
**Note:**
* NHTSA uses the first 11 alpha-numerics with the check variable in the 9th place replaced by '\*': ie ABCEDFGH\*JK
* RL Polk just uses the first 10 alpha-numerics including the check variable. ie ABCEDFGHIJ
* Therefore, to merge on the NHTSA VIN dataset, we must first **drop the 11th alpha-numeric**. However, this **leads to duplicates in the NHTSA VIN list** since VINs that were previously separate now become identical.

To address this, we undertake a quality check to determine how much 10-long VINS within the same 11-long family differ from one another. If they do not differ on meaningful characteristics, then we can arbitrarily drop the 11th character and simply choose the first 10-long VIN matching that pattern

### Inspect those VINS that are not unique

In [363]:
# Get those rows that are not unique

# Create a copy of the initial list
nhtsa_cleaned_notunique = nhtsa_cleaned.copy(deep = True)

# Drop the 11th character of the VINs in the first list
nhtsa_cleaned_notunique["VIN"] = nhtsa_cleaned_notunique["VIN"].str[0:10]

# Create counter to sum
nhtsa_cleaned_notunique["counter"] = 1

# Group by VIN and take a count. The count of the column "counter" reflects the number of repeats
nhtsa_cleaned_notunique_grouped = nhtsa_cleaned_notunique.groupby("VIN").count().reset_index()

In [369]:
# View result
nhtsa_cleaned_notunique_grouped.head()

Unnamed: 0,VIN,vin_row_start,ABS,ActiveSafetySysNote,AdaptiveCruiseControl,AdaptiveDrivingBeam,AdaptiveHeadlights,AdditionalErrorText,AirBagLocCurtain,AirBagLocFront,AirBagLocKnee,AirBagLocSeatCushion,AirBagLocSide,AutoReverseSystem,AutomaticPedestrianAlertingSound,AxleConfiguration,Axles,BasePrice,BatteryA,BatteryA_to,BatteryCells,BatteryInfo,BatteryKWh,BatteryKWh_to,BatteryModules,BatteryPacks,BatteryType,BatteryV,BatteryV_to,BedLengthIN,BedType,BlindSpotIntervention,BlindSpotMon,BodyCabType,BodyClass,BrakeSystemDesc,BrakeSystemType,BusFloorConfigType,BusLength,BusType,CAN_AACN,CIB,CashForClunkers,ChargerLevel,ChargerPowerKW,CoolingType,CurbWeightLB,CustomMotorcycleType,DaytimeRunningLight,DestinationMarket,DisplacementCC,DisplacementCI,DisplacementL,Doors,DriveType,DriverAssist,DynamicBrakeSupport,EDR,ESC,EVDriveUnit,ElectrificationLevel,EngineConfiguration,EngineCycles,EngineCylinders,EngineHP,EngineHP_to,EngineKW,EngineManufacturer,EngineModel,EntertainmentSystem,ErrorCode,ErrorText,ForwardCollisionWarning,FuelInjectionType,FuelTypePrimary,FuelTypeSecondary,GCWR,GCWR_to,GVWR,GVWR_to,KeylessIgnition,LaneCenteringAssistance,LaneDepartureWarning,LaneKeepSystem,LowerBeamHeadlampLightSource,Make,MakeID,Manufacturer,ManufacturerId,Model,ModelID,ModelYear,MotorcycleChassisType,MotorcycleSuspensionType,NCSABodyType,NCSAMake,NCSAMapExcApprovedBy,NCSAMapExcApprovedOn,NCSAMappingException,NCSAModel,NCSANote,NonLandUse,Note,OtherBusInfo,OtherEngineInfo,OtherMotorcycleInfo,OtherRestraintSystemInfo,OtherTrailerInfo,ParkAssist,PedestrianAutomaticEmergencyBraking,PlantCity,PlantCompanyName,PlantCountry,PlantState,PossibleValues,Pretensioner,RearAutomaticEmergencyBraking,RearCrossTrafficAlert,RearVisibilitySystem,SAEAutomationLevel,SAEAutomationLevel_to,SeatBeltsAll,SeatRows,Seats,SemiautomaticHeadlampBeamSwitching,Series,Series2,SteeringLocation,SuggestedVIN,TPMS,TopSpeedMPH,TrackWidth,TractionControl,TrailerBodyType,TrailerLength,TrailerType,TransmissionSpeeds,TransmissionStyle,Trim,Trim2,Turbo,ValveTrainDesign,VehicleDescriptor,VehicleType,WheelBaseLong,WheelBaseShort,WheelBaseType,WheelSizeFront,WheelSizeRear,Wheels,Windows,CAid,EPAid,counter
0,19UDE2F3*G,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,1,1,1,1,0,0,0,0,0,0,1,0,1,1,0,1,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,1,1,1
1,19UDE2F3*H,3,3,3,0,3,0,1,3,3,0,0,3,0,0,0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,3,3,0,0,3,0,3,0,0,0,0,0,3,0,3,3,0,3,3,3,3,3,0,3,0,3,0,0,3,3,3,3,0,3,3,3,0,3,3,0,0,3,0,0,0,3,0,3,0,0,0,0,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,3,0,3,0,0,0,2,0,2,2,1,0,0,0,3,0,0,3,3,3,3,3,0,3,1,3,3,0,3,3,0,3,3,3,0,0,0,3,3,3,0,3,0,3,3,3,0,3,3,3
2,19UDE2F3*J,2,2,2,2,0,0,1,2,2,0,0,2,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,2,0,2,0,2,0,0,0,2,0,2,2,0,2,2,2,2,2,0,2,0,2,0,0,2,2,2,2,0,2,2,2,0,2,2,0,0,2,0,0,0,2,0,2,0,0,0,0,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,1,0,1,1,1,0,0,0,2,0,0,2,2,2,2,2,0,2,1,2,2,0,2,2,0,2,2,2,0,0,0,2,2,2,0,2,0,2,2,2,0,2,2,2
3,19UDE2F3*K,1,1,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,1,0,1,0,0,0,1,0,1,1,0,1,1,1,1,1,0,1,0,1,0,0,1,1,1,1,0,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,1,1,1,0,1,0,1,0,0,1,1,0,1,1,1,0,0,0,1,1,1,0,1,0,1,1,1,0,1,1,1
4,19UDE2F3*L,7,7,0,0,0,0,0,7,7,0,0,7,7,0,0,7,7,0,0,0,0,0,0,0,0,0,0,0,0,7,0,7,7,7,0,0,7,0,7,7,0,0,0,0,7,0,7,7,0,7,7,7,7,7,0,7,7,7,0,0,7,7,7,7,0,7,7,7,0,7,7,0,0,7,0,0,0,7,0,0,0,0,0,0,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0,0,0,7,0,7,0,7,0,0,0,7,0,7,7,0,0,0,0,7,0,0,7,7,7,7,7,0,7,0,7,0,0,7,7,0,7,7,7,0,0,0,7,7,7,0,7,0,0,0,7,0,7,0,7


In [370]:
# See how long this is
print(f"The length of the grouped dataset is {len(nhtsa_cleaned_notunique_grouped)}")

The length of the grouped dataset is 48645


In [None]:
# Create a list of the nonunique VINs

# To do this, we take all of the VINs with a counter of greater than 1.
# This is the list of 10-digit VINs, belonging to an 11-digit family, that has more than 1 member (i.e.;
# 10-digit VINs that have siblings)
vins_not_unique = list(nhtsa_cleaned_notunique_grouped[nhtsa_cleaned_notunique_grouped["counter"]>1]["VIN"])

In [371]:
# Look to see how many of these VINs there are
len(vins_not_unique)

12590

In [372]:
# Create a subset of the initial DF, containing only non-unique VINs
nhtsa_cleaned_notunique_only = nhtsa_cleaned_notunique.loc[nhtsa_cleaned_notunique.VIN.isin(vins_not_unique), :]
len(nhtsa_cleaned_notunique_only)

36081

### Explanation
We have cleaned the NHTSA dataframe to contain only entries for 10-digit VINs, belonging to multi-member 11-digit families. That is, when we shorten from an 11-digit to a 10-digit VIN by dropping the 11th digit, these are the VINs that now look identical to some other VIN.

We now investigate how similar the vehicle attributes are for these VINs

### Log differences between identical VINs

In [373]:
# Replace all NaNs with 0 to facilitate comparison later on
nhtsa_cleaned_notunique_only = nhtsa_cleaned_notunique_only.replace(np.nan,0)

In [367]:
# Create a file to record progress
outfh_path = path / "data" / "outputs" / "track_diffs.txt"
out_fh = open(outfh_path, "w" if not os.path.exists(outfh_path) else "a")

In [None]:
# This function will output a large text file, containing all the differences between
# Rows that share the same VIN

def diffs(df):
    index = 0
    num_indices = len(vins_not_unique)
    # Do this for every VIN in the non unique vins list 
    for vin in vins_not_unique:
        if (index % 500 == 0):
            print(f"Working on index number {index} of {num_indices}")
        
        # Extract the entries of the dataframe with that VIN
        entries = df[df["VIN"] == vin].reset_index()
        num_entries = len(entries)
        
        # Record it in the log file
        out_fh.write(f"Working on VIN number {vin}, number of entries: {num_entries}\n")
        
        # Only if that VIN is to be found
        if num_entries > 0:
            
            # Create a list of all possible combinations of rows
            combos = list(combinations(list(np.arange(0, num_entries)), 2))
            
            # Go column by column
            columns = list(entries.columns)
            columns.remove("index")
            columns.remove("vin_row_start")
            
            for column in columns:
                diff_count = 0
                cells = list(entries[column])
                
                # Compare combos
                for combo in combos:
                    if (cells[combo[0]] != cells[combo[1]]):
                        out_fh.write(f"... Difference in column {column}:\n")
                        out_fh.write(f"... ... Difference between {cells[combo[0]]} and {cells[combo[1]]}\n")
                        
            index +=1
    out_fh.close()

diffs(nhtsa_cleaned_notunique_only)

This text file shows that the majority of the differences between vehicles that share the same shortened 10-digit VIN  are aesthetic only. That is, they pertain mostly to the following columns:
* AdditionalErrorText
* ErrorCode
* ErrorText
* PlantState
* PlantCounty
* VehicleDescriptor

These differences mean we should be able to freely drop the 11th digit, and simply match on the 10 remaining (we can choose the first row for which that 10-digit VIN appears).

### Produce a shortened VIN list

In [477]:
# Create a new DF
nhtsa_for_merge = nhtsa_cleaned.copy(deep = True)

# Set VINs to 10-long
nhtsa_for_merge["VIN"] = nhtsa_cleaned["VIN"].str[0:10]

# Those with drop set to 0 are to be kept
nhtsa_for_merge["drop"] = 0

# Sort by VIN so we can check the row above
nhtsa_for_merge = nhtsa_for_merge.sort_values("VIN").reset_index()

In [478]:
# Loop through the rows and find if the VIN is the same as the VIN above
# If it is, marke it to be dropped
for index, row in tqdm(nhtsa_for_merge.iterrows()):
    if nhtsa_for_merge.iloc[index, nhtsa_for_merge.columns.get_loc("VIN")] == nhtsa_for_merge.iloc[index - 1,  nhtsa_for_merge.columns.get_loc("VIN")]:
        nhtsa_for_merge.iloc[index, nhtsa_for_merge.columns.get_loc("drop")] = 1

72136it [00:10, 7140.75it/s]


In [480]:
# Drop all marked as 1
nhtsa_for_merge = nhtsa_for_merge[nhtsa_for_merge["drop"]==0]

In [481]:
# Prepare the EV drive unit
nhtsa_for_merge["EVDriveUnit"] = nhtsa_for_merge["EVDriveUnit"].fillna('non-ev')

# Drop unnecessary columns
nhtsa_for_merge = nhtsa_for_merge.iloc[:, nhtsa_for_merge.columns.isin(["VIN", "EVDriveUnit", "FuelTypePrimary"])]

In [476]:
# Observe different fuel types
nhtsa_cleaned["FuelTypePrimary"].unique()

array(['Gasoline', nan, 'Electric', 'Diesel',
       'Flexible Fuel Vehicle (FFV)', 'Compressed Natural Gas (CNG)',
       'Ethanol (E85)', 'Liquefied Petroleum Gas (propane or LPG)',
       'Compressed Hydrogen/Hydrogen', 'Fuel Cell'], dtype=object)

***

# Match data

### Prepare Data

In [493]:
# Get CT Data
ct_vin_data_to_match = ct_vin_data.loc[:, ~ct_vin_data.columns.isin(["VIN_PREFIX", "TRANSACTION_PRICE"])]

In [494]:
# Check len
len(ct_vin_data_to_match)

924477

In [495]:
ct_vin_data_to_match.head()

Unnamed: 0,everREPORT_YEAR_MONTH,MAKE,MODEL,MODEL_YEAR,ZIP_CODE,COUNTY_NAME,STATE_ABBRV,VEH_COUNT,YEAR,VIN_PREFIX_NEW
0,201909,HONDA,CR-V,2019,6461,NEW HAVEN,CT,1,2019,2HKRW6H3*K
1,201902,HONDA,CIVIC,2019,6512,NEW HAVEN,CT,1,2019,2HGFC2F6*K
2,201909,KIA,FORTE,2019,6078,HARTFORD,CT,1,2019,3KPF24AD*K
3,201911,LEXUS,RX,2020,6037,HARTFORD,CT,1,2019,2T2YGMDA*L
4,201907,HONDA,CR-V,2019,6084,TOLLAND,CT,1,2019,2HKRW6H3*K


### Match

In [497]:
match_4 = ct_vin_data.merge(nhtsa_for_merge, how = 'left', left_on = "VIN_PREFIX_NEW", right_on = "VIN")

### Check quality
Note: We use [this](https://portal.ct.gov/-/media/DEEP/air/mobile/CHEAPR/EV-Reg-Fact-Sheet.pdf) fact sheet to check quality. 

####  Unmatched VINs

In [501]:
# First, check the number of rows that have not been matched
unmatched_count = match_4["VIN"].isna().sum()
unmatched_count

295950

In [None]:
# Get a list of unmatched vins
unmatched = list(match_4[match_4["VIN"].isna()]["VIN_PREFIX_NEW"].unique())

In [527]:
# The number of unique VIN Prefixes that we did not locate in our NHTSA database
len(unmatched)

9826

In [503]:
# Take an example unmatched VIN
match_4[match_4["VIN"].isna()].iloc[0,:]

everREPORT_YEAR_MONTH        201902
MAKE                          HONDA
MODEL                         CIVIC
MODEL_YEAR                     2019
TRANSACTION_PRICE           17522.0
ZIP_CODE                       6512
VIN_PREFIX               2HGFC2F65K
COUNTY_NAME               NEW HAVEN
STATE_ABBRV                      CT
VEH_COUNT                         1
YEAR                           2019
VIN_PREFIX_NEW           2HGFC2F6*K
EVDriveUnit                     NaN
FuelTypePrimary                 NaN
VIN                             NaN
Name: 1, dtype: object

In [506]:
# See if this VIN_PREFIX_NEW can be identified in the nhtsa_cleaned (i.e. RAW) dataset
nhtsa_cleaned[nhtsa_cleaned["VIN"].str[0:10]=="2HGFC2F6*K"]

Unnamed: 0,vin_row_start,ABS,ActiveSafetySysNote,AdaptiveCruiseControl,AdaptiveDrivingBeam,AdaptiveHeadlights,AdditionalErrorText,AirBagLocCurtain,AirBagLocFront,AirBagLocKnee,AirBagLocSeatCushion,AirBagLocSide,AutoReverseSystem,AutomaticPedestrianAlertingSound,AxleConfiguration,Axles,BasePrice,BatteryA,BatteryA_to,BatteryCells,BatteryInfo,BatteryKWh,BatteryKWh_to,BatteryModules,BatteryPacks,BatteryType,BatteryV,BatteryV_to,BedLengthIN,BedType,BlindSpotIntervention,BlindSpotMon,BodyCabType,BodyClass,BrakeSystemDesc,BrakeSystemType,BusFloorConfigType,BusLength,BusType,CAN_AACN,CIB,CashForClunkers,ChargerLevel,ChargerPowerKW,CoolingType,CurbWeightLB,CustomMotorcycleType,DaytimeRunningLight,DestinationMarket,DisplacementCC,DisplacementCI,DisplacementL,Doors,DriveType,DriverAssist,DynamicBrakeSupport,EDR,ESC,EVDriveUnit,ElectrificationLevel,EngineConfiguration,EngineCycles,EngineCylinders,EngineHP,EngineHP_to,EngineKW,EngineManufacturer,EngineModel,EntertainmentSystem,ErrorCode,ErrorText,ForwardCollisionWarning,FuelInjectionType,FuelTypePrimary,FuelTypeSecondary,GCWR,GCWR_to,GVWR,GVWR_to,KeylessIgnition,LaneCenteringAssistance,LaneDepartureWarning,LaneKeepSystem,LowerBeamHeadlampLightSource,Make,MakeID,Manufacturer,ManufacturerId,Model,ModelID,ModelYear,MotorcycleChassisType,MotorcycleSuspensionType,NCSABodyType,NCSAMake,NCSAMapExcApprovedBy,NCSAMapExcApprovedOn,NCSAMappingException,NCSAModel,NCSANote,NonLandUse,Note,OtherBusInfo,OtherEngineInfo,OtherMotorcycleInfo,OtherRestraintSystemInfo,OtherTrailerInfo,ParkAssist,PedestrianAutomaticEmergencyBraking,PlantCity,PlantCompanyName,PlantCountry,PlantState,PossibleValues,Pretensioner,RearAutomaticEmergencyBraking,RearCrossTrafficAlert,RearVisibilitySystem,SAEAutomationLevel,SAEAutomationLevel_to,SeatBeltsAll,SeatRows,Seats,SemiautomaticHeadlampBeamSwitching,Series,Series2,SteeringLocation,SuggestedVIN,TPMS,TopSpeedMPH,TrackWidth,TractionControl,TrailerBodyType,TrailerLength,TrailerType,TransmissionSpeeds,TransmissionStyle,Trim,Trim2,Turbo,VIN,ValveTrainDesign,VehicleDescriptor,VehicleType,WheelBaseLong,WheelBaseShort,WheelBaseType,WheelSizeFront,WheelSizeRear,Wheels,Windows,CAid,EPAid


In [508]:
# Take a second example unmatched VIN
match_4[match_4["VIN"].isna()].iloc[17000,:]

everREPORT_YEAR_MONTH        202012
MAKE                         SUBARU
MODEL                       IMPREZA
MODEL_YEAR                     2021
TRANSACTION_PRICE           22494.0
ZIP_CODE                       6498
VIN_PREFIX               4S3GKAB65M
COUNTY_NAME               MIDDLESEX
STATE_ABBRV                      CT
VEH_COUNT                         1
YEAR                           2020
VIN_PREFIX_NEW           4S3GKAB6*M
EVDriveUnit                     NaN
FuelTypePrimary                 NaN
VIN                             NaN
Name: 52279, dtype: object

In [511]:
# See if this VIN_PREFIX_NEW can be identified in the nhtsa_cleaned (i.e. RAW) dataset
nhtsa_cleaned[nhtsa_cleaned["VIN"].str[0:10]=="4S3GKAB6*M"]

Unnamed: 0,vin_row_start,ABS,ActiveSafetySysNote,AdaptiveCruiseControl,AdaptiveDrivingBeam,AdaptiveHeadlights,AdditionalErrorText,AirBagLocCurtain,AirBagLocFront,AirBagLocKnee,AirBagLocSeatCushion,AirBagLocSide,AutoReverseSystem,AutomaticPedestrianAlertingSound,AxleConfiguration,Axles,BasePrice,BatteryA,BatteryA_to,BatteryCells,BatteryInfo,BatteryKWh,BatteryKWh_to,BatteryModules,BatteryPacks,BatteryType,BatteryV,BatteryV_to,BedLengthIN,BedType,BlindSpotIntervention,BlindSpotMon,BodyCabType,BodyClass,BrakeSystemDesc,BrakeSystemType,BusFloorConfigType,BusLength,BusType,CAN_AACN,CIB,CashForClunkers,ChargerLevel,ChargerPowerKW,CoolingType,CurbWeightLB,CustomMotorcycleType,DaytimeRunningLight,DestinationMarket,DisplacementCC,DisplacementCI,DisplacementL,Doors,DriveType,DriverAssist,DynamicBrakeSupport,EDR,ESC,EVDriveUnit,ElectrificationLevel,EngineConfiguration,EngineCycles,EngineCylinders,EngineHP,EngineHP_to,EngineKW,EngineManufacturer,EngineModel,EntertainmentSystem,ErrorCode,ErrorText,ForwardCollisionWarning,FuelInjectionType,FuelTypePrimary,FuelTypeSecondary,GCWR,GCWR_to,GVWR,GVWR_to,KeylessIgnition,LaneCenteringAssistance,LaneDepartureWarning,LaneKeepSystem,LowerBeamHeadlampLightSource,Make,MakeID,Manufacturer,ManufacturerId,Model,ModelID,ModelYear,MotorcycleChassisType,MotorcycleSuspensionType,NCSABodyType,NCSAMake,NCSAMapExcApprovedBy,NCSAMapExcApprovedOn,NCSAMappingException,NCSAModel,NCSANote,NonLandUse,Note,OtherBusInfo,OtherEngineInfo,OtherMotorcycleInfo,OtherRestraintSystemInfo,OtherTrailerInfo,ParkAssist,PedestrianAutomaticEmergencyBraking,PlantCity,PlantCompanyName,PlantCountry,PlantState,PossibleValues,Pretensioner,RearAutomaticEmergencyBraking,RearCrossTrafficAlert,RearVisibilitySystem,SAEAutomationLevel,SAEAutomationLevel_to,SeatBeltsAll,SeatRows,Seats,SemiautomaticHeadlampBeamSwitching,Series,Series2,SteeringLocation,SuggestedVIN,TPMS,TopSpeedMPH,TrackWidth,TractionControl,TrailerBodyType,TrailerLength,TrailerType,TransmissionSpeeds,TransmissionStyle,Trim,Trim2,Turbo,VIN,ValveTrainDesign,VehicleDescriptor,VehicleType,WheelBaseLong,WheelBaseShort,WheelBaseType,WheelSizeFront,WheelSizeRear,Wheels,Windows,CAid,EPAid


#### Attempt to extract unmatched VINs from NHTSA API

In [647]:
resp_df_out = pd.DataFrame([], columns = ["EVDriveUnit", "FuelTypePrimary", "VIN"])

In [648]:
for vin in tqdm(unmatched):
    url = (f"https://vpic.nhtsa.dot.gov/api/vehicles/DecodeVin/{vin}?format=csv")
    resp_df = pd.read_csv(url)
    EVDriveUnit = resp_df.iloc[resp_df[resp_df["variable"]=="EV Drive Unit"].index,3].tolist()[0]
    FuelTypePrimary = resp_df.iloc[resp_df[resp_df["variable"]=="Fuel Type - Primary"].index,3].tolist()[0]
    VIN = f"API: {vin}"
    resp_df_add = pd.DataFrame([[EVDriveUnit, FuelTypePrimary, VIN]], columns = ["EVDriveUnit", "FuelTypePrimary", "VIN"])
    resp_df_out = pd.concat([resp_df_out, resp_df_add])

100%|██████████| 9826/9826 [51:21<00:00,  3.19it/s]  


In [650]:
resp_df_out.to_csv(path / "data" / "outputs" / "nhtsa_queries.csv")

Different fuel types

In [644]:
resp_df_out["FuelTypePrimary"].unique()

array(['Gasoline', 'Diesel', 'Electric'], dtype=object)

VINs for which there is no response / there is not detailed data available

In [652]:
no_response = resp_df_out[resp_df_out["FuelTypePrimary"].isna()]

In [654]:
no_response.head()

Unnamed: 0,EVDriveUnit,FuelTypePrimary,VIN
0,,,API: 4P1BAAFF*N
0,,,API: YV4A221K*L
0,,,API: YV4H600L*N
0,,,API: YV4A221L*N
0,,,API: YV4A221K*N


We need a pure VIN column to match on

In [655]:
resp_df_out["VIN_ONLY"] = resp_df_out["VIN"].str[5:]

#### Fix unmatched VINs

In [661]:
# We split match_4 into matched and unmatched segments
match_4_matched = match_4[~match_4["VIN"].isna()]
match_4_unmatched = match_4[match_4["VIN"].isna()]

In [662]:
# For the unmatched part, we merge in the data from the API
match_4_unmatched_fix = match_4_unmatched.merge(resp_df_out, how = 'left', left_on = 'VIN_PREFIX_NEW', right_on = 'VIN_ONLY')

In [663]:
# Length is still the same
len(match_4_unmatched_fix)

295950

In [668]:
# Check how many are still unmatched - 2030
len(match_4_unmatched_fix[match_4_unmatched_fix["FuelTypePrimary_y"].isna()])

2030

In [669]:
match_4_unmatched_fix.columns

Index(['everREPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'MODEL_YEAR',
       'TRANSACTION_PRICE', 'ZIP_CODE', 'VIN_PREFIX', 'COUNTY_NAME',
       'STATE_ABBRV', 'VEH_COUNT', 'YEAR', 'VIN_PREFIX_NEW', 'EVDriveUnit_x',
       'FuelTypePrimary_x', 'VIN_x', 'EVDriveUnit_y', 'FuelTypePrimary_y',
       'VIN_y', 'VIN_ONLY'],
      dtype='object')

In [671]:
# Remove unneeded columns
match_4_unmatched_fix = match_4_unmatched_fix.iloc[:, ~match_4_unmatched_fix.columns.isin(["EVDriveUnit_x", 
                                                                                           "FuelTypePrimary_x",
                                                                                           "VIN_x"])]

In [675]:
# Rename columns 
match_4_unmatched_fix = match_4_unmatched_fix.rename(columns = {"FuelTypePrimary_y" : "FuelTypePrimary",
                                       "VIN_y" : "VIN",
                                       "EVDriveUnit_y": "EVDriveUnit"})

In [677]:
# Drop the extra VIN column
match_4_unmatched_fix = match_4_unmatched_fix.drop("VIN_ONLY", axis = 1)

In [678]:
# Now concatenate these two dataframes
match_4_resolved = pd.concat([match_4_matched, match_4_unmatched_fix])

In [680]:
# Check they are the correct length
len(match_4_resolved) == len(match_4)

True

In [681]:
# Check the number of NAs has reduced significantly - yes
# Not exactly the same number, but close
len(match_4_resolved[match_4_resolved["FuelTypePrimary"].isna()])

3028

***

In [490]:
match_4[(match_4["FuelTypePrimary"]=="Electric")&(match_4["everREPORT_YEAR_MONTH"].astype(str).str[0:4]=="2020")]["VEH_COUNT"].sum()

1654

In [472]:
ct_vin_data[(ct_vin_data["MAKE"]=="CHEVROLET")&(ct_vin_data["MODEL"]=="BOLT")]

Unnamed: 0,everREPORT_YEAR_MONTH,MAKE,MODEL,MODEL_YEAR,TRANSACTION_PRICE,ZIP_CODE,VIN_PREFIX,COUNTY_NAME,STATE_ABBRV,VEH_COUNT,YEAR,VIN_PREFIX_NEW
675,201905,CHEVROLET,BOLT,2019,43362.0,6460,1G1FZ6S0XK,NEW HAVEN,CT,1,2019,1G1FZ6S0*K
2989,201909,CHEVROLET,BOLT,2019,16772.0,6492,1G1FY6S01K,NEW HAVEN,CT,1,2019,1G1FY6S0*K
3428,202211,CHEVROLET,BOLT,2023,38484.0,6443,1G1FZ6S0XP,NEW HAVEN,CT,1,2022,1G1FZ6S0*P
5794,202205,CHEVROLET,BOLT,2022,17385.0,6010,1G1FY6S08N,HARTFORD,CT,1,2022,1G1FY6S0*N
11686,202105,CHEVROLET,BOLT,2021,45084.0,6854,1G1FZ6S07M,FAIRFIELD,CT,1,2021,1G1FZ6S0*M
...,...,...,...,...,...,...,...,...,...,...,...,...
919711,202005,CHEVROLET,BOLT,2020,37664.0,6477,1G1FY6S08L,NEW HAVEN,CT,1,2020,1G1FY6S0*L
921040,202007,CHEVROLET,BOLT,2020,42200.0,6840,1G1FZ6S08L,FAIRFIELD,CT,1,2020,1G1FZ6S0*L
921699,202002,CHEVROLET,BOLT,2020,34044.0,6281,1G1FY6S0XL,WINDHAM,CT,1,2020,1G1FY6S0*L
922329,202012,CHEVROLET,BOLT,2020,42934.0,6877,1G1FX6S05L,FAIRFIELD,CT,1,2020,1G1FX6S0*L


In [462]:
match[match["EVDriveUnit"]!="non-ev"]

Unnamed: 0,everREPORT_YEAR_MONTH,MAKE,MODEL,MODEL_YEAR,TRANSACTION_PRICE,ZIP_CODE,VIN_PREFIX,COUNTY_NAME,STATE_ABBRV,VEH_COUNT,YEAR,VIN_PREFIX_NEW,EVDriveUnit,VIN
140355,202203,TESLA,MODEL Y,2022,69440.0,6412,7SAYGDEFXN,MIDDLESEX,CT,1,2022,7SAYGDEF*N,Dual Motor,7SAYGDEF*N
140356,202203,TESLA,MODEL Y,2022,69440.0,6412,7SAYGDEFXN,MIDDLESEX,CT,1,2022,7SAYGDEF*N,Dual Motor,7SAYGDEF*N
140357,202203,TESLA,MODEL Y,2022,69440.0,6412,7SAYGDEFXN,MIDDLESEX,CT,1,2022,7SAYGDEF*N,Dual Motor,7SAYGDEF*N
140358,202203,TESLA,MODEL Y,2022,69440.0,6412,7SAYGDEFXN,MIDDLESEX,CT,1,2022,7SAYGDEF*N,Dual Motor,7SAYGDEF*N
140359,202203,TESLA,MODEL Y,2022,69440.0,6412,7SAYGDEFXN,MIDDLESEX,CT,1,2022,7SAYGDEF*N,Dual Motor,7SAYGDEF*N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638068,202202,MERCEDES-BENZ,EQS,2022,,6830,W1KCG2DB3N,FAIRFIELD,CT,1,2022,W1KCG2DB*N,Single Motor,W1KCG2DB*N
1638069,202202,MERCEDES-BENZ,EQS,2022,117155.0,6001,W1KCG2DB8N,HARTFORD,CT,1,2022,W1KCG2DB*N,Single Motor,W1KCG2DB*N
1638070,202112,MERCEDES-BENZ,EQS,2022,122299.0,6074,W1KCG2DB9N,HARTFORD,CT,1,2021,W1KCG2DB*N,Single Motor,W1KCG2DB*N
1638071,202201,MERCEDES-BENZ,EQS,2022,26940.0,6070,W1KCG2DB4N,HARTFORD,CT,1,2022,W1KCG2DB*N,Single Motor,W1KCG2DB*N


In [467]:
match["VIN"].unique()

array(['2HKRW6H3*K', '2T2YGMDA*L', '5NMS3CAD*K', ..., '3C6UR5JJ*L',
       '5XYPK4A5*K', '1C6SRFPM*M'], dtype=object)

In [466]:
match[np.isnan(match["VIN"])]

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [122]:
nhtsa_cleaned_small[nhtsa_cleaned_small["VIN"]=="2HKRW6H3*K"]

Unnamed: 0,EVDriveUnit,VIN
38636,non-ev,2HKRW6H3*K
38637,non-ev,2HKRW6H3*K


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000015F30EA4100>

In [116]:
match[(match["VIN_PREFIX"]=="2HKRW6H39K") & (match["everREPORT_YEAR_MONTH"]==201909) ]

Unnamed: 0,everREPORT_YEAR_MONTH,MAKE,MODEL,MODEL_YEAR,TRANSACTION_PRICE,ZIP_CODE,VIN_PREFIX,COUNTY_NAME,STATE_ABBRV,VEH_COUNT,YEAR,VIN_PREFIX_NEW,EVDriveUnit,VIN
0,201909,HONDA,CR-V,2019,25750.0,6461,2HKRW6H39K,NEW HAVEN,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
1,201909,HONDA,CR-V,2019,25750.0,6461,2HKRW6H39K,NEW HAVEN,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
490,201909,HONDA,CR-V,2019,26448.0,6784,2HKRW6H39K,FAIRFIELD,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
491,201909,HONDA,CR-V,2019,26448.0,6784,2HKRW6H39K,FAIRFIELD,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
958,201909,HONDA,CR-V,2019,24273.0,6051,2HKRW6H39K,HARTFORD,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
959,201909,HONDA,CR-V,2019,24273.0,6051,2HKRW6H39K,HARTFORD,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
1764,201909,HONDA,CR-V,2019,17653.0,6461,2HKRW6H39K,NEW HAVEN,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
1765,201909,HONDA,CR-V,2019,17653.0,6461,2HKRW6H39K,NEW HAVEN,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
1902,201909,HONDA,CR-V,2019,,6051,2HKRW6H39K,HARTFORD,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
1903,201909,HONDA,CR-V,2019,,6051,2HKRW6H39K,HARTFORD,CT,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K


In [95]:
len(ct_vin_data)

924477

In [93]:
len(match)

1641585

In [84]:
match.columns

Index(['everREPORT_YEAR_MONTH', 'MAKE', 'MODEL', 'MODEL_YEAR',
       'TRANSACTION_PRICE', 'ZIP_CODE', 'VIN_PREFIX', 'COUNTY_NAME',
       'STATE_ABBRV', 'VEH_COUNT', 'YEAR', 'VIN_PREFIX_NEW', 'EVDriveUnit',
       'VIN'],
      dtype='object')

In [436]:
evs_sold = match_2[match_2["EVDriveUnit"]!="non-ev"]

In [437]:
evs_sold["VEH_COUNT"].sum()

1637

In [78]:
evs_sold[evs_sold["YEAR"]=="2022"]["VEH_COUNT"].sum()

25666

In [85]:
match_short = match.loc[:, match.columns.isin(["everREPORT_YEAR_MONTH", "MAKE", "MODEL", "MODEL_YEAR", "ZIP_CODE", "VIN_PREFIX", "COUNTY_NAME", "VEH_COUNT", "YEAR", 'VIN_PREFIX_NEW', 'EVDriveUnit',
       'VIN'])]

In [86]:
match_short

Unnamed: 0,everREPORT_YEAR_MONTH,MAKE,MODEL,MODEL_YEAR,ZIP_CODE,VIN_PREFIX,COUNTY_NAME,VEH_COUNT,YEAR,VIN_PREFIX_NEW,EVDriveUnit,VIN
0,201909,HONDA,CR-V,2019,6461,2HKRW6H39K,NEW HAVEN,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
1,201909,HONDA,CR-V,2019,6461,2HKRW6H39K,NEW HAVEN,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
2,201907,HONDA,CR-V,2019,6084,2HKRW6H32K,TOLLAND,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
3,201907,HONDA,CR-V,2019,6084,2HKRW6H32K,TOLLAND,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
4,201910,HONDA,CR-V,2019,6611,2HKRW6H33K,FAIRFIELD,1,2019,2HKRW6H3*K,non-ev,2HKRW6H3*K
...,...,...,...,...,...,...,...,...,...,...,...,...
1641580,202010,MERCEDES-BENZ,SPRINTER,2020,6078,W1Y5EDHY6L,HARTFORD,1,2020,W1Y5EDHY*L,non-ev,W1Y5EDHY*L
1641581,202010,MERCEDES-BENZ,SPRINTER,2020,6078,W1Y5EDHY6L,HARTFORD,1,2020,W1Y5EDHY*L,non-ev,W1Y5EDHY*L
1641582,202001,RAM,RAM,2020,6002,3C6UR5JJ4L,HARTFORD,1,2020,3C6UR5JJ*L,non-ev,3C6UR5JJ*L
1641583,202010,KIA,SORENTO,2019,6811,5XYPK4A50K,FAIRFIELD,1,2020,5XYPK4A5*K,non-ev,5XYPK4A5*K


In [453]:
match_2["EVDriveUnit"]

array(['non-ev', nan, 'Dual Motor', 'Single Motor'], dtype=object)

In [456]:
match_2[match_2["EVDriveUnit"].isin(["Dual Motor", "Single Motor"])]["VEH_COUNT"].sum()

1637

In [468]:
match_3 = ct_vin_data.merge(nhtsa_cleaned_small, how = 'left', left_on = "VIN_PREFIX_NEW", right_on = "VIN")

In [438]:
match_2 = ct_vin_data.merge(nhtsa_for_merge, how = 'left', left_on = "VIN_PREFIX_NEW", right_on = "VIN")

In [115]:
match = ct_vin_data.merge(nhtsa_cleaned_small, left_on = "VIN_PREFIX_NEW", right_on = "VIN")