In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

#### Texas EV Registrations: Raw Dataset

In [2]:
# Read the raw Texas EV registration export.
# - "ZIP Code" and "Vehicle GVWR Class" are identifiers, not quantities, so both
#   are loaded as strings. The dtype mapping is keyed by column name rather than
#   by position so it keeps working if the export's column order changes.
# - "Registration Date" is parsed with the explicit %m/%d/%Y format.
df = pd.read_csv(
    Path("../../../../data/raw_data/TX_EV_Registrations.csv"),
    dtype={"ZIP Code": str, "Vehicle GVWR Class": str},
    parse_dates=["Registration Date"],
    date_format="%m/%d/%Y",
)

print("Shape:", df.shape)
display(df.head())

Shape: (2274866, 13)


Unnamed: 0,State,ZIP Code,Registration Date,Vehicle Make,Vehicle Model,Vehicle Model Year,Drivetrain Type,Vehicle GVWR Class,Vehicle GVWR Category,Vehicle Count,DMV Snapshot ID,DMV Snapshot (Date),Latest DMV Snapshot Flag
0,TX,75002,2023-08-01,TESLA,MODEL 3,2021,BEV,1,Light-Duty (Class 1-2A),1,7,DMV Snapshot (10/1/2023),False
1,TX,77090,2023-08-01,TESLA,MODEL 3,2021,BEV,1,Light-Duty (Class 1-2A),1,7,DMV Snapshot (10/1/2023),False
2,TX,76053,2023-08-01,TESLA,MODEL 3,2021,BEV,1,Light-Duty (Class 1-2A),1,7,DMV Snapshot (10/1/2023),False
3,TX,76031,2023-08-01,TESLA,MODEL 3,2021,BEV,1,Light-Duty (Class 1-2A),1,7,DMV Snapshot (10/1/2023),False
4,TX,76028,2023-08-01,TESLA,MODEL 3,2021,BEV,1,Light-Duty (Class 1-2A),1,7,DMV Snapshot (10/1/2023),False


In [3]:
# Print each column name next to its zero-based position in the frame
for position in range(len(df.columns)):
    print(position, df.columns[position])

0 State
1 ZIP Code
2 Registration Date
3 Vehicle Make
4 Vehicle Model
5 Vehicle Model Year
6 Drivetrain Type
7 Vehicle GVWR Class
8 Vehicle GVWR Category
9 Vehicle Count
10 DMV Snapshot ID
11 DMV Snapshot (Date)
12 Latest DMV Snapshot Flag


In [4]:
# Familiarize with the data types.
# Prints a summary of the raw frame: index range, per-column dtype, and
# memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2274866 entries, 0 to 2274865
Data columns (total 13 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   State                     object        
 1   ZIP Code                  object        
 2   Registration Date         datetime64[ns]
 3   Vehicle Make              object        
 4   Vehicle Model             object        
 5   Vehicle Model Year        int64         
 6   Drivetrain Type           object        
 7   Vehicle GVWR Class        object        
 8   Vehicle GVWR Category     object        
 9   Vehicle Count             int64         
 10  DMV Snapshot ID           int64         
 11  DMV Snapshot (Date)       object        
 12  Latest DMV Snapshot Flag  bool          
dtypes: bool(1), datetime64[ns](1), int64(3), object(8)
memory usage: 210.4+ MB


#### Texas EV Registrations: Processed Dataset

In [5]:
# Keep only the columns the analysis uses, in the order they should appear
select_columns = [
    "Registration Date",
    "State",
    "ZIP Code",
    "Vehicle Make",
    "Vehicle Model",
    "Vehicle Model Year",
]

# Selecting with a list of labels yields a new DataFrame; `df` is left untouched
df_clean = df[select_columns]

print(f"Shape: {df_clean.shape}")
display(df_clean.head())

Shape: (2274866, 6)


Unnamed: 0,Registration Date,State,ZIP Code,Vehicle Make,Vehicle Model,Vehicle Model Year
0,2023-08-01,TX,75002,TESLA,MODEL 3,2021
1,2023-08-01,TX,77090,TESLA,MODEL 3,2021
2,2023-08-01,TX,76053,TESLA,MODEL 3,2021
3,2023-08-01,TX,76031,TESLA,MODEL 3,2021
4,2023-08-01,TX,76028,TESLA,MODEL 3,2021


In [6]:
# Switch the column labels to short snake_case names
df_clean = df_clean.rename(
    mapper={
        "Registration Date": "registration_date",
        "State": "state",
        "ZIP Code": "zip_code",
        "Vehicle Make": "make",
        "Vehicle Model": "model",
        "Vehicle Model Year": "year",
    },
    axis="columns",
)

print(f"Shape: {df_clean.shape}")
display(df_clean.head())

Shape: (2274866, 6)


Unnamed: 0,registration_date,state,zip_code,make,model,year
0,2023-08-01,TX,75002,TESLA,MODEL 3,2021
1,2023-08-01,TX,77090,TESLA,MODEL 3,2021
2,2023-08-01,TX,76053,TESLA,MODEL 3,2021
3,2023-08-01,TX,76031,TESLA,MODEL 3,2021
4,2023-08-01,TX,76028,TESLA,MODEL 3,2021


In [7]:
# View data types before saving the cleaned data.
# Prints the index range, per-column dtype, and memory usage of the
# selected and renamed columns.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2274866 entries, 0 to 2274865
Data columns (total 6 columns):
 #   Column             Dtype         
---  ------             -----         
 0   registration_date  datetime64[ns]
 1   state              object        
 2   zip_code           object        
 3   make               object        
 4   model              object        
 5   year               int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 104.1+ MB


In [8]:
# Tally the missing entries per column (a Series indexed by column name)
df_clean.isna().sum()

registration_date    0
state                0
zip_code             0
make                 0
model                0
year                 0
dtype: int64

In [9]:
# Export the cleaned registrations to a CSV file in the processed-data folder.
file_name = "tx_ev_registrations.csv"
file_path = Path(f"../../../../data/processed_data/{file_name}")

# Create the destination folder if it is missing so the cell also works on a
# fresh checkout.
file_path.parent.mkdir(parents=True, exist_ok=True)

if file_path.exists():
    # The previous export is intentionally NOT deleted up front: it stays in
    # place until the new file has been written completely.
    print("File already exists. Overwriting file.")

print("Creating file...")
# Write to a sibling temp file first, then swap it in. If the write fails
# part-way, the previous export is still intact.
tmp_path = file_path.with_name(f"{file_name}.tmp")
df_clean.to_csv(tmp_path, index=False)
tmp_path.replace(file_path)  # replaces an existing target unconditionally
print(f"File saved as {file_name}")

File already exists. Overwriting file.
Creating file...
File saved as tx_ev_registrations.csv
