In [None]:
# Electric Vehicle Data Analysis Assignment

# Title: EV Data Analysis - Washington State
# Author: SHWETA TRIPATHI
# Date: 25-05-2025
# Course: Data Analysis with Python – GrowAI


In [None]:
# Importing Libraries and Loading Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the EV population CSV into the working DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Electric_Vehicle_Population_Data.csv")



In [None]:
# Section 1: Data Cleaning
# Replace 0 with NaN in Electric Range and Base MSRP so that zeros are excluded
# from means and dropped by later .dropna() calls.
# NOTE(review): presumably 0 is a placeholder for "not recorded" in this dataset — confirm.
# The result is assigned back to the column: calling .replace(..., inplace=True)
# on a column selection is a chained in-place update, which warns on recent
# pandas and has no effect on `df` once Copy-on-Write is enabled.
df['Electric Range'] = df['Electric Range'].replace(0, np.nan)
df['Base MSRP'] = df['Base MSRP'].replace(0, np.nan)

# Drop fully duplicated rows (original index labels of the kept rows are preserved)
df = df.drop_duplicates()

# Anonymize VINs while keeping uniqueness: each distinct VIN prefix becomes an
# integer category code (missing values become -1).
df['VIN (1-10)'] = df['VIN (1-10)'].astype('category').cat.codes

# Parse Vehicle Location into Longitude and Latitude
def split_location(location):
    """Parse a location string into a pair of floats.

    Two textual layouts are accepted:

    * comma-separated, optionally parenthesised: ``"(a, b)"``
    * WKT-style point, space-separated: ``"POINT (a b)"``

    NOTE(review): the Washington State EV export is believed to store
    ``Vehicle Location`` in the WKT form with longitude first — confirm
    against the CSV.

    Parameters
    ----------
    location : str
        Raw location text. Non-string values (e.g. NaN, None) are tolerated.

    Returns
    -------
    tuple of (float, float)
        The first and second numbers found, in the order they appear in the
        text, or ``(np.nan, np.nan)`` when the value cannot be parsed.
    """
    try:
        text = location.strip()
        # Drop a leading WKT tag so only the coordinate list remains.
        if text.upper().startswith("POINT"):
            text = text[len("POINT"):]
        # Treat commas and whitespace alike as separators between the numbers.
        parts = text.strip().strip("()").replace(",", " ").split()
        return float(parts[0]), float(parts[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: non-string input; IndexError: fewer than two numbers;
        # ValueError/TypeError: a token that is not a valid float.
        return np.nan, np.nan

# Parse every location once and build the two coordinate columns in a single
# DataFrame aligned on df's index. This avoids constructing one pd.Series per
# row inside .apply(). No explicit null check is needed: split_location
# already returns (nan, nan) for missing or unparsable values.
df[['Longitude', 'Latitude']] = pd.DataFrame(
    [split_location(raw) for raw in df['Vehicle Location']],
    index=df.index,
    columns=['Longitude', 'Latitude'],
    dtype=float,
)

In [None]:
# Section 2: Exploratory Data Analysis
# Five most frequent makes and models (value_counts sorts by count, descending)
top_makes, top_models = (
    df[column].value_counts().iloc[:5] for column in ('Make', 'Model')
)

# Ten counties with the most rows
county_distribution = df['County'].value_counts().iloc[:10]

# Row count per model year, ordered by year ascending
model_year_distribution = df['Model Year'].value_counts().sort_index()

# Row count per CAFV eligibility category
cafv_eligibility = df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].value_counts()

# Overall mean electric range (NaNs are skipped), and the ten models with the
# highest mean Base MSRP (models whose MSRP is entirely missing are dropped)
avg_range = df['Electric Range'].mean()
avg_msrp_by_model = (
    df.groupby('Model')['Base MSRP']
    .mean()
    .dropna()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Section 3: Visualizations
def _hbar(series, ax, title):
    """Draw `series` as a horizontal bar chart on `ax` (values on x, index labels on y)."""
    sns.barplot(x=series.values, y=series.index, ax=ax)
    ax.set_title(title)


sns.set(style="whitegrid")
fig, axs = plt.subplots(3, 2, figsize=(16, 16))
fig.suptitle("EV Data Visualizations", fontsize=18)

# Row 0: most common makes and models
_hbar(top_makes, axs[0, 0], "Top 5 EV Makes")
_hbar(top_models, axs[0, 1], "Top 5 EV Models")

# Row 1: counties (bars) and model-year trend (line)
_hbar(county_distribution, axs[1, 0], "Top 10 Counties by EV Count")

year_ax = axs[1, 1]
sns.lineplot(x=model_year_distribution.index, y=model_year_distribution.values, ax=year_ax)
year_ax.set_title("EVs by Model Year")

# Row 2: CAFV eligibility shares (pie) and priciest models (bars)
cafv_ax = axs[2, 0]
cafv_ax.pie(cafv_eligibility.values, labels=cafv_eligibility.index, autopct='%1.1f%%')
cafv_ax.set_title("CAFV Eligibility Distribution")

_hbar(avg_msrp_by_model, axs[2, 1], "Top 10 Models by Avg. MSRP")

# Leave the top 3% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

In [None]:

# Section 4: Linear Regression
# Predict Electric Range from Base MSRP, Model Year and Make.
# Only rows where the target and all three predictors are present are used.
# NOTE(review): if many rows share identical feature values, the random split
# may place identical rows in both train and test — verify before reading
# too much into the score.
model_data = df.loc[:, ['Electric Range', 'Base MSRP', 'Model Year', 'Make']].dropna()
y = model_data['Electric Range']
X = model_data.drop(columns='Electric Range')

# One-hot encode Make (unseen makes at predict time encode as all zeros);
# the numeric columns pass through unchanged.
make_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [('cat', make_encoder, ['Make'])],
    remainder='passthrough',
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# 80/20 split with a fixed seed so the score is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")




In [None]:
# Section 5: Conclusion
# The dataset shows strong growth in EV registrations after 2018, dominated by Tesla models. CAFV eligibility is high,
# and Base MSRP appears to influence the predicted electric range.
# TODO: interpret the R² score printed by the linear regression (values closer to 1 mean the model explains more of
# the variance in electric range) — [Insert Interpretation Here].
