In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session. No category is given, so this
# also hides deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the EV population dataset.
# The file is looked up in the working directory first (the original behaviour). If it
# is not there, the Kaggle input tree is searched for a file with the same name, so the
# notebook also runs when the dataset is only attached under /kaggle/input/<dataset>/.
# If the file is found in neither place, pd.read_csv raises FileNotFoundError as before.
DATA_FILENAME = "Electric_Vehicle_Population_Data.csv"
data_path = DATA_FILENAME
if not os.path.exists(data_path):
    for input_dir, _, input_files in os.walk('/kaggle/input'):
        if DATA_FILENAME in input_files:
            data_path = os.path.join(input_dir, DATA_FILENAME)
            break

df = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Print a per-column summary: non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Column labels, for reference when selecting features below.
df.columns

In [None]:
# Bar chart of the ten most frequent values in the 'Make' column.
top_makes = df['Make'].value_counts().head(10)  # Top 10 EV manufacturers

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_makes.index, y=top_makes.values, palette="Blues_r", ax=ax)
ax.set_title("Top 10 EV Manufacturers (2025)", fontsize=14)
ax.set_xlabel("Make")
ax.set_ylabel("Number of Vehicles")
ax.tick_params(axis="x", labelrotation=45)  # angled labels so make names do not overlap
plt.show()

In [None]:
# Count of vehicles per model year, with the years shown in ascending order.
model_years = sorted(df["Model Year"].unique())

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="Model Year", order=model_years, palette="Greens_r", ax=ax)
ax.set_title("EVs by Model Year", fontsize=14)
ax.set_xlabel("Year")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Pie chart of the share of each value in 'Electric Vehicle Type'.
ev_type_counts = df['Electric Vehicle Type'].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
ev_type_counts.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette("Set2"),
)
ax.set_title("Distribution of EV Types", fontsize=14)
ax.set_ylabel("")  # hide the axis label pandas adds to pie charts by default
plt.show()

In [None]:
# Histogram (30 bins) of 'Electric Range' with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Electric Range'], bins=30, kde=True, color="darkblue", ax=ax)
ax.set_title("Distribution of Electric Range (miles)", fontsize=14)
ax.set_xlabel("Electric Range (miles)")
ax.set_ylabel("Number of Vehicles")
plt.show()

In [None]:
# Count of vehicles in each CAFV eligibility category.
cafv_column = "Clean Alternative Fuel Vehicle (CAFV) Eligibility"

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x=cafv_column, palette="coolwarm", ax=ax)
ax.set_title("CAFV Eligibility Status", fontsize=14)
ax.set_xlabel("CAFV Eligibility")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=15)  # slight tilt: the category labels are long
plt.show()

In [None]:
# Bar chart of the ten most frequent values in the 'State' column.
top_states = df['State'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_states.index, y=top_states.values, palette="Purples_r", ax=ax)
ax.set_title("Top 10 States by EV Population", fontsize=14)
ax.set_xlabel("State")
ax.set_ylabel("Number of Vehicles")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Bar chart of the ten most frequent values in the 'City' column.
top_cities = df['City'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_cities.index, y=top_cities.values, palette="Oranges_r", ax=ax)
ax.set_title("Top 10 Cities by EV Population", fontsize=14)
ax.set_xlabel("City")
ax.set_ylabel("Number of Vehicles")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Pairwise correlations between a hand-picked set of numeric columns, drawn as an
# annotated heatmap (values shown with two decimals).
numeric_cols = [
    'Model Year',
    'Electric Range',
    'Base MSRP',
    'Legislative District',
    '2020 Census Tract',
]
corr = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (Numeric Features)", fontsize=14)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [None]:
# Split the frame into the feature matrix X and the prediction target y.
target = "Electric Vehicle Type"
X = df.drop(columns=[target])
y = df[target]

# Label Encoding
# Every non-numeric feature column is mapped to integer codes. Checking for "not
# numeric" instead of `dtype == 'object'` also catches columns stored with pandas'
# string or category dtypes, which would otherwise reach the imputer and the models
# as raw text.
# Values are cast to str first, so missing entries become the literal category "nan".
# The fitted encoders are kept per column so the codes can be mapped back to labels.
feature_encoders = {}
for col in X.columns:
    if not pd.api.types.is_numeric_dtype(X[col]):
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        feature_encoders[col] = le

# Encode the target; y_le.classes_ holds the original labels in code order.
y_le = LabelEncoder()
y = y_le.fit_transform(y)

In [None]:
# A column with no observed values cannot be imputed: SimpleImputer leaves such columns
# out of its output by default, so the array would no longer match `X.columns` and the
# DataFrame constructor below would raise. Drop them explicitly first.
X = X.dropna(axis=1, how="all")

# Fill the remaining gaps with each column's most frequent value. The original row
# index is kept so X stays aligned with the source frame.
# NOTE(review): the imputer is fit on all rows before the split, so test rows
# contribute to the fill values.
imputer = SimpleImputer(strategy="most_frequent")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Train-test split (80/20, stratified on the target so class proportions are preserved)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Candidate classifiers with reduced capacity (fewer/shallower trees).
# - random_state is fixed on the tree-based models so repeated runs give the same scores.
# - Logistic Regression and KNN are wrapped with a StandardScaler: the features are
#   integer label codes and raw numeric columns on very different scales, which
#   distorts distance computations and slows lbfgs convergence.
# NOTE: `models` is assigned again in a later cell of this notebook before the training
# loop runs, so this dictionary is only used if that cell is skipped.
models = {
    "Logistic Regression": make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=200, solver='lbfgs', n_jobs=-1),
    ),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=50, max_depth=10, n_jobs=-1, random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=50, max_depth=5, random_state=42
    ),
    "KNN": make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    ),
    "Naive Bayes": GaussianNB()
}

In [None]:
# Most-frequent-value imputation of X (an earlier cell applies the same step).
imputer = SimpleImputer(strategy="most_frequent")  # "mean" is an option for numeric columns
filled_values = imputer.fit_transform(X)
X = pd.DataFrame(filled_values, columns=X.columns)

In [None]:
# The target `y` is already integer-encoded by `y_le` in the cell that separates the
# features from the target. It is deliberately not encoded again here: fitting a new
# LabelEncoder on those integer codes would leave `y` unchanged but replace `y_le` with
# an encoder whose classes_ are the codes themselves, so inverse_transform could no
# longer recover the original labels.

# --- Step 3: Train-test split ---
# 80/20 split, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Classifiers compared by the training loop, keyed by display name.
# - random_state is fixed on every estimator that uses randomness so the accuracy
#   table is reproducible across runs.
# - Logistic Regression, KNN and SVM are wrapped with a StandardScaler: the features
#   are integer label codes and raw numeric columns on very different scales, which
#   distorts distance/kernel computations and slows lbfgs convergence. The scaler is
#   fit inside the pipeline, i.e. on the training data only.
# - `use_label_encoder` is no longer passed to XGBClassifier: the argument is
#   deprecated, and the target is already integer-encoded before training.
models = {
    "Logistic Regression": make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=200, solver='lbfgs'),
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": make_pipeline(StandardScaler(), KNeighborsClassifier()),
    "SVM": make_pipeline(StandardScaler(), SVC()),
    "Naive Bayes": GaussianNB(),
    "XGBoost": xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)
}

In [None]:
# Fit each candidate on the training split and record its accuracy (in percent) on the
# held-out test split, then show the scores as a table.
results = []
for algorithm_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    test_predictions = estimator.predict(X_test)
    accuracy_pct = accuracy_score(y_test, test_predictions) * 100
    results.append((algorithm_name, accuracy_pct))

result_columns = ["Algorithm", "Accuracy (%)"]
results_df = pd.DataFrame(results, columns=result_columns)
print(results_df)