In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
df = pd.read_csv("data.csv")

In [3]:
df.head(3)

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,E.V_Type,CAFV,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,JTMAB3FV3P,Kitsap,Seabeck,WA,98380.0,2023,TOYOTA,RAV4 PRIME,PHEV,known,42.0,0.0,35.0,240684006,POINT (-122.8728334 47.5798304),PUGET SOUND ENERGY INC,53035090000.0
1,1N4AZ1CP6J,Kitsap,Bremerton,WA,98312.0,2018,NISSAN,LEAF,BEV,known,151.0,0.0,35.0,474183811,POINT (-122.6961203 47.5759584),PUGET SOUND ENERGY INC,53035080000.0
2,5YJ3E1EA4L,King,Seattle,WA,98101.0,2020,TESLA,MODEL 3,BEV,known,266.0,0.0,43.0,113120017,POINT (-122.3340795 47.6099315),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033010000.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205439 entries, 0 to 205438
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   VIN (1-10)            205439 non-null  object 
 1   County                205436 non-null  object 
 2   City                  205436 non-null  object 
 3   State                 205439 non-null  object 
 4   Postal Code           205436 non-null  float64
 5   Model Year            205439 non-null  int64  
 6   Make                  205439 non-null  object 
 7   Model                 205438 non-null  object 
 8   E.V_Type              205439 non-null  object 
 9   CAFV                  205439 non-null  object 
 10  Electric Range        205431 non-null  float64
 11  Base MSRP             205431 non-null  float64
 12  Legislative District  204997 non-null  float64
 13  DOL Vehicle ID        205439 non-null  int64  
 14  Vehicle Location      205431 non-null  object 
 15  

In [5]:
df.describe()

Unnamed: 0,Postal Code,Model Year,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,2020 Census Tract
count,205436.0,205439.0,205431.0,205431.0,204997.0,205439.0,205436.0
mean,98177.97187,2020.960363,52.164342,922.670532,28.970848,227715600.0,52977040000.0
std,2419.037479,2.989059,88.075859,7761.753602,14.910052,72057370.0,1588435000.0
min,1731.0,1997.0,0.0,0.0,1.0,4469.0,1001020000.0
25%,98052.0,2019.0,0.0,0.0,17.0,193532400.0,53033010000.0
50%,98125.0,2022.0,0.0,0.0,33.0,238236800.0,53033030000.0
75%,98372.0,2023.0,48.0,0.0,42.0,261871800.0,53053070000.0
max,99577.0,2025.0,337.0,845000.0,49.0,479254800.0,56021000000.0


In [6]:
df.isna().sum()

VIN (1-10)                0
County                    3
City                      3
State                     0
Postal Code               3
Model Year                0
Make                      0
Model                     1
E.V_Type                  0
CAFV                      0
Electric Range            8
Base MSRP                 8
Legislative District    442
DOL Vehicle ID            0
Vehicle Location          8
Electric Utility          3
2020 Census Tract         3
dtype: int64

In [7]:
print(f"Duplicated values: {df.duplicated().sum()}")

Duplicated values: 0


In [8]:
## Data cleaning

In [9]:
categoric_imput = SimpleImputer(strategy='most_frequent')
numeric_imput = SimpleImputer(strategy='median')

df[['County','City','Model','Vehicle Location','Electric Utility']] = categoric_imput.fit_transform(df[['County','City','Model','Vehicle Location','Electric Utility']])
df[['Postal Code','Electric Range','Base MSRP','Legislative District','2020 Census Tract']] = numeric_imput.fit_transform(df[['Postal Code','Electric Range','Base MSRP','Legislative District','2020 Census Tract']])

In [12]:
print(f'Missing values: {df.isnull().sum().sum()}')
print(f'Duplicated values: {df.duplicated().sum()}')

Missing values: 0
Duplicated values: 0


In [13]:
df = df.drop(columns=['VIN (1-10)','DOL Vehicle ID'])

In [14]:
df.head(3)

Unnamed: 0,County,City,State,Postal Code,Model Year,Make,Model,E.V_Type,CAFV,Electric Range,Base MSRP,Legislative District,Vehicle Location,Electric Utility,2020 Census Tract
0,Kitsap,Seabeck,WA,98380.0,2023,TOYOTA,RAV4 PRIME,PHEV,known,42.0,0.0,35.0,POINT (-122.8728334 47.5798304),PUGET SOUND ENERGY INC,53035090000.0
1,Kitsap,Bremerton,WA,98312.0,2018,NISSAN,LEAF,BEV,known,151.0,0.0,35.0,POINT (-122.6961203 47.5759584),PUGET SOUND ENERGY INC,53035080000.0
2,King,Seattle,WA,98101.0,2020,TESLA,MODEL 3,BEV,known,266.0,0.0,43.0,POINT (-122.3340795 47.6099315),CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA),53033010000.0


In [17]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
#Give every data a numerical label

In [18]:
df['County'] = encoder.fit_transform(df['County'])
df['City'] = encoder.fit_transform(df['City'])
df['State'] = encoder.fit_transform(df['State'])
df['Make'] = encoder.fit_transform(df['Make'])
df['Model'] = encoder.fit_transform(df['Model'])
df['E.V_Type'] = encoder.fit_transform(df['E.V_Type'])
df['CAFV'] = encoder.fit_transform(df['CAFV'])
df['Vehicle Location'] = encoder.fit_transform(df['Vehicle Location'])
df['Electric Utility'] = encoder.fit_transform(df['Electric Utility'])

In [19]:
df.head(3)

Unnamed: 0,County,City,State,Postal Code,Model Year,Make,Model,E.V_Type,CAFV,Electric Range,Base MSRP,Legislative District,Vehicle Location,Electric Utility,2020 Census Tract
0,88,608,42,98380.0,2023,38,115,1,0,42.0,0.0,35.0,633,71,53035090000.0
1,88,67,42,98312.0,2018,28,85,0,0,151.0,0.0,35.0,605,71,53035080000.0
2,86,611,42,98101.0,2020,36,87,0,0,266.0,0.0,43.0,480,55,53033010000.0


In [20]:
#sepreate input and output

x = df.drop(columns=['E.V_Type'])
y = df['E.V_Type']

In [24]:
x

Unnamed: 0,County,City,State,Postal Code,Model Year,Make,Model,CAFV,Electric Range,Base MSRP,Legislative District,Vehicle Location,Electric Utility,2020 Census Tract
0,88,608,42,98380.0,2023,38,115,0,42.0,0.0,35.0,633,71,5.303509e+10
1,88,67,42,98312.0,2018,28,85,0,151.0,0.0,35.0,605,71,5.303508e+10
2,86,611,42,98101.0,2020,36,87,0,266.0,0.0,43.0,480,55,5.303301e+10
3,86,611,42,98125.0,2014,28,85,0,84.0,0.0,46.0,460,55,5.303300e+10
4,183,767,42,98597.0,2017,7,20,0,238.0,0.0,20.0,566,71,5.306701e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205434,28,516,42,98847.0,2022,28,85,2,0.0,0.0,12.0,287,65,5.300796e+10
205435,170,211,42,98208.0,2023,12,54,2,0.0,0.0,44.0,413,71,5.306104e+10
205436,173,113,42,99004.0,2017,36,89,0,200.0,0.0,6.0,127,2,5.306301e+10
205437,86,716,42,98070.0,2018,36,87,0,215.0,0.0,34.0,528,72,5.303303e+10


In [23]:
y

0         1
1         0
2         0
3         0
4         0
         ..
205434    0
205435    0
205436    0
205437    0
205438    0
Name: E.V_Type, Length: 205439, dtype: int64

In [25]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [26]:
#transfer data to scale[0,1], in order to ML
scaler = MinMaxScaler()

x = scaler.fit_transform(x)

In [27]:
x

array([[0.43137255, 0.79063719, 0.95454545, ..., 0.6828479 , 0.97260274,
        0.94573045],
       [0.43137255, 0.08712614, 0.95454545, ..., 0.65264293, 0.97260274,
        0.94573025],
       [0.42156863, 0.79453836, 0.95454545, ..., 0.51779935, 0.75342466,
        0.94569257],
       ...,
       [0.84803922, 0.14694408, 0.95454545, ..., 0.13700108, 0.02739726,
        0.94623795],
       [0.42156863, 0.93107932, 0.95454545, ..., 0.56957929, 0.98630137,
        0.94569294],
       [0.84803922, 0.84525358, 0.95454545, ..., 0.12729234, 0.02739726,
        0.94623789]])

In [30]:
#spearate data to test set and train set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=30)

In [31]:
# Machine Learning Model

In [33]:
model_1 = LogisticRegression()
model_2 = SVC()
model_3 = KNeighborsClassifier()
model_4 = DecisionTreeClassifier()
model_5 = RandomForestClassifier()
model_6 = BaggingClassifier()
model_7 = ExtraTreesClassifier()
model_8 = AdaBoostClassifier()


In [34]:
#model 1 LogisticRegression
model_1.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
y_pred = model_1.predict(x_test)

In [36]:
print(confusion_matrix(y_test, y_pred))

[[48346   238]
 [  668 12380]]


In [38]:
# Calculate the accuracy score
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9852998442367601


In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     48584
           1       0.98      0.95      0.96     13048

    accuracy                           0.99     61632
   macro avg       0.98      0.97      0.98     61632
weighted avg       0.99      0.99      0.99     61632



In [44]:
#model 2 Support Vector Classifier
model_2.fit(x_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [48]:
y_pred = model_2.predict(x_test)

In [49]:
print(confusion_matrix(y_test, y_pred))

[[48388   196]
 [    7 13041]]


In [50]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.996706256490135


In [51]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     48584
           1       0.99      1.00      0.99     13048

    accuracy                           1.00     61632
   macro avg       0.99      1.00      1.00     61632
weighted avg       1.00      1.00      1.00     61632



In [52]:
#model 3 KNeighborsClassifier
model_3.fit(x_train,y_train)
y_pred = model_3.predict(x_test)

In [53]:
print(confusion_matrix(y_test, y_pred))

[[48393   191]
 [   44 13004]]


In [54]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9961870456905504


In [55]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     48584
           1       0.99      1.00      0.99     13048

    accuracy                           1.00     61632
   macro avg       0.99      1.00      0.99     61632
weighted avg       1.00      1.00      1.00     61632



In [56]:
#model 4 DecisionTreeClassifier
model_4.fit(x_train,y_train)
y_pred = model_4.predict(x_test)

In [57]:
print(confusion_matrix(y_test, y_pred))

[[48580     4]
 [    4 13044]]


In [58]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9998701973001038


In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     48584
           1       1.00      1.00      1.00     13048

    accuracy                           1.00     61632
   macro avg       1.00      1.00      1.00     61632
weighted avg       1.00      1.00      1.00     61632



In [60]:
#model 5 RandomForestClassifier
model_5.fit(x_train,y_train)
y_pred = model_5.predict(x_test)

In [61]:
print(confusion_matrix(y_test, y_pred))

[[48577     7]
 [    5 13043]]


In [62]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9998052959501558


In [63]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     48584
           1       1.00      1.00      1.00     13048

    accuracy                           1.00     61632
   macro avg       1.00      1.00      1.00     61632
weighted avg       1.00      1.00      1.00     61632



In [64]:
#model 6 BaggingClassifier
model_6.fit(x_train,y_train)
y_pred = model_6.predict(x_test)

In [65]:
print(confusion_matrix(y_test, y_pred))

[[48580     4]
 [    2 13046]]


In [66]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9999026479750779


In [67]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     48584
           1       1.00      1.00      1.00     13048

    accuracy                           1.00     61632
   macro avg       1.00      1.00      1.00     61632
weighted avg       1.00      1.00      1.00     61632



In [68]:
#model 7 ExtraTreesClassifier
model_7.fit(x_train,y_train)
y_pred = model_7.predict(x_test)

In [69]:
print(confusion_matrix(y_test, y_pred))

[[48575     9]
 [    7 13041]]


In [70]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9997403946002077


In [71]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     48584
           1       1.00      1.00      1.00     13048

    accuracy                           1.00     61632
   macro avg       1.00      1.00      1.00     61632
weighted avg       1.00      1.00      1.00     61632



In [72]:
#model 8 AdaBoostClassifier
model_8.fit(x_train,y_train)
y_pred = model_8.predict(x_test)

In [73]:
print(confusion_matrix(y_test, y_pred))

[[48580     4]
 [    6 13042]]


In [74]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9998377466251298


In [75]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     48584
           1       1.00      1.00      1.00     13048

    accuracy                           1.00     61632
   macro avg       1.00      1.00      1.00     61632
weighted avg       1.00      1.00      1.00     61632



In [76]:
# List of models:
models = [model_1, model_2, model_3, model_4, model_5,
          model_6, model_7, model_8]
models_names = ['LogisticRegression', 'SVC', 'KNeighborsClassifier', 'DecisionTreeClassifier',
                'RandomForestClassifier', 'BaggingClassifier', 'ExtraTreesClassifier', 'AdaBoostClassifier',
                ]

# Calculate train and test scores:
train_score = [model.score(x_train, y_train) for model in models]
test_score = [model.score(x_test, y_test) for model in models]

# Difference between training and testing ratio
ratio = []
for train, test in zip(train_score, test_score):
    result = train - test
    ratio.append(f'{result * 100:.2f}%')

# Measure model state:6
rate = []
for train, test in zip(train_score, test_score):
    if train <= 0.65 and test <= 0.65:
        rate.append('bad')
    elif train > test * 1.10:
        rate.append('overfite')
    elif train > 0.65 and train < 0.80 and test > 0.65 and test < 0.80:
        rate.append('middle')
    elif train >= 0.80 and test >= 0.80 and train < 1.00 and test < 1.00:
        rate.append('good')
    elif train >= 0.80 and test < 0.80:
        rate.append('high train, low test')
    else:
        rate.append('unknown')

# Create DataFrame
model_score = pd.DataFrame({
    'Model': models_names,
    'Train score': [f'{round(score * 100, 2)}%' for score in train_score],
    'Test score': [f'{round(score * 100, 2)}%' for score in test_score],
    'Ratio difference': ratio,
    'Evaluate model': rate,
})

# Show result:
model_score

Unnamed: 0,Model,Train score,Test score,Ratio difference,Evaluate model
0,LogisticRegression,98.54%,98.53%,0.01%,good
1,SVC,99.71%,99.67%,0.04%,good
2,KNeighborsClassifier,99.77%,99.62%,0.15%,good
3,DecisionTreeClassifier,100.0%,99.99%,0.01%,good
4,RandomForestClassifier,100.0%,99.98%,0.02%,good
5,BaggingClassifier,100.0%,99.99%,0.01%,good
6,ExtraTreesClassifier,100.0%,99.97%,0.03%,unknown
7,AdaBoostClassifier,100.0%,99.98%,0.01%,good
