# EV Sales: 2010-2024

Date source: https://www.kaggle.com/datasets/willianoliveiragibin/ev-sales-2010-2024

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_raw = pd.read_csv("data/IEA Global EV Data 2024 new.csv")

In [3]:
df_raw.head()

Unnamed: 0,region,category,parameter,mode,powertrain,year,unit,value,percentage
0,Austria,Historical,EV stock,Cars,BEV,2010,Vehicles,350,"35000,00%"
1,Austria,Historical,EV stock share,Cars,EV,2010,percent,789.999.961.853,"78999996185300,00%"
2,Belgium,Historical,EV stock,Buses,BEV,2010,Vehicles,3,"300,00%"
3,Belgium,Historical,EV sales,Vans,BEV,2010,Vehicles,7,"700,00%"
4,Belgium,Historical,EV stock,Vans,BEV,2010,Vehicles,62,"6200,00%"


In [7]:
len(df_raw)

12654

In [5]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12654 entries, 0 to 12653
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   region      12654 non-null  object
 1   category    12654 non-null  object
 2   parameter   12654 non-null  object
 3   mode        12654 non-null  object
 4   powertrain  12654 non-null  object
 5   year        12654 non-null  int64 
 6   unit        12654 non-null  object
 7   value       12654 non-null  object
 8   percentage  12654 non-null  object
dtypes: int64(1), object(8)
memory usage: 889.9+ KB


In [8]:
df_raw.describe()

Unnamed: 0,year
count,12654.0
mean,2019.822112
std,5.476494
min,2010.0
25%,2016.0
50%,2020.0
75%,2022.0
max,2035.0


#### Pointers
* There are no null values
* `value` and  `percentage` should be float64, but interpreted as object
* Also `percentage` seems to be in thousands and sometime millions
* The min for `year` is fine (2010), but max id 2035, while it should be 2024

## Data cleaning

In [9]:
df = df_raw.copy()

In [10]:
df.columns

Index(['region', 'category', 'parameter', 'mode', 'powertrain', 'year', 'unit',
       'value', 'percentage'],
      dtype='object')

In [17]:
categorical_cols =  ['region', 'category', 'parameter', 'mode', 'powertrain', 'unit']
numerical_cols = ['value', 'percentage']

In [14]:
for col in categorical_cols:
    df[col] = df[col].str.lower()
    print(f"{col} -> {df[col].unique()}")
    print("------------------------------------------")

region -> ['austria' 'belgium' 'brazil' 'canada' 'china' 'denmark' 'eu27' 'europe'
 'france' 'germany' 'iceland' 'india' 'israel' 'italy' 'japan' 'korea'
 'netherlands' 'new zealand' 'norway' 'poland' 'portugal'
 'rest of the world' 'spain' 'sweden' 'united kingdom' 'usa' 'world'
 'australia' 'chile' 'finland' 'mexico' 'switzerland' 'turkiye' 'greece'
 'south africa' 'bulgaria' 'colombia' 'costa rica' 'czech republic'
 'estonia' 'hungary' 'ireland' 'latvia' 'lithuania' 'romania' 'seychelles'
 'slovakia' 'slovenia' 'thailand' 'united arab emirates' 'croatia'
 'cyprus' 'luxembourg' 'indonesia']
------------------------------------------
category -> ['historical' 'projection-steps' 'projection-aps']
------------------------------------------
parameter -> ['ev stock' 'ev stock share' 'ev sales' 'ev sales share'
 'electricity demand' 'oil displacement mbd'
 'oil displacement, million lge' 'ev charging points']
------------------------------------------
mode -> ['cars' 'buses' 'vans' 'trucks

In [18]:
df["percentage"].apply(lambda a)

value -> ['350' '789.999.961.853' '3' ... '940000' '390000000' '9400000']
------------------------------------------
percentage -> ['35000,00%' '78999996185300,00%' '300,00%' ... '94000000,00%'
 '39000000000,00%' '940000000,00%']
------------------------------------------


In [22]:
np.float64('78999996185300,00%'[:-1].replace(",", ""))

7899999618530000.0

In [25]:
df[df['value'].str.contains("\.")]

Unnamed: 0,region,category,parameter,mode,powertrain,year,unit,value,percentage
1,austria,historical,ev stock share,cars,ev,2010,percent,789.999.961.853,"78999996185300,00%"
8,belgium,historical,ev sales share,cars,ev,2010,percent,9.999.999.776.483,"999999977648300,00%"
12,belgium,historical,ev stock share,buses,ev,2010,percent,18.999.999.389.052,"1899999938905200,00%"
13,belgium,historical,ev sales share,vans,ev,2010,percent,13.000.000.268.221,"1300000026822100,00%"
14,belgium,historical,ev stock share,vans,ev,2010,percent,10.999.999.940.395,"1099999994039500,00%"
...,...,...,...,...,...,...,...,...,...
12565,world,projection-aps,oil displacement mbd,cars,ev,2035,milion barrels per day,769.999.980.926.514,"76999998092651400,00%"
12574,world,projection-steps,oil displacement mbd,buses,ev,2035,milion barrels per day,509.999.990.463.257,"50999999046325700,00%"
12575,world,projection-steps,oil displacement mbd,trucks,ev,2035,milion barrels per day,1.5,"4541300,00%"
12576,world,projection-steps,oil displacement mbd,vans,ev,2035,milion barrels per day,910.000.026.226.044,"91000002622604400,00%"


#### Data issues
* There are a lot of instanced where value is something like 9.999.999.776.483
* Is it supposed to be decimal number (like 9999999776.483) or a integer with.being used in place of,(like9,999,999,776,483)