In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('electric_vehicles_spec.csv')

In [3]:
df.head()

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,efficiency_wh_per_km,range_km,acceleration_0_100_s,...,towing_capacity_kg,cargo_volume_l,seats,drivetrain,segment,length_mm,width_mm,height_mm,car_body_type,source_url
0,Abarth,500e Convertible,155,37.8,Lithium-ion,192.0,235.0,156,225,7.0,...,0.0,185,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1904/Abarth-500e-C...
1,Abarth,500e Hatchback,155,37.8,Lithium-ion,192.0,235.0,149,225,7.0,...,0.0,185,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1903/Abarth-500e-H...
2,Abarth,600e Scorpionissima,200,50.8,Lithium-ion,102.0,345.0,158,280,5.9,...,0.0,360,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3057/Abarth-600e-S...
3,Abarth,600e Turismo,200,50.8,Lithium-ion,102.0,345.0,158,280,6.2,...,0.0,360,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3056/Abarth-600e-T...
4,Aiways,U5,150,60.0,Lithium-ion,,310.0,156,315,7.5,...,,496,5,FWD,JC - Medium,4680,1865,1700,SUV,https://ev-database.org/car/1678/Aiways-U5


In [4]:
print(df.columns)

Index(['brand', 'model', 'top_speed_kmh', 'battery_capacity_kWh',
       'battery_type', 'number_of_cells', 'torque_nm', 'efficiency_wh_per_km',
       'range_km', 'acceleration_0_100_s', 'fast_charging_power_kw_dc',
       'fast_charge_port', 'towing_capacity_kg', 'cargo_volume_l', 'seats',
       'drivetrain', 'segment', 'length_mm', 'width_mm', 'height_mm',
       'car_body_type', 'source_url'],
      dtype='object')


In [5]:
df.drop(
    columns=[
     'efficiency_wh_per_km', 'towing_capacity_kg','length_mm', 'width_mm', 'height_mm', 'fast_charge_port', 'cargo_volume_l', 'drivetrain', 'segment', 'car_body_type', 'source_url', 'seats'
    ],
    inplace=True,
    errors='ignore')
df.head()

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,range_km,acceleration_0_100_s,fast_charging_power_kw_dc
0,Abarth,500e Convertible,155,37.8,Lithium-ion,192.0,235.0,225,7.0,67.0
1,Abarth,500e Hatchback,155,37.8,Lithium-ion,192.0,235.0,225,7.0,67.0
2,Abarth,600e Scorpionissima,200,50.8,Lithium-ion,102.0,345.0,280,5.9,79.0
3,Abarth,600e Turismo,200,50.8,Lithium-ion,102.0,345.0,280,6.2,79.0
4,Aiways,U5,150,60.0,Lithium-ion,,310.0,315,7.5,78.0


In [6]:
df.columns.tolist()

['brand',
 'model',
 'top_speed_kmh',
 'battery_capacity_kWh',
 'battery_type',
 'number_of_cells',
 'torque_nm',
 'range_km',
 'acceleration_0_100_s',
 'fast_charging_power_kw_dc']

In [7]:
df.shape

(478, 10)

In [8]:
#data preprocessing

In [9]:
df.isnull().sum()

brand                          0
model                          1
top_speed_kmh                  0
battery_capacity_kWh           0
battery_type                   0
number_of_cells              202
torque_nm                      7
range_km                       0
acceleration_0_100_s           0
fast_charging_power_kw_dc      1
dtype: int64

In [10]:
df.dropna(inplace=True)

In [11]:
df.shape

(267, 10)

In [12]:
#duplicate check

In [13]:
df.duplicated().sum()

np.int64(0)

In [14]:
df.drop_duplicates(inplace=True)

In [15]:
df.shape

(267, 10)

In [16]:
df.info

<bound method DataFrame.info of       brand                                  model  top_speed_kmh  \
0    Abarth                       500e Convertible            155   
1    Abarth                         500e Hatchback            155   
2    Abarth                    600e Scorpionissima            200   
3    Abarth                           600e Turismo            200   
6      Alfa          Romeo Junior Elettrica 54 kWh            150   
..      ...                                    ...            ...   
456   Volvo              EX90 Twin Motor (MY24-26)            180   
457   Volvo  EX90 Twin Motor Performance (MY24-26)            180   
468   Zeekr                     001 Long Range RWD            200   
469   Zeekr                    001 Performance AWD            200   
470   Zeekr                      001 Privilege AWD            200   

     battery_capacity_kWh battery_type  number_of_cells  torque_nm  range_km  \
0                    37.8  Lithium-ion            192.0    

In [17]:
#data analysis

In [18]:
for col in df.columns:
    print('Unique values of ' + col)
    print(df[col].unique())
    print("==========================")

Unique values of brand
['Abarth' 'Alfa' 'Alpine' 'Audi' 'BYD' 'CUPRA' 'Cadillac' 'Citroen' 'DS'
 'Dacia' 'Fiat' 'Ford' 'Genesis' 'Hyundai' 'Jaguar' 'Jeep' 'Kia' 'Lancia'
 'Lexus' 'Lucid' 'MG' 'Mercedes-Benz' 'NIO' 'Opel' 'Peugeot' 'Polestar'
 'Porsche' 'Renault' 'Skoda' 'Subaru' 'Tesla' 'Toyota' 'Volkswagen'
 'Volvo' 'Zeekr']
Unique values of model
['500e Convertible' '500e Hatchback' '600e Scorpionissima' '600e Turismo'
 'Romeo Junior Elettrica 54 kWh' 'Romeo Junior Elettrica 54 kWh Veloce'
 'A290 Electric 180 hp' 'A290 Electric 220 hp' 'A6 Avant e-tron'
 'A6 Avant e-tron performance' 'A6 Avant e-tron quattro'
 'A6 Sportback e-tron' 'A6 Sportback e-tron performance'
 'A6 Sportback e-tron quattro' 'Q4 Sportback e-tron 40'
 'Q4 Sportback e-tron 45' 'Q4 Sportback e-tron 45 quattro'
 'Q4 Sportback e-tron 55 quattro' 'Q4 e-tron 40' 'Q4 e-tron 45'
 'Q4 e-tron 45 quattro' 'Q4 e-tron 55 quattro' 'Q6 e-tron'
 'Q6 e-tron Sportback' 'Q6 e-tron Sportback performance'
 'Q6 e-tron Sportback quattro

In [19]:
brand_mapping = {brand: str(i+1) for i, brand in enumerate(df['brand'].unique())}
df['brand'] = df['brand'].replace(brand_mapping)

model_mapping = {val: str(i+1) for i, val in enumerate(df['model'].unique())}
df['model'] = df['model'].replace(model_mapping)

battery_type_mapping = {battery_type: str(i+1) for i, battery_type in enumerate(df['battery_type'].unique())}
df['battery_type'] = df['battery_type'].replace(battery_type_mapping)

In [20]:
df

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,range_km,acceleration_0_100_s,fast_charging_power_kw_dc
0,1,1,155,37.8,1,192.0,235.0,225,7.0,67.0
1,1,2,155,37.8,1,192.0,235.0,225,7.0,67.0
2,1,3,200,50.8,1,102.0,345.0,280,5.9,79.0
3,1,4,200,50.8,1,102.0,345.0,280,6.2,79.0
6,2,5,150,50.8,1,102.0,260.0,320,9.0,85.0
...,...,...,...,...,...,...,...,...,...,...
456,34,263,180,107.0,1,204.0,770.0,470,5.9,150.0
457,34,264,180,107.0,1,204.0,910.0,455,4.9,150.0
468,35,265,200,94.0,1,110.0,343.0,505,7.2,135.0
469,35,266,200,94.0,1,110.0,686.0,480,3.8,135.0


In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col].astype(str))
print(df.dtypes)

brand                          int64
model                          int64
top_speed_kmh                  int64
battery_capacity_kWh         float64
battery_type                   int64
number_of_cells              float64
torque_nm                    float64
range_km                       int64
acceleration_0_100_s         float64
fast_charging_power_kw_dc    float64
dtype: object


In [22]:
df.applymap(lambda x: isinstance(x, str)).any()

  df.applymap(lambda x: isinstance(x, str)).any()


brand                        False
model                        False
top_speed_kmh                False
battery_capacity_kWh         False
battery_type                 False
number_of_cells              False
torque_nm                    False
range_km                     False
acceleration_0_100_s         False
fast_charging_power_kw_dc    False
dtype: bool

In [23]:
X = input_data = df.drop(columns=['range_km'])
Y = output_data = df['range_km']

In [24]:
x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2)

In [25]:
model = LinearRegression()

In [26]:
model.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [27]:
df = pd.get_dummies(df, drop_first=True)

In [28]:
from sklearn.model_selection import train_test_split
X = df.drop('range_km', axis=1)
y = df['range_km']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)


In [30]:
#mse
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

1122.4673047663605

In [31]:
#R2 Score
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8795343608303959

In [32]:
predict = model.predict(X_test_scaled)

In [33]:
predict

array([426.90601968, 423.59364167, 487.57865153, 301.49123935,
       498.74689859, 518.37057844, 290.07969855, 431.89468232,
       485.85984601, 356.97834868, 527.82317351, 452.91558959,
       431.90008155, 382.52548833, 477.27884466, 333.64922747,
       202.66385861, 407.44512043, 429.49738923, 510.74357078,
       501.86813614, 375.67444526, 488.3275443 , 284.53356343,
       383.38495045, 492.25148562, 527.81102526, 348.17213871,
       361.66214845, 521.646242  , 327.58249724, 282.43735564,
       313.69261051, 396.92353818, 413.00316804, 203.61615564,
       364.82088253, 410.74425685, 378.73323862, 352.24227228,
       354.33468324, 295.69067436, 287.93533022, 442.45929382,
       410.80499809, 170.88232402, 429.01403688, 461.71916405,
       524.51851163, 387.13832053, 428.48606162, 308.3462756 ,
       392.81283553, 482.60502373])

In [34]:
X_test.head(1)

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,acceleration_0_100_s,fast_charging_power_kw_dc
124,3,251,200,91.0,0,376.0,950.0,3.9,115.0


In [35]:
input_data_model = pd.DataFrame(
    [[3, 251, 200, 91.0, 0, 376.0, 950.0, 3.9, 115.0]],
    columns=['brand', 'model', 'top_speed_kmh', 'battery_capacity_kWh', 
             'battery_type', 'number_of_cells', 'torque_nm', 
             'acceleration_0_100_s', 'fast_charging_power_kw_dc']
)

input_scaled = scaler.transform(input_data_model)

prediction = model.predict(input_scaled)

In [36]:
prediction

array([426.90601968])

In [40]:
import pickle as pk

In [41]:
pk.dump(model,open('model.pkl','wb'))

In [43]:
pk.dump(scaler, open("scaler.pkl", "wb"))

In [44]:
df.columns

Index(['brand', 'model', 'top_speed_kmh', 'battery_capacity_kWh',
       'battery_type', 'number_of_cells', 'torque_nm', 'range_km',
       'acceleration_0_100_s', 'fast_charging_power_kw_dc'],
      dtype='object')

In [45]:
X_train.shape, X_test.shape

((213, 9), (54, 9))

In [46]:
df.head()

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,range_km,acceleration_0_100_s,fast_charging_power_kw_dc
0,0,0,155,37.8,0,192.0,235.0,225,7.0,67.0
1,0,111,155,37.8,0,192.0,235.0,225,7.0,67.0
2,0,190,200,50.8,0,102.0,345.0,280,5.9,79.0
3,0,201,200,50.8,0,102.0,345.0,280,6.2,79.0
6,11,212,150,50.8,0,102.0,260.0,320,9.0,85.0
