In [99]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, GridSearchCV
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

#Load the data

In [70]:
old_path = "/content/volkswagen_e_golf.csv"
new_path = "/content/new_volkswagen_e_golf.csv"


In [71]:
ds = pd.read_csv(filepath_or_buffer=old_path)
ds = ds[pd.notnull(obj=ds['quantity(kWh)'])]
ds = ds[pd.notnull(obj=ds['avg_speed(km/h)'])]
ds = ds[pd.notnull(obj=ds['consumption(kWh/100km)'])]
ds.to_csv(path_or_buf=new_path)

In [73]:
dataset = pd.read_csv(filepath_or_buffer=new_path)

In [126]:
dataset.shape

(3331, 19)

In [127]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3331 entries, 0 to 3330
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              3331 non-null   int64  
 1   manufacturer            3331 non-null   object 
 2   model                   3331 non-null   object 
 3   version                 3331 non-null   object 
 4   power(kW)               3331 non-null   int64  
 5   fuel_date               3331 non-null   object 
 6   trip_distance(km)       3331 non-null   object 
 7   quantity(kWh)           3331 non-null   float64
 8   fuel_type               3331 non-null   object 
 9   tire_type               3331 non-null   object 
 10  city                    3331 non-null   int64  
 11  motor_way               3331 non-null   int64  
 12  country_roads           3331 non-null   int64  
 13  driving_style           3331 non-null   object 
 14  consumption(kWh/100km)  3331 non-null   

In [80]:
dataset

Unnamed: 0.1,Unnamed: 0,manufacturer,model,version,power(kW),fuel_date,trip_distance(km),quantity(kWh),fuel_type,tire_type,city,motor_way,country_roads,driving_style,consumption(kWh/100km),A/C,park_heating,avg_speed(km/h),ecr_deviation
0,1,Volkswagen,Golf,e-Golf,85,06.02.2019,50,12.29,Electricity,Winter tires,0,0,1,Normal,15.5,0,1,47.0,-1.3
1,2,Volkswagen,Golf,e-Golf,85,05.02.2019,43,8.68,Electricity,Winter tires,0,1,1,Normal,18.0,0,1,58.0,1.2
2,3,Volkswagen,Golf,e-Golf,85,04.02.2019,44,1.50,Electricity,Winter tires,0,1,1,Normal,16.1,0,1,43.0,-0.7
3,4,Volkswagen,Golf,e-Golf,85,04.02.2019,76,14.44,Electricity,Winter tires,0,1,0,Normal,19.0,0,1,76.0,2.2
4,5,Volkswagen,Golf,e-Golf,85,03.02.2019,15,6.84,Electricity,Winter tires,1,0,0,Normal,16.1,0,1,23.0,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3326,3340,Volkswagen,Golf,e-Golf,85,20.12.2014,26,5.62,Electricity,Winter tires,1,1,1,Normal,21.6,1,0,45.0,4.8
3327,3341,Volkswagen,Golf,e-Golf,85,20.12.2014,77,14.25,Electricity,Winter tires,1,1,1,Normal,18.5,1,0,42.0,1.7
3328,3342,Volkswagen,Golf,e-Golf,85,19.12.2014,26,4.97,Electricity,Winter tires,1,1,1,Normal,19.1,1,0,23.0,2.3
3329,3343,Volkswagen,Golf,e-Golf,85,19.12.2014,20,4.76,Electricity,Winter tires,1,1,1,Fast,23.8,1,0,46.0,7.0


In [91]:
new_df = dataset.drop("consumption(kWh/100km)", axis=1)


In [92]:
new_df

Unnamed: 0.1,Unnamed: 0,manufacturer,model,version,power(kW),fuel_date,trip_distance(km),quantity(kWh),fuel_type,tire_type,city,motor_way,country_roads,driving_style,A/C,park_heating,avg_speed(km/h),ecr_deviation
0,1,Volkswagen,Golf,e-Golf,85,06.02.2019,50,12.29,Electricity,Winter tires,0,0,1,Normal,0,1,47.0,-1.3
1,2,Volkswagen,Golf,e-Golf,85,05.02.2019,43,8.68,Electricity,Winter tires,0,1,1,Normal,0,1,58.0,1.2
2,3,Volkswagen,Golf,e-Golf,85,04.02.2019,44,1.50,Electricity,Winter tires,0,1,1,Normal,0,1,43.0,-0.7
3,4,Volkswagen,Golf,e-Golf,85,04.02.2019,76,14.44,Electricity,Winter tires,0,1,0,Normal,0,1,76.0,2.2
4,5,Volkswagen,Golf,e-Golf,85,03.02.2019,15,6.84,Electricity,Winter tires,1,0,0,Normal,0,1,23.0,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3326,3340,Volkswagen,Golf,e-Golf,85,20.12.2014,26,5.62,Electricity,Winter tires,1,1,1,Normal,1,0,45.0,4.8
3327,3341,Volkswagen,Golf,e-Golf,85,20.12.2014,77,14.25,Electricity,Winter tires,1,1,1,Normal,1,0,42.0,1.7
3328,3342,Volkswagen,Golf,e-Golf,85,19.12.2014,26,4.97,Electricity,Winter tires,1,1,1,Normal,1,0,23.0,2.3
3329,3343,Volkswagen,Golf,e-Golf,85,19.12.2014,20,4.76,Electricity,Winter tires,1,1,1,Fast,1,0,46.0,7.0


In [106]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3331 entries, 0 to 3330
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         3331 non-null   int64  
 1   manufacturer       3331 non-null   object 
 2   model              3331 non-null   object 
 3   version            3331 non-null   object 
 4   power(kW)          3331 non-null   int64  
 5   fuel_date          3331 non-null   object 
 6   trip_distance(km)  3331 non-null   object 
 7   quantity(kWh)      3331 non-null   float64
 8   fuel_type          3331 non-null   object 
 9   tire_type          3331 non-null   object 
 10  city               3331 non-null   int64  
 11  motor_way          3331 non-null   int64  
 12  country_roads      3331 non-null   int64  
 13  driving_style      3331 non-null   object 
 14  A/C                3331 non-null   int64  
 15  park_heating       3331 non-null   int64  
 16  avg_speed(km/h)    3331 

In [110]:
column_name_to_convert = "trip_distance(km)"  # Replace with the actual column name
new_df['trip_distance(km)'] = dataset['trip_distance(km)'].str.replace(',', '').astype(float)

# Print the updated DataFrame
new_df

Unnamed: 0.1,Unnamed: 0,manufacturer,model,version,power(kW),fuel_date,trip_distance(km),quantity(kWh),fuel_type,tire_type,city,motor_way,country_roads,driving_style,A/C,park_heating,avg_speed(km/h),ecr_deviation
0,1,Volkswagen,Golf,e-Golf,85,06.02.2019,50.0,12.29,Electricity,Winter tires,0,0,1,Normal,0,1,47.0,-1.3
1,2,Volkswagen,Golf,e-Golf,85,05.02.2019,43.0,8.68,Electricity,Winter tires,0,1,1,Normal,0,1,58.0,1.2
2,3,Volkswagen,Golf,e-Golf,85,04.02.2019,44.0,1.50,Electricity,Winter tires,0,1,1,Normal,0,1,43.0,-0.7
3,4,Volkswagen,Golf,e-Golf,85,04.02.2019,76.0,14.44,Electricity,Winter tires,0,1,0,Normal,0,1,76.0,2.2
4,5,Volkswagen,Golf,e-Golf,85,03.02.2019,15.0,6.84,Electricity,Winter tires,1,0,0,Normal,0,1,23.0,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3326,3340,Volkswagen,Golf,e-Golf,85,20.12.2014,26.0,5.62,Electricity,Winter tires,1,1,1,Normal,1,0,45.0,4.8
3327,3341,Volkswagen,Golf,e-Golf,85,20.12.2014,77.0,14.25,Electricity,Winter tires,1,1,1,Normal,1,0,42.0,1.7
3328,3342,Volkswagen,Golf,e-Golf,85,19.12.2014,26.0,4.97,Electricity,Winter tires,1,1,1,Normal,1,0,23.0,2.3
3329,3343,Volkswagen,Golf,e-Golf,85,19.12.2014,20.0,4.76,Electricity,Winter tires,1,1,1,Fast,1,0,46.0,7.0


In [111]:
# Convert a specific column to integers
column_name_to_convert = "trip_distance(km)"  # Replace with the actual column name
new_df['trip_distance(km)'] =new_df['trip_distance(km)'].astype(int)

In [112]:

X = new_df.iloc[:, 6:].values
y = dataset["consumption(kWh/100km)"]

In [113]:
X[0]

array([50, 12.29, 'Electricity', 'Winter tires', 0, 0, 1, 'Normal', 0, 1,
       47.0, -1.3], dtype=object)

In [114]:
y

0       15.5
1       18.0
2       16.1
3       19.0
4       16.1
        ... 
3326    21.6
3327    18.5
3328    19.1
3329    23.8
3330    18.3
Name: consumption(kWh/100km), Length: 3331, dtype: float64

In [115]:
"""do the preprocessing tasks on the data"""
# encode categorical features
label_encoder_1 = LabelEncoder()
X[:, 2] = label_encoder_1.fit_transform(X[:, 2])
label_encoder_2 = LabelEncoder()
X[:, 3] = label_encoder_2.fit_transform(X[:, 3])
label_encoder_3 = LabelEncoder()
X[:, 7] = label_encoder_2.fit_transform(X[:, 7])


In [116]:
X[0]

array([50, 12.29, 0, 1, 0, 0, 1, 2, 0, 1, 47.0, -1.3], dtype=object)

In [117]:
# split the dataset into training-set and test-set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [118]:
sc = StandardScaler()
X_train = sc.fit_transform(X=X_train)
X_test = sc.fit_transform(X=X_test)

In [119]:
def regression_models(X_train, y_train):
    from sklearn.linear_model import LinearRegression
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score

    # Using Linear Regression
    linear_reg = LinearRegression()
    linear_reg.fit(X_train, y_train)

    # Using K Neighbors Regressor
    knn_reg = KNeighborsRegressor(n_neighbors=5)
    knn_reg.fit(X_train, y_train)

    # Using Support Vector Regression (Linear Kernel)
    svr_linear = SVR(kernel='linear')
    svr_linear.fit(X_train, y_train)

    # Using Support Vector Regression (RBF Kernel)
    svr_rbf = SVR(kernel='rbf')
    svr_rbf.fit(X_train, y_train)

    # Using Decision Tree Regressor
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(X_train, y_train)

    # Using Random Forest Regressor
    forest_reg = RandomForestRegressor(n_estimators=10)
    forest_reg.fit(X_train, y_train)

    # Print model performance on the training data
    models = [linear_reg, knn_reg, svr_linear, svr_rbf, tree_reg, forest_reg]
    model_names = ['Linear Regression', 'K Neighbors Regressor', 'SVR Linear', 'SVR RBF', 'Decision Tree Regressor', 'Random Forest Regressor']

    for i, model in enumerate(models):
        y_pred = model.predict(X_train)
        r2 = r2_score(y_train, y_pred)
        print(f'[{i}] {model_names[i]} Training R-squared:', r2)

    return models


In [120]:
trained_models = regression_models(X_train, y_train)

[0] Linear Regression Training R-squared: 1.0
[1] K Neighbors Regressor Training R-squared: 0.9477852782813792
[2] SVR Linear Training R-squared: 0.9998066965785262
[3] SVR RBF Training R-squared: 0.8709468942962291
[4] Decision Tree Regressor Training R-squared: 1.0
[5] Random Forest Regressor Training R-squared: 0.9890423224921341


#Prediction using test dataset

In [125]:
from sklearn.metrics import r2_score
for i, model in enumerate(trained_models):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f'Model[{i}] R-squared: {r2:.4f}')
    print()

Model[0] R-squared: 0.9973

Model[1] R-squared: 0.8377

Model[2] R-squared: 0.9974

Model[3] R-squared: 0.7425

Model[4] R-squared: 0.9972

Model[5] R-squared: 0.9595



As per our prediction on test data  SVR Linear Training R-squared fits the best for our data