In [2]:
import pandas as pd
df = pd.read_csv('electeric_clean.csv')
df.drop(columns='Unnamed: 0',inplace=True,axis=1)
df.head(1)

Unnamed: 0,Battery,Efficiency,Fast_charge,Range,Top_speed,acceleration..0.100.,price
0,75.0,172,670.0,435,217,5.0,64328.53


In [3]:
# check nullas and duplicates
print(df.isnull().sum())
print(df.duplicated().sum())

Battery                 0
Efficiency              0
Fast_charge             0
Range                   0
Top_speed               0
acceleration..0.100.    0
price                   0
dtype: int64
2


In [4]:
# drop duplicates
df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

0


In [5]:
# check size of the data
df.shape

(305, 7)

In [6]:
# Train Test split
from sklearn.model_selection import train_test_split
X = df.drop('price',axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Scale data -> minMax scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# test different models as LR, SVM, DT, KNN, RF, LightGBM -> CV = 5
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

# Assuming X_train and X_test are already defined and preprocessed
# and that the labels y_train are also defined

# Define regression models to test
models = {
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    "Decision Tree": DecisionTreeRegressor(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100),
    "LightGBM": lgb.LGBMRegressor(verbosity=-1)
}

# Cross-validation
cv_results_r2 = {}
cv_results_rmse = {}
for model_name, model in models.items():
    r2_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    rmse_scores = np.sqrt(-cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error'))
    cv_results_r2[model_name] = r2_scores
    cv_results_rmse[model_name] = rmse_scores
    print(f"{model_name} CV Mean R2: {np.mean(r2_scores):.4f} (+/- {np.std(r2_scores):.4f})")
    print(f"{model_name} CV Mean RMSE: {np.mean(rmse_scores):.4f} (+/- {np.std(rmse_scores):.4f})")

# If you want to train and evaluate models on the test set, use the code below:
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{model_name} Test R2: {r2:.4f}")
    print(f"{model_name} Test RMSE: {rmse:.4f}")


Linear Regression CV Mean R2: 0.6872 (+/- 0.0273)
Linear Regression CV Mean RMSE: 20865.0305 (+/- 3108.7422)
Support Vector Machine CV Mean R2: -0.1074 (+/- 0.0605)
Support Vector Machine CV Mean RMSE: 39179.6009 (+/- 4993.0506)
Decision Tree CV Mean R2: 0.7554 (+/- 0.0545)
Decision Tree CV Mean RMSE: 18987.5720 (+/- 2812.4465)
K-Nearest Neighbors CV Mean R2: 0.7787 (+/- 0.0934)
K-Nearest Neighbors CV Mean RMSE: 16940.1012 (+/- 3099.9117)
Random Forest CV Mean R2: 0.8532 (+/- 0.0549)
Random Forest CV Mean RMSE: 13587.9272 (+/- 2576.9961)
LightGBM CV Mean R2: 0.8011 (+/- 0.0792)
LightGBM CV Mean RMSE: 16099.1685 (+/- 2125.1721)
Linear Regression Test R2: 0.7592
Linear Regression Test RMSE: 17813.6671
Support Vector Machine Test R2: -0.2327
Support Vector Machine Test RMSE: 40303.5244
Decision Tree Test R2: 0.7726
Decision Tree Test RMSE: 17309.4564
K-Nearest Neighbors Test R2: 0.8516
K-Nearest Neighbors Test RMSE: 13986.1401
Random Forest Test R2: 0.9077
Random Forest Test RMSE: 11027.1

In [13]:
# best one is the RF -> Tune it
# Define the parameter grid to search
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Create a Random Forest regressor
rf = RandomForestRegressor()

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Perform the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_rf = grid_search.best_estimator_

# Evaluate the best model
y_pred = best_rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R2: {r2:.4f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test RMSE: 11349.8847
Test R2: 0.9022


In [16]:
X.iloc[0].values

array([ 75., 172., 670., 435., 217.,   5.])

In [22]:
from sklearn.preprocessing import MinMaxScaler
import joblib

# Assuming X_train is your entire dataset

# Apply MinMaxScaler to the entire dataset
scaler = MinMaxScaler()
Xscaled = scaler.fit_transform(X)

# Retrain your model with the best parameters
best_rf.fit(Xscaled, y)

# Save the trained model
joblib.dump(best_rf, 'electrical_random_forest_model.pkl')

# Save the scaler
joblib.dump(scaler, 'electrical_min_max_scaler.pkl')


['electrical_min_max_scaler.pkl']

In [23]:
import joblib
import numpy as np

def test_model(input_data):
    # Load the trained model
    rf_model = joblib.load('electrical_random_forest_model.pkl')
    scaler = joblib.load('electrical_min_max_scaler.pkl')

    # Reshape the input data to comply with MinMaxScaler
    input_data = np.array(input_data).reshape(1, -1)

    # Apply the scaler to the input data
    X_test_scaled = scaler.transform(input_data)

    # Make predictions using the model
    predictions = rf_model.predict(X_test_scaled)

    return predictions

# Assuming 'X' is your dataset, you can take the first row as an example
new_data = X.iloc[0].values
predictions = test_model(new_data)
print(predictions)


[67890.6064]




In [24]:
y.iloc[0]

64328.53