In [4]:
import os
from pathlib import Path

import joblib   # for saving model
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the cleaned dataset.
# The location defaults to the original project path, but it can be overridden
# with the EV_DATA_PATH environment variable so the notebook is not tied to a
# single machine's directory layout.
DATA_PATH = Path(os.environ.get("EV_DATA_PATH", r"C:\EV-Project\data\cars_data_cleaned.csv"))
df = pd.read_csv(DATA_PATH)
print("✅ Dataset loaded successfully! Shape:", df.shape)
# Last expression of the cell: rich preview of the first rows.
df.head()


✅ Dataset loaded successfully! Shape: (353, 15)


Unnamed: 0,row_id,title,model,battery,price_range,zero_to_hundred,top_speed,fastcharge,germany_price_before_incentives,netherlands_price_before_incentives,uk_price_after_incentives,drive_configuration,tow_hitch,towing_capacity_in_kg,number_of_seats
0,1,Tesla,Model 3,57.5,104,6.1,201,690 km/h,"€41,970","€41,990","£39,990",Rear Wheel Drive,Towbar possible,1000.0,5
1,2,Tesla,Model Y,57.5,137,6.9,217,580 km/h,"€45,970","€45,990","£44,990",Rear Wheel Drive,Towbar possible,1600.0,5
2,3,Tesla,Model Y Long Range Dual Motor,75.0,130,5.0,217,670 km/h,"€55,970","€52,990","£52,990",All Wheel Drive,Towbar possible,1600.0,5
3,4,Tesla,Model 3 Long Range Dual Motor,75.0,106,4.4,201,770 km/h,"€50,970","€49,990","£49,990",All Wheel Drive,Towbar possible,1000.0,5
4,5,Tesla,Model Y Long Range RWD,75.0,109,5.9,217,710 km/h,"€49,970","€49,990","£31,995",Rear Wheel Drive,Towbar possible,1600.0,5


In [5]:
print("Columns:\n", df.columns.tolist())

# Drop unused columns if present.
# errors="ignore" skips names that are already gone, so re-running this cell
# is harmless.
drop_cols = ['row_id', 'title', 'model', 'fastcharge',
             'germany_price_before_incentives',
             'netherlands_price_before_incentives',
             'uk_price_after_incentives']
df = df.drop(columns=drop_cols, errors="ignore")

# Select features & target
features = ['battery', 'top_speed', 'zero_to_hundred',
            'towing_capacity_in_kg', 'number_of_seats']
target = 'price_range'

# Drop NaNs on features AND target together: a row with a missing target
# would otherwise stay in y and make model fitting fail later on.
model_df = df[features + [target]].dropna()

X = model_df[features]
y = model_df[target]

# X and y come from the same filtered frame, so they must stay row-aligned.
assert X.index.equals(y.index)

print("✅ Features ready:", X.shape)


Columns:
 ['row_id', 'title', 'model', 'battery', 'price_range', 'zero_to_hundred', 'top_speed', 'fastcharge', 'germany_price_before_incentives', 'netherlands_price_before_incentives', 'uk_price_after_incentives', 'drive_configuration', 'tow_hitch', 'towing_capacity_in_kg', 'number_of_seats']
✅ Features ready: (353, 5)


In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print("Train size:", X_train.shape, " | Test size:", X_test.shape)


Train size: (282, 5)  | Test size: (71, 5)


In [7]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place, so the object passed in is trained
    when this function returns.

    Parameters
    ----------
    name : str
        Label used in the printed summary and in the returned record.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Features and target used for fitting.
    X_test, y_test
        Features and target used for scoring.

    Returns
    -------
    dict
        ``{"Model": name, "R2": ..., "MAE": ..., "MSE": ...}`` computed on
        the test split.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    # One-line human-readable summary; the same numbers are returned below.
    print(f"🔹 {name} → R²: {r2:.3f} | MAE: {mae:.2f} | MSE: {mse:.2f}")
    return {"Model": name, "R2": r2, "MAE": mae, "MSE": mse}


In [8]:
# Candidate regressors. They keep their own names because later cells refer
# to lr / dt / rf directly.
lr = LinearRegression()
dt = DecisionTreeRegressor(max_depth=6, random_state=42)
rf = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42)

# Fit and score every candidate in a fixed order, collecting one metrics
# record per model.
results = [
    evaluate_model(label, estimator, X_train, X_test, y_train, y_test)
    for label, estimator in (
        ("Linear Regression", lr),
        ("Decision Tree", dt),
        ("Random Forest", rf),
    )
]


🔹 Linear Regression → R²: 0.263 | MAE: 48.18 | MSE: 7737.37
🔹 Decision Tree → R²: 0.538 | MAE: 35.63 | MSE: 4856.47
🔹 Random Forest → R²: 0.459 | MAE: 34.95 | MSE: 5686.60


In [9]:
# Tabulate the per-model metric records and display them best-R² first.
results_df = pd.DataFrame(data=results)
results_df.sort_values("R2", ascending=False)


Unnamed: 0,Model,R2,MAE,MSE
1,Decision Tree,0.537626,35.63487,4856.473016
2,Random Forest,0.458591,34.950313,5686.60457
0,Linear Regression,0.263342,48.182182,7737.369596


In [16]:
# Name of the candidate with the highest test-split R².
best_model = max(results, key=lambda x: x["R2"])["Model"]

# Explicit name -> estimator lookup. An unexpected name raises KeyError
# instead of silently falling through to the random forest.
fitted_candidates = {
    "Linear Regression": lr,
    "Decision Tree": dt,
    "Random Forest": rf,
}
final_model = fitted_candidates[best_model]

# Retrain on full data.
# final_model is the same object as the chosen candidate, so this refit also
# changes lr / dt / rf; the metrics in `results` describe the earlier
# train-split fit.
final_model.fit(X, y)

# Save to file
joblib.dump(final_model, "model.pkl")
print(f"✅ Best model ({best_model}) saved successfully as 'model.pkl'")


✅ Best model (Decision Tree) saved successfully as 'model.pkl'


In [17]:
# Smoke-test the saved estimator on one row.
# The row is drawn from X, the same data final_model was refit on, so this
# only shows that prediction works; it is not an accuracy check.
sample = X.sample(1, random_state=1)
print("Sample Input:\n", sample)
pred_price = final_model.predict(sample)
print("\n💰 Predicted Price Range:", round(pred_price[0], 2))


Sample Input:
      battery  top_speed  zero_to_hundred  towing_capacity_in_kg  \
150    106.0        200              5.6                 1800.0   

     number_of_seats  
150                5  

💰 Predicted Price Range: 152.89


In [None]:
## ✅ Week 2 Summary
# - Loaded cleaned dataset from Week 1  
# - Trained and compared 3 models: Linear Regression, Decision Tree, Random Forest  
# - Evaluated models with R², MAE, MSE  
# - Saved best model as `model.pkl` for Streamlit app (Week 3)  
