In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset from disk.
file_path = 'sub_dataset.csv'
df = pd.read_csv(file_path)

# Feature matrix and regression target.
feature_columns = ['Model', 'Mileage', 'Manufacturer', 'Stolen', 'Model Year']
X = df[feature_columns]
y = df['Price']

# Hold out a test split; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Columns grouped by how the preprocessor treats them.
categorical = ['Model', 'Manufacturer', 'Stolen']
numerical = ['Mileage', 'Model Year']

# One-hot encode the categorical columns (handle_unknown='ignore' so categories
# not seen during fit do not raise at predict time) and pass the numerical
# columns through untouched.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('cat', one_hot, categorical),
    ('num', 'passthrough', numerical),
])

# Preprocessing followed by a random forest regressor with a fixed seed.
forest = RandomForestRegressor(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Fit on the training split, then persist the fitted pipeline to disk.
model.fit(X_train, y_train)
joblib.dump(model, 'car_price_model.pkl')


In [None]:
# Check whether a value from the 'Price' column is numeric or convertible to numeric.
def is_numeric(value):
    """Return True if ``value`` can be converted with ``float()``, otherwise False.

    Args:
        value: Any object, e.g. a single cell of a DataFrame column.

    Returns:
        bool: True when ``float(value)`` succeeds. Note that NaN and the
        strings ``"nan"`` / ``"inf"`` convert successfully and therefore
        count as numeric.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: a string that does not parse as a number.
        # TypeError: an object float() does not accept at all (None, list, ...);
        # without catching it a single such cell would abort the whole check.
        return False
    return True
    
# Show the rows whose 'Price' value is not numeric.
price_is_numeric = df['Price'].apply(is_numeric)
non_numeric_prices = df[~price_is_numeric]
if non_numeric_prices.empty:
    print("All values in 'Price' column are numeric or convertible to numeric.")
else:
    print("Non-numeric values found in 'Price' column:")
    print(non_numeric_prices)

In [None]:
# Inspect the distinct values and the dtype of the 'Mileage' feature column.
mileage_column = X['Mileage']
print(mileage_column.unique())
print(mileage_column.dtype)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the fitted pipeline on the held-out test split.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE rather than via squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
# passing it raises a TypeError. The square root works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


In [None]:
import matplotlib.pyplot as plt

# Scatter of actual vs. predicted prices for the test split, drawn through the
# Axes object that pyplot would otherwise target implicitly.
ax = plt.gca()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs. Predicted Car Prices")
ax.grid(True)
plt.show()

In [None]:
# Check that all values in column 'Price' are numeric or convertible to numeric.
# This reuses the is_numeric helper defined in an earlier cell instead of
# redefining it here, so the notebook has a single implementation and a later
# copy cannot silently shadow it.

# Show the rows whose 'Price' value is not numeric.
non_numeric_prices = df[~df['Price'].apply(is_numeric)]
if not non_numeric_prices.empty:
    print("Non-numeric values found in 'Price' column:")
    print(non_numeric_prices)
else:
    print("All values in 'Price' column are numeric or convertible to numeric.")

In [None]:
# Largest relative deviation that still counts as a hit (0.20 == 20%).
tolerance = 0.20

# Relative error of each test-set prediction, measured against the actual price.
absolute_errors = abs(y_pred - y_test)
relative_errors = absolute_errors / y_test

# Fraction of predictions whose relative error does not exceed the tolerance.
within_tolerance = relative_errors <= tolerance
accuracy_within_20pct = within_tolerance.mean()

print(f"Prediction accuracy within ±20%: {accuracy_within_20pct * 100:.2f}%")


In [None]:
# Describe the car to price: one entry per feature column used by the pipeline.
car = {
    'Model': 'Outlander',
    'Mileage': 80000,         # adjust mileage as needed
    'Manufacturer': 'Mitsubishi',
    'Stolen': 'No',
    'Model Year': 2015,
}

# Create a DataFrame with one row for the input car.
input_data = pd.DataFrame([car])

# Use the trained pipeline (named 'model').
predicted_price = model.predict(input_data)

# The message is built from the same values that were fed to the model, so it
# cannot drift from the input (it previously said "50,000 mileage" while the
# input mileage was 80000).
print(
    f"Predicted price for {car['Model Year']} {car['Manufacturer']} {car['Model']} "
    f"with {car['Mileage']:,} mileage: ${predicted_price[0]:.2f}"
)