In [None]:
# Car Price Regression
This notebook tunes an XGBoost regression model to predict car prices from the listings in `civic-x.csv`, then uses the tuned model to price a sample car.

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Cities kept as their own category when the 'city' column is bucketed;
# every other value is replaced with 'Other' during preprocessing.
# Entries are spelled in Title Case so they compare equal to values that
# have been normalised with str.strip().str.title().
known_cities = [
    # Listed first, ahead of the alphabetical run
    "Lahore", "Karachi", "Islamabad", "Rawalpindi", "Peshawar", "Quetta",
    # A
    "Abbottabad", "Ahmed Pur East", "Attock",
    # B
    "Bahawalpur", "Bannu", "Bhakkar", "Bhalwal", "Bhimber", "Burewala",
    # C
    "Chakwal", "Charsadda", "Chawinda", "Chichawatni", "Chishtian", "Chowk Azam",
    # D
    "D.G.Khan", "Dadyal Ak", "Dera Ismail Khan", "Dunia Pur",
    # F
    "Faisalabad", "Fateh Jang",
    # G
    "Gojra", "Gujar Khan", "Gujranwala", "Gujrat",
    # H
    "Hafizabad", "Haroonabad", "Hassan Abdal", "Hyderabad",
    # J
    "Jalalpur Pirwala", "Jamshoro", "Jaranwala", "Jhang", "Jhelum",
    # K
    "Khanewal", "Khanpur", "Kharian", "Kohat", "Kotla Arab Ali Khan", "Kotli Ak",
    # L
    "Layyah", "Liaqat Pur",
    # M
    "Mandi Bahauddin", "Mansehra", "Mardan", "Mian Channu", "Mian Wali",
    "Mirpur A.K.", "Multan", "Muzaffarabad",
    # N
    "Nankana Sahib", "Narowal", "Nowshera",
    # O
    "Okara",
    # P
    "Phalia", "Pind Dadan Khan", "Pir Mahal",
    # Q
    "Qila Deedar Singh",
    # R
    "Rahim Yar Khan", "Raiwind",
    # S
    "Sadiqabad", "Sahiwal", "Sara-E-Alamgir", "Sargodha", "Sheikhupura",
    "Shorkot", "Sialkot", "Sukkur", "Swabi",
    # T
    "Takhtbai", "Talagang", "Taunsa Sharif", "Toba Tek Singh",
    # V
    "Vehari",
    # W
    "Wah Cantt", "Wazirabad",
]

# Load the data
df = pd.read_csv('civic-x.csv')
df.head()

# Preprocessing
# NOTE(review): converting 'transmission' / 'trim' to category dtype used to be
# sketched here as commented-out code; 'trim' is one-hot encoded below instead.

# Normalise city spelling (trim whitespace, Title Case) so that values can be
# compared against the Title-Case entries of known_cities.
city_clean = df['city'].str.strip().str.title()

# Keep recognised cities as-is; everything else (including missing values)
# collapses into a single 'Other' bucket.
df['city'] = city_clean.where(city_clean.isin(known_cities), 'Other')

# One-hot encode both categorical columns in a single pass. drop_first=True
# removes the first level of each column, so that level is represented by
# all of the column's dummies being 0.
df = pd.get_dummies(df, columns=['trim', 'city'], drop_first=True)

# Target is the price column; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Hyperparameter search space explored by the grid search
param_grid = dict(
    n_estimators=[50, 100, 200, 250],
    max_depth=[1, 2, 3],
    learning_rate=[0.02, 0.03, 0.04, 0.05],
)

# Exhaustive 3-fold cross-validated search, scored by (negated) mean absolute error
xgb = XGBRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best configuration found by the search, used for all predictions below
xgb_best = grid_search.best_estimator_

# Score the tuned model on the held-out rows
y_pred_xgb = xgb_best.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Report hold-out metrics and the winning hyperparameters
print(f"XGBoost (tuned) MAE: {mae_xgb:.0f}")
print(f"XGBoost (tuned) R^2: {r2_xgb:.3f}")
print("Best parameters:", grid_search.best_params_)

# Predict price for a user-input sample car


In [None]:
# Example input: change these values as needed.
# Keys must match the model's feature columns (X.columns) exactly, including
# letter case. Any feature column not listed here is treated as 0.
sample = {
    'modelYear': 2019,
    'mileage': 70000,
    #'transmission_Automatic': 1,  # 1 if Automatic, 0 if Manual
    'trim_oriel': 1,              # 1 if Oriel, 0 if Standard
    'city_Islamabad': 1             # Set 1 for the desired city, 0 for others
}

# Keys that are not feature columns have no effect on the prediction. Report
# them instead of dropping them silently, so a typo or wrong case is noticed.
# This is a notice rather than an error because the features were encoded with
# drop_first=True: the first level of each encoded column has no dummy column,
# so a key naming that level is legitimately absent (all-zero dummies encode it).
unknown_keys = [key for key in sample if key not in X.columns]
if unknown_keys:
    print(f"Warning: sample keys not among the model features (ignored): {unknown_keys}")

# Build a one-row frame in the exact training column order; feature columns
# missing from the sample are filled with 0 and unmatched keys are dropped.
sample_df = pd.DataFrame([sample]).reindex(columns=X.columns, fill_value=0)

predicted_price_xgb = xgb_best.predict(sample_df)[0]
print(f"Predicted price (XGBoost tuned): {predicted_price_xgb:,.0f}")