In [1]:
# -------------------------------------------------------
# STEP 1: IMPORT LIBRARIES
# -------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib


# -------------------------------------------------------
# CONFIGURATION
# -------------------------------------------------------
# Everything that was previously hard-coded inline lives here so the
# script can be pointed at another dataset / output location in one place.
DATA_PATH = "CAR DETAILS FROM CAR DEKHO.csv"
MODEL_PATH = "car_price_model.pkl"
ENCODER_PATH = "encoder.pkl"
FEATURES_PATH = "feature_names.pkl"

FEATURE_COLS = ['name', 'year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']
TARGET_COL = 'selling_price'

TEST_SIZE = 0.2
RANDOM_STATE = 42       # shared by the split and the model for reproducibility
JOBLIB_COMPRESS = 3     # compression level passed to joblib.dump


# -------------------------------------------------------
# STEP 2: LOAD DATA
# -------------------------------------------------------
df = pd.read_csv(DATA_PATH)


# -------------------------------------------------------
# STEP 3: SELECT FEATURES
# -------------------------------------------------------
# X holds the raw (un-encoded) predictor columns, y the regression target.
X = df[FEATURE_COLS]
y = df[TARGET_COL]


# -------------------------------------------------------
# STEP 4: TRAIN TEST SPLIT
# -------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)


# -------------------------------------------------------
# STEP 5: ONE-HOT ENCODING FOR CATEGORICAL COLUMNS
# -------------------------------------------------------
# Columns listed here are one-hot encoded; every other column of X is
# forwarded unchanged via remainder='passthrough'.
cat_cols = ['name', 'fuel', 'seller_type', 'transmission', 'owner']

encoder = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' keeps transform() from raising on
        # categories that were not present when the encoder was fitted.
        ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

# The encoder is fitted on the training split only and then re-used,
# as-is, on the test split.
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)


# -------------------------------------------------------
# STEP 6: LIGHTWEIGHT ML MODEL (Streamlit Friendly)
# -------------------------------------------------------
model = GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.07,
    max_depth=3,
    random_state=RANDOM_STATE
)

model.fit(X_train_encoded, y_train)


# -------------------------------------------------------
# STEP 7: MODEL EVALUATION
# -------------------------------------------------------
y_pred = model.predict(X_test_encoded)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5   # root of the MSE, i.e. error in the target's own units
r2 = r2_score(y_test, y_pred)

print("\nMODEL RESULTS:")
print("---------------------------")
print("RMSE :", rmse)
print("R² Score:", r2)


# -------------------------------------------------------
# STEP 8: REMOVE USER INPUT (Streamlit will handle it)
# -------------------------------------------------------
print("\nUser input removed — Streamlit UI will be used.")


# -------------------------------------------------------
# STEP 9: SAVE SMALL MODEL FILES
# -------------------------------------------------------
# Three separate artifacts are written: the fitted regressor, the fitted
# encoder, and the ordered list of raw input column names.
artifacts = {
    MODEL_PATH: model,
    ENCODER_PATH: encoder,
    FEATURES_PATH: list(X.columns),
}

for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path, compress=JOBLIB_COMPRESS)

print("\nSaved Files:")
for artifact_path in artifacts:
    print("✔", artifact_path)
print("\nYour model is ready for Streamlit Deployment 🚀")



MODEL RESULTS:
---------------------------
RMSE : 374392.7252854512
R² Score: 0.540682959192724

User input removed — Streamlit UI will be used.

Saved Files:
✔ car_price_model.pkl
✔ encoder.pkl
✔ feature_names.pkl

Your model is ready for Streamlit Deployment 🚀
