In [2]:
# train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import pickle

# Load dataset
data = pd.read_csv("cardekho.csv")

# Drop every row that has a missing value in ANY column (not only the
# feature columns selected below).
data = data.dropna()

# Encode categorical columns.
# Each column gets its own LabelEncoder, kept in `encoders`. Re-fitting one
# shared encoder would discard the fuel/seller_type/transmission mappings as
# soon as the next column is fitted, leaving no way to encode new raw inputs
# (or decode values) consistently with what the model was trained on.
# NOTE: `data` is overwritten in place with integer codes from here on.
encoders = {}
for col in ['fuel', 'seller_type', 'transmission', 'owner']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le

# Feature selection
X = data[['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner', 'engine', 'seats']]
y = data['selling_price']

# Train-test split (80/20, seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model (seeded for reproducible trees)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model on the held-out 20%
y_pred = model.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))

# Save model with pickle
with open("car_price_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save the fitted per-column encoders next to the model so inference code can
# turn raw category strings into the same integer codes used for training.
# The model file itself is unchanged, so existing loaders keep working.
# (Only unpickle these files from a trusted source.)
with open("car_price_encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

print("✅ Model trained and saved successfully as car_price_model.pkl")

# Displays the encoded integer codes for `fuel`, not the original labels;
# the label behind each code is available via encoders['fuel'].classes_.
data['fuel'].unique()


R2 Score: 0.9596530255572867
MAE: 86921.62588251987
✅ Model trained and saved successfully as car_price_model.pkl


array([1, 3, 2, 0])

In [3]:
# Reload the raw CSV: the training cell replaced the categorical columns of
# `data` with integer codes and dropped rows containing missing values, so a
# fresh read is needed to inspect the original values.
data = pd.read_csv('cardekho.csv')

In [4]:
# Distinct fuel categories, in order of first appearance in the frame.
pd.unique(data['fuel'])

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [13]:
# Sanity check: for each distinct seat count, flag whether it exceeds 50.
pd.unique(data['seats']) > 50

array([False, False, False, False, False, False, False, False, False,
       False])