In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import re

In [14]:
# Load the used-car listings. The encoding is given explicitly as latin1
# (presumably because the file is not valid UTF-8 -- confirm against the data).
df = pd.read_csv(
    "used_car_price.csv",
    encoding="latin1",
)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [15]:
def extract_number(text):
    """Return the first decimal number found in ``text`` as a float.

    The input is passed through ``str()`` first, so non-string values
    (floats, None, NaN) are accepted as well.

    Parameters
    ----------
    text : any
        Raw cell value such as "19.67 kmpl", "998 CC" or "58.16 bhp".

    Returns
    -------
    float or None
        The first number in the text, or None when the text contains no
        digits at all (for example "null bhp" or "N.A.").
    """
    # Require at least one digit. The previous pattern r"[\d\.]+" also matched
    # bare dots and runs such as "1.2.3", which made float() raise ValueError
    # on inputs like "N.A." or "approx. 74 bhp".
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", str(text))
    if match is None:
        return None
    return float(match.group())

# Strip the unit suffixes: each of these columns is replaced by the number
# that extract_number pulls out of its text.
for unit_column in ('Mileage', 'Engine', 'Power'):
    df[unit_column] = df[unit_column].apply(extract_number)

In [16]:
# Display the frame after Mileage, Engine and Power were converted to numbers.
df

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,2010,72000,CNG,Manual,First,26.60,998.0,58.16,5.0,1.75
1,2015,41000,Diesel,Manual,First,19.67,1582.0,126.20,5.0,12.50
2,2011,46000,Petrol,Manual,First,18.20,1199.0,88.70,5.0,4.50
3,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.00
4,2013,40670,Diesel,Automatic,Second,15.20,1968.0,140.80,5.0,17.74
...,...,...,...,...,...,...,...,...,...,...
60,2021,16000,Diesel,Automatic,First,23.20,1493.0,113.00,7.0,13.90
61,2012,91000,Petrol,Manual,Third,15.80,1086.0,68.00,5.0,2.60
62,2013,79000,Diesel,Manual,Second,20.70,1248.0,88.00,5.0,4.70
63,2014,67000,Petrol,Manual,Second,17.60,1198.0,82.00,5.0,4.10


In [17]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

# (rows, columns) remaining after the drop.
df.shape

(64, 10)

In [18]:
# Target is the Price column; every other column is a candidate feature.
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Column groups handed to the ColumnTransformer. Any column not listed here is
# dropped, because the transformer's `remainder` is left at its default ('drop').
categorical_features = ['Fuel_Type', 'Transmission', 'Owner_Type']
numerical_features = ['Year', 'Kilometers_Driven', 'Seats','Mileage','Engine','Power']

# Preprocessing: one-hot encode the categoricals (categories unseen during fit
# are ignored at predict time instead of raising) and standardise the numerics.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Pipeline: preprocessing followed by the regressor, so the same transforms are
# applied automatically at predict time.
# random_state is fixed so that the fitted forest -- and therefore the reported
# metrics and the pickled model -- are reproducible across kernel restarts.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train on the training split only; the last expression displays the fitted pipeline.
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Score the held-out rows with the fitted pipeline.
y_pred = model_pipeline.predict(X_test)

# Mean squared error, its square root, and the coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"R2 Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")

R2 Score: 0.8613
MSE: 1.1431
RMSE: 1.0691


In [21]:
def predict_car_price(model_pipeline, year, kilometers_driven, fuel_type, transmission, owner_type, seats, Mileage, Engine, Power):
    """Predict the price of one car with the given pipeline.

    The scalar arguments are packed into a one-row DataFrame with the columns
    'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type',
    'Seats', 'Mileage', 'Engine' and 'Power', which is then passed to
    ``model_pipeline.predict``.

    Parameters
    ----------
    model_pipeline : object
        Anything exposing ``predict(DataFrame)`` that returns an indexable result.
    year, kilometers_driven, fuel_type, transmission, owner_type, seats,
    Mileage, Engine, Power :
        Values for the correspondingly named columns of the input row.

    Returns
    -------
    The first element of the pipeline's prediction.
    """
    feature_values = {
        'Year': year,
        'Kilometers_Driven': kilometers_driven,
        'Fuel_Type': fuel_type,
        'Transmission': transmission,
        'Owner_Type': owner_type,
        'Seats': seats,
        'Mileage': Mileage,
        'Engine': Engine,
        'Power': Power,
    }

    # Wrap every scalar in a single-item list so the frame has exactly one row.
    single_row = pd.DataFrame(
        {column: [value] for column, value in feature_values.items()}
    )

    return model_pipeline.predict(single_row)[0]

In [22]:
# Example call: these values match the first row shown in the dataset preview.
predicted_price = predict_car_price(
    model_pipeline,
    year=2010,
    kilometers_driven=72000,
    fuel_type='CNG',
    transmission='Manual',
    owner_type='First',
    seats=5.0,
    Mileage=26.6,  # km/kg
    Engine=998,    # CC
    Power=58.16,   # bhp
)

print(f"Predicted Price: {predicted_price:.2f} Lakh")

Predicted Price: 3.04 Lakh


In [23]:
import pickle 

# Persist the fitted pipeline to disk. The context manager closes the file
# handle (flushing the bytes) even if pickling raises; the previous bare
# open(...) left the handle open.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
with open("pipeline.pkl", 'wb') as pipeline_file:
    pickle.dump(model_pipeline, pipeline_file)