# Import required libraries for data processing

In [3]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')



# 2. Load the used bike data for analysis and display its shape and first few rows

In [4]:
df_train = pd.read_csv('../../../datasets/used_bike/Used_Bikes.csv')
print(df_train.shape)
df_train.head()


(32648, 8)


Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


# Clean brand names

In [6]:

df_train['brand'] = df_train['brand'].str.strip()


# # Brand Category Filtering


In [7]:
min_count = 50
brand_counts = df_train['brand'].value_counts()
valid_brands = brand_counts[brand_counts >= min_count].index
df_train = df_train[df_train['brand'].isin(valid_brands)]


In [8]:
df_train = df_train.drop(columns=[
    'brand_category_budget', 
    'brand_category_mid', 
    'brand_category_premium'
], errors='ignore')


# Feature engineering

In [9]:
# Step 1: Map brand → category
brand_category_map = {
    'Bajaj': 'budget', 'Hero': 'budget', 'Honda': 'budget', 'TVS': 'budget',
    'Yamaha': 'budget', 'Suzuki': 'budget', 'Mahindra': 'budget',
    'Royal Enfield': 'mid', 'KTM': 'mid', 'Jawa': 'mid', 'Hyosung': 'mid',
    'Harley-Davidson': 'premium', 'Kawasaki': 'premium', 'Benelli': 'premium',
    'Triumph': 'premium', 'Ducati': 'premium', 'BMW': 'premium'
}

df_train['brand_category'] = df_train['brand'].map(brand_category_map)

# Step 2: Drop rows with unknown brands (NaN in brand_category)
df_train = df_train.dropna(subset=['brand_category'])

# Step 3: One-hot encode the category
df_train = pd.get_dummies(df_train, columns=['brand_category'], drop_first=False)

# Optional: Convert from bool to int
bool_cols = ['brand_category_budget', 'brand_category_mid', 'brand_category_premium']
df_train[bool_cols] = df_train[bool_cols].astype(int)


# feature encoding

In [10]:
brand_category_label_map = {'budget': 0, 'mid': 1, 'premium': 2}
df_train['brand_category_encoded'] = df_train['brand'].map(brand_category_label_map)


In [11]:
print("Budget:", df_train['brand_category_budget'].sum())
print("Mid:", df_train['brand_category_mid'].sum())
print("Premium:", df_train['brand_category_premium'].sum())


Budget: 26371
Mid: 5319
Premium: 872


In [12]:
df_train['owner_grouped'] = df_train['owner'].replace({
    'Third Owner': 'Multiple Owners',
    'Fourth Owner Or More': 'Multiple Owners'
})

owner_map = {
    'First Owner': 0,
    'Second Owner': 1,
    'Multiple Owners': 2
}
df_train['owner_grouped_encoded'] = df_train['owner_grouped'].map(owner_map)
print(df_train['owner_grouped'].value_counts())

owner_grouped
First Owner        29890
Second Owner        2555
Multiple Owners      117
Name: count, dtype: int64


In [13]:
df_train.dtypes

bike_name                  object
price                     float64
city                       object
kms_driven                float64
owner                      object
age                       float64
power                     float64
brand                      object
brand_category_budget       int64
brand_category_mid          int64
brand_category_premium      int64
brand_category_encoded    float64
owner_grouped              object
owner_grouped_encoded       int64
dtype: object

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32562 entries, 0 to 32647
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   bike_name               32562 non-null  object 
 1   price                   32562 non-null  float64
 2   city                    32562 non-null  object 
 3   kms_driven              32562 non-null  float64
 4   owner                   32562 non-null  object 
 5   age                     32562 non-null  float64
 6   power                   32562 non-null  float64
 7   brand                   32562 non-null  object 
 8   brand_category_budget   32562 non-null  int64  
 9   brand_category_mid      32562 non-null  int64  
 10  brand_category_premium  32562 non-null  int64  
 11  brand_category_encoded  0 non-null      float64
 12  owner_grouped           32562 non-null  object 
 13  owner_grouped_encoded   32562 non-null  int64  
dtypes: float64(5), int64(4), object(5)
memory u

# Feature Selection

In [15]:
X= df_train[['age', 'kms_driven', 'power', 'brand_category_encoded', 'owner_grouped_encoded']]
y = df_train['price']


# Train-Test Split (Data Splitting)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=38)


# feature Scaling

In [None]:
# scaler = MinMaxScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_lsvm_scaled = scaler.fit_transform(X_train_lsvm)
X_test_lsvm_scaled = scaler.transform(X_test_lsvm)


In [17]:
total_bikes = df_train['brand'].value_counts().sum()
print("Total number of bikes:", total_bikes)


Total number of bikes: 32562


# modelBUilding


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# model training

In [21]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

models_tree = {
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, verbosity=0)
}

for name, model in models_tree.items():
    model.fit(X_train, y_train)
    print(f"{name} Training R²: {model.score(X_train, y_train):.4f}")
    print(f"{name} Test R²: {model.score(X_test, y_test):.4f}\n")


Decision Tree Training R²: 0.9980
Decision Tree Test R²: 0.9462

Random Forest Training R²: 0.9936
Random Forest Test R²: 0.9534

XGBoost Training R²: 0.9909
XGBoost Test R²: 0.9522



In [None]:
from sklearn.ensemble import RandomForestRegressor


model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)


## 💾 Model Saving / Exporting


In [None]:
import joblib
# Save new model
joblib.dump(model, 'model/usebike_model_fixed.pkl')  # save with different name just to be safe