In [1]:
# train_model.py
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# ---------------------- DATA CLEANING ---------------------- #
# Load the raw listings and coerce the columns used for training into clean,
# typed values. Every filter that is followed by a column assignment ends in
# .copy() so the assignment writes to an independent frame instead of a slice
# of the previous one (avoids pandas' SettingWithCopyWarning).
data = pd.read_csv('quikr_car.csv')

# year: keep only rows whose value is purely numeric text. Casting to str
# first makes missing values fail the check and get dropped, rather than
# putting NaN into the boolean mask (which pandas refuses to index with).
data = data[data['year'].astype(str).str.isnumeric()].copy()
data['year'] = data['year'].astype(int)

# Price: drop the 'Ask For Price' placeholder, then strip comma separators
# (literal replacement, not a regex) before converting to int.
data = data[data['Price'] != 'Ask For Price'].copy()
data['Price'] = data['Price'].str.replace(',', '', regex=False).astype(int)

# kms_driven: keep the first space-separated token with commas removed
# (presumably values look like "45,000 kms" - confirm against the CSV), then
# drop every row where that token is not numeric. The str cast handles
# missing values the same way as for 'year' above.
data['kms_driven'] = (
    data['kms_driven']
    .astype(str)
    .str.split(' ')
    .str.get(0)
    .str.replace(',', '', regex=False)
)
data = data[data['kms_driven'].str.isnumeric()].copy()
data['kms_driven'] = data['kms_driven'].astype(int)

# fuel_type: rows without a fuel type are discarded.
data = data[data['fuel_type'].notna()].copy()

# name: reduce to its first three space-separated words.
data['name'] = data['name'].str.split(' ').str.slice(0, 3).str.join(' ')

# Keep only listings priced below 6,000,000 and renumber the rows from 0.
data = data[data['Price'] < 6e6].reset_index(drop=True)

# ---------------------- FEATURES ---------------------- #
# 'Price' is the regression target; every other column is a model input.
y = data.loc[:, 'Price']
X = data.drop(labels='Price', axis=1)

# ---------------------- MODEL TRAINING ---------------------- #
CATEGORICAL_COLUMNS = ['name', 'company', 'fuel_type']

# Learn the category vocabulary from the full dataset so that a category that
# only appears in a held-out split is still known to the encoder at predict
# time (OneHotEncoder rejects unknown categories by default).
ohe = OneHotEncoder()
ohe.fit(X[CATEGORICAL_COLUMNS])

# R^2 can be negative, so start below every possible score: the best split is
# then always recorded and best_model cannot silently stay None.
best_score = float('-inf')
best_model = None

# Try 1000 different 90/10 splits and keep the pipeline with the highest R^2
# on its own held-out part.
# NOTE(review): the winner is chosen by its test-split score, so the printed
# R^2 is an optimistic estimate of performance on unseen data. Evaluate on
# data kept out of this search if an unbiased figure is needed.
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i)

    # Build a fresh preprocessing step for every split. Reusing one shared
    # transformer instance would let later iterations refit the preprocessing
    # step that lives inside the already-selected best pipeline.
    column_trans = make_column_transformer(
        (OneHotEncoder(categories=ohe.categories_), CATEGORICAL_COLUMNS),
        remainder='passthrough',  # numeric columns go through unchanged
    )
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    score = r2_score(y_test, y_pred)
    if score > best_score:
        best_score = score
        best_model = pipe

print(f"Best R² score: {best_score}")

# ---------------------- SAVE MODEL ---------------------- #
# Refuse to persist anything if the search never selected a pipeline:
# best_model keeps its initial None when no split beat the starting score,
# and pickling that None would be reported as a successful save.
if best_model is None:
    raise RuntimeError("No model was selected during training; nothing to save.")

# Save OneHotEncoder categories
with open('encoder_categories.pkl', 'wb') as f:
    pickle.dump(ohe.categories_, f)

# Save trained pipeline
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Model and encoder categories saved successfully.")


Best R² score: 0.8991138463319752
Model and encoder categories saved successfully.
