In [7]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# 1. Load & Combine
# The data directory is defined once. It defaults to the original local
# checkout; set the CAR_PRICE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "CAR_PRICE_DATA_DIR",
        r"C:\Users\samar\car_price\old-_car_price_regression\data",
    )
)
train_raw = pd.read_csv(DATA_DIR / "train.csv")
test_raw = pd.read_csv(DATA_DIR / "test.csv")

# Tag every row with its origin so the stacked frame can be split again later
train_raw['is_train'] = 1
test_raw['is_train'] = 0

# Stack train on top of test; ignore_index=True gives the result a fresh
# 0..n-1 index. A column present in only one input (e.g. 'price', if
# test.csv does not carry it) is filled with NaN for the other input's rows.
df_all = pd.concat([train_raw, test_raw], axis=0, ignore_index=True)

# 2. Preprocess Function
def preprocess_data(df):
    """Build numeric features from the raw text columns and drop unused ones.

    The input frame is copied first, so the caller's frame is not modified.

    Columns added:
        hp             -- number directly before "HP" in ``engine``
        liters         -- number directly before "L" in ``engine``
        cylinders      -- integer followed by whitespace and "Cylinder" in ``engine``
        trans_speed    -- integer directly before "-Speed" in ``transmission``
        accident_clean -- 1 if the ``accident`` text starts with "At least", else 0

    Rows where a pattern is not found are imputed: ``hp`` and ``liters`` with
    the median of that column over the frame passed in, ``cylinders`` with 6
    and ``trans_speed`` with 0.

    Parameters:
        df: DataFrame containing at least ``engine``, ``transmission`` and
            ``accident`` columns.

    Returns:
        A new DataFrame with the derived columns appended and any of
        id / clean_title / model / engine / transmission / ext_col / int_col /
        accident removed when present.
    """
    frame = df.copy()

    def first_number(value, regex, cast):
        # Search the stringified cell; None marks "pattern not found"
        found = re.search(regex, str(value))
        if found is None:
            return None
        return cast(found.group(1))

    # (new column, source column, pattern, converter)
    extraction_plan = (
        ('hp', 'engine', r'(\d+\.?\d*)HP', float),
        ('liters', 'engine', r'(\d+\.?\d*)L', float),
        ('cylinders', 'engine', r'(\d+)\s+Cylinder', int),
        ('trans_speed', 'transmission', r'(\d+)-Speed', int),
    )
    for target, source, regex, cast in extraction_plan:
        frame[target] = frame[source].apply(first_number, args=(regex, cast))

    # Medians are taken over whatever frame was passed in; the other two
    # columns fall back to fixed constants
    fill_values = {
        'hp': frame['hp'].median(),
        'liters': frame['liters'].median(),
        'cylinders': 6,
        'trans_speed': 0,
    }
    for column, replacement in fill_values.items():
        frame[column] = frame[column].fillna(replacement)

    # Binary accident flag
    frame['accident_clean'] = frame['accident'].apply(
        lambda entry: int(str(entry).startswith('At least'))
    )

    # Remove raw/unused columns, skipping any that are absent
    unused = ('id', 'clean_title', 'model', 'engine', 'transmission',
              'ext_col', 'int_col', 'accident')
    present = [name for name in unused if name in frame.columns]
    return frame.drop(columns=present)

# 3. Clean the stacked train+test frame in a single pass
df_all_clean = preprocess_data(df_all)

# 4. One-hot encode the categorical columns on the combined frame, so the
#    train and test portions end up with the same dummy columns
df_all_encoded = pd.get_dummies(
    df_all_clean,
    columns=['brand', 'fuel_type'],
    drop_first=True,
)

# 5. Separate the two portions again using the is_train marker;
#    the test portion also loses its 'price' column
train_final = df_all_encoded.loc[df_all_encoded['is_train'] == 1].drop(
    columns=['is_train']
)
test_final = df_all_encoded.loc[df_all_encoded['is_train'] == 0].drop(
    columns=['is_train', 'price']
)

# 6. Fit the regressor on every training row (no hold-out split is made here)
y = train_final['price']
X = train_final.drop(columns=['price'])

model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
)
model.fit(X, y)
print("Model Trained Successfully!")

# 7. Score the test portion and show the first few predictions
preds = model.predict(test_final)
print("Predictions generated:", preds[:5])

Model Trained Successfully!
Predictions generated: [17209.592 84786.36  60590.65  27198.686 30323.992]


In [6]:
# Show the (rows, columns) dimensions of the training feature matrix X
X.shape

(314223, 70)

In [8]:
import pandas as pd
import pickle

# 1. Load the saved model bundle: a dict holding the fitted model and the
#    column names it expects.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# open a package file that you created yourself or otherwise trust.
print("Loading model...")
with open('car_price_model_package.pkl', 'rb') as pkl_file:
    package = pickle.load(pkl_file)

model, feature_names = package['model'], package['feature_names']

# 2. Take the test features from `test_final`, which must already exist in
#    this kernel session. Selecting by `feature_names` gives the model the
#    same columns, in the same order, as recorded in the package.
X_test_submission = test_final[feature_names]

# 3. Score the test rows
print("Predicting...")
predictions = model.predict(X_test_submission)

# 4. Pair each original test id with its predicted price
submission = test_raw[['id']].assign(price=predictions)

# 5. Write the submission file (without the index) and preview it
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print(f"Saved submission to {output_path}")
print(submission.head())


Loading model...
Predicting...
Saved submission to submission.csv
       id         price
0  188533  17179.255859
1  188534  94915.609375
2  188535  57254.710938
3  188536  28639.097656
4  188537  30208.062500
