In [285]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [286]:
# Read the training table from disk; later cells use its `price` column as the target.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [287]:
# Read the table that predictions are generated for at the end of the notebook.
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [288]:
# Count missing values per training column, largest count first.
(
    data
    .isna()
    .sum()
    .sort_values(ascending=False)
)

clean_title     21419
fuel_type        5083
accident         2452
id                  0
brand               0
model               0
model_year          0
milage              0
engine              0
transmission        0
ext_col             0
int_col             0
price               0
dtype: int64

In [289]:
# Numeric columns of the raw training table (includes the target, `price`).
# NOTE(review): no later visible cell reads this list; the pipeline uses
# `numeric_cols` instead — confirm whether this is still needed.
numerical_features = [
    'milage',
    'model_year',
    'price',
]

In [290]:
# Categorical columns of the raw training table.
# NOTE(review): no later visible cell reads this list; the pipeline uses
# `categoric_cols`, which additionally contains 'ext_col' — confirm intent.
categorical_cols = [
    'brand',
    'model',
    'fuel_type',
    'transmission',
    'int_col',
    'accident',
    'clean_title',
]

In [291]:
import re

# Matches a number (optionally with a decimal part) immediately followed by "HP",
# e.g. the "172.0" in "172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel".
_HORSEPOWER_PATTERN = re.compile(r'(\d+\.?\d*)HP')


def extract_horsepower(engine_str):
    """Pull the horsepower figure out of a free-text engine description.

    Parameters
    ----------
    engine_str : str or any
        Engine description such as ``"252.0HP 3.9L 8 Cylinder Engine"``.
        Non-string values (``None``, ``NaN`` from a missing cell, numbers)
        are accepted and treated as "no horsepower available".

    Returns
    -------
    float or None
        The number preceding the first ``HP`` token, or ``None`` when the
        input is not a string or contains no ``<number>HP`` token.
    """
    # A missing cell arrives as float NaN when applied over a pandas column;
    # re.search would raise TypeError on it, so bail out early instead.
    if not isinstance(engine_str, str):
        return None

    match = _HORSEPOWER_PATTERN.search(engine_str)
    if match is None:
        return None
    return float(match.group(1))

In [292]:
# Derive a `horsepower` column for the training rows by parsing each `engine` string.
data['horsepower'] = data['engine'].apply(func=extract_horsepower)

In [293]:
# Derive `horsepower` for the test rows from the test table's OWN `engine` column.
# Parsing the training table's column here would copy training horsepower values
# into the test rows (aligned by index), so predictions would use the wrong cars.
test_data['horsepower'] = test_data['engine'].apply(extract_horsepower)

In [294]:
# Spot-check the parsed values next to the raw engine text (first five rows).
data.loc[:, ['engine', 'horsepower']].head(n=5)

Unnamed: 0,engine,horsepower
0,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,172.0
1,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,252.0
2,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,320.0
3,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,420.0
4,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,208.0


In [295]:
# Replace missing categorical values in the training table with fixed defaults.
# Assign the result back rather than calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form acts on a temporary object under
# pandas Copy-on-Write and leaves the frame unchanged.
# NOTE(review): the defaults are presumably the most common / "no issue" values — confirm.
data['fuel_type'] = data['fuel_type'].fillna('Gasoline')
data['accident'] = data['accident'].fillna('None reported')
data['clean_title'] = data['clean_title'].fillna('Yes')

In [296]:
# Impute training rows whose engine text had no "<number>HP" token with the
# median horsepower of the training table. Assigned back explicitly because a
# chained `fillna(..., inplace=True)` on a column selection does not update the
# frame under pandas Copy-on-Write.
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].median())

In [297]:
# Replace missing categorical values in the test table with the same fixed
# defaults used for the training table. Assigned back explicitly because a
# chained `fillna(..., inplace=True)` on a column selection does not update the
# frame under pandas Copy-on-Write.
test_data['fuel_type'] = test_data['fuel_type'].fillna('Gasoline')
test_data['accident'] = test_data['accident'].fillna('None reported')
test_data['clean_title'] = test_data['clean_title'].fillna('Yes')

In [298]:
# Impute missing test horsepower with the TRAINING median, so the test table is
# filled with the same statistic the model saw during fitting. Assigned back
# explicitly because a chained `fillna(..., inplace=True)` on a column selection
# does not update the frame under pandas Copy-on-Write.
test_data['horsepower'] = test_data['horsepower'].fillna(data['horsepower'].median())

In [299]:
# Re-count missing values per training column after imputation.
(
    data
    .isna()
    .sum()
)

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
horsepower      0
dtype: int64

In [300]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder
from sklearn.model_selection import GridSearchCV
# Target vector: the sale price.
y = data['price']
# Feature frame: everything except the target, the row identifier, and the raw
# engine text (its horsepower figure is already in its own column).
x = data.drop(columns=['price', 'id', 'engine'])

In [301]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [302]:
# Columns routed to the one-hot encoder in the preprocessing step.
categoric_cols = [
    'brand',
    'model',
    'fuel_type',
    'transmission',
    'ext_col',
    'int_col',
    'accident',
    'clean_title',
]
# Columns routed to the standard scaler in the preprocessing step.
numeric_cols = [
    'model_year',
    'milage',
    'horsepower',
]

In [303]:
# Per-column preprocessing: standardize the numeric columns and one-hot encode
# the categorical ones. `handle_unknown='ignore'` lets the encoder accept
# categories at predict time that were absent when it was fitted.
preprocessor = ColumnTransformer([
    ('numerical_features', StandardScaler(), numeric_cols),
    ('categorical_cols', OneHotEncoder(handle_unknown='ignore'), categoric_cols),
])

In [304]:
# Gradient-boosted regressor with library-default hyperparameters; the seed is
# pinned so repeated runs are comparable.
model = xgb.XGBRegressor(random_state=42)

In [305]:
# Chain preprocessing and the regressor so a single fit/predict call applies both.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [306]:
# Fit the preprocessing steps and the regressor on the full training table.
model_pipeline.fit(x, y)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical_features', ...), ('categorical_cols', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [310]:
# Build the test feature frame by removing the same non-feature columns that
# were removed from the training features (the test table has no `price`).
test_features = test_data.drop(['id', 'engine'], axis='columns')

In [312]:
# Predict a price for every test row with the fitted pipeline.
test_predictions = model_pipeline.predict(test_features)

In [314]:
# Pair each test row's identifier with its predicted price.
submission_df = pd.DataFrame({'id': test_data['id'], 'price': test_predictions})

In [315]:
# Write the predictions to disk without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)