In [279]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [280]:
# Read the training CSV into a DataFrame.
data = pd.read_csv('train.csv')

In [281]:
# Read the test CSV into a DataFrame.
test_data = pd.read_csv('test.csv')

In [282]:
# Numeric column names, including the target `price`.
# NOTE(review): not referenced by any later cell visible in this notebook.
numerical_features = [
    'milage',
    'model_year',
    'price',
]

In [283]:
# Categorical column names.
# NOTE(review): later cells use `categoric_cols` instead; this list is not
# referenced again in the cells visible here.
categorical_cols = [
    'brand',
    'model',
    'fuel_type',
    'transmission',
    'int_col',
    'accident',
    'clean_title',
]

In [284]:
import re
def extract_horsepower(engine_str):
    """Pull the horsepower figure out of a free-text engine description.

    Parameters
    ----------
    engine_str : str or missing value
        Engine description, e.g. ``'172.0HP 1.6L 4 Cylinder Engine'``.
        Non-string inputs (``None``, or ``NaN`` from an empty CSV cell) are
        treated as "no horsepower available" instead of raising.

    Returns
    -------
    float or None
        The number immediately preceding ``HP``, or ``None`` when the input
        is not a string or contains no such token.
    """
    # re.search raises TypeError on floats/None, which is what .apply() hands
    # over for missing cells, so guard before matching.
    if not isinstance(engine_str, str):
        return None
    match = re.search(r'(\d+\.?\d*)HP', engine_str)
    return float(match.group(1)) if match else None

In [285]:
# Parse a numeric horsepower column out of the training engine descriptions.
data['horsepower'] = data['engine'].apply(extract_horsepower)

In [286]:
# Parse horsepower for the test set from the test set's own engine strings.
# Reading data['engine'] here would index-align training rows onto test_data
# and give each test row an unrelated car's horsepower.
test_data['horsepower'] = test_data['engine'].apply(extract_horsepower)

In [287]:
# Fill missing categorical values in the training frame with fixed defaults.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment and does not update
# `data` when pandas Copy-on-Write is active.
data['fuel_type'] = data['fuel_type'].fillna('Gasoline')
data['accident'] = data['accident'].fillna('None reported')
data['clean_title'] = data['clean_title'].fillna('Yes')

In [288]:
# Impute missing training horsepower with the training median. Assigned back
# (not inplace on a column selection) so the update is applied to `data`
# itself under pandas Copy-on-Write.
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].median())

In [289]:
# Apply the same fixed defaults to the test frame. Results are assigned back
# rather than using fillna(inplace=True) on a column selection, which is
# chained assignment and does not update the frame under Copy-on-Write.
test_data['fuel_type'] = test_data['fuel_type'].fillna('Gasoline')
test_data['accident'] = test_data['accident'].fillna('None reported')
test_data['clean_title'] = test_data['clean_title'].fillna('Yes')
# Test horsepower is imputed with the *training* median, so no statistic is
# computed from the test set.
test_data['horsepower'] = test_data['horsepower'].fillna(data['horsepower'].median())

In [290]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,LabelEncoder
from sklearn.model_selection import GridSearchCV
# Target column.
y = data['price']
# Feature frame: the target plus id/engine/clean_title are removed.
x = data.drop(columns=['price', 'id', 'engine', 'clean_title'])

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [291]:
# Independent copies of the features, target and test frame, taken before the
# later cell that overwrites categorical columns of data/test_data in place.
x_2 = x.copy()
y_2 = y.copy()
test_data_2 = test_data.copy()

In [292]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [293]:
# Column groups consumed by the encoding loop and the ColumnTransformers below.
categoric_cols = [
    'brand',
    'model',
    'fuel_type',
    'transmission',
    'ext_col',
    'int_col',
    'accident',
]
numeric_cols = [
    'model_year',
    'milage',
    'horsepower',
]

In [294]:
# LabelEncoder instance for integer-coding categorical columns.
le = LabelEncoder()

In [295]:
# Integer-code each categorical column. The mapping is learned from the
# training frame only and then applied to the test frame, so a given category
# receives the same code in both; re-fitting the encoder on the test frame
# would number its categories independently of the training ones.
# Categories that never occur in training are coded as -1.
for col in categoric_cols:
    data[col] = le.fit_transform(data[col])
    codes = {label: code for code, label in enumerate(le.classes_)}
    test_data[col] = test_data[col].map(codes).fillna(-1).astype(int)

In [296]:
# Standardise the numeric columns. No `remainder` is passed, and the fitted
# repr shown further down reports remainder='drop', so every column outside
# numeric_cols is discarded before the model sees the data.
preprocessor = ColumnTransformer(
    transformers=[('numerical_features', StandardScaler(), numeric_cols)]
)

In [297]:
# Gradient-boosted regressor: 100 trees of depth 3.
model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=3,
)

In [298]:
# Chain the numeric preprocessing and the regressor into one estimator.
model_pipeline = Pipeline(
    [
        ('preprocessing', preprocessor),
        ('model', model),
    ]
)

In [299]:
# Pairwise correlations of the remaining columns (id/engine/clean_title removed).
(
    data
    .drop(columns=['engine', 'id', 'clean_title'])
    .corr()
)

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,price,horsepower
brand,1.0,-0.040049,-0.014605,-0.007286,0.043534,0.036984,0.006038,-0.004765,0.005564,0.017657,-0.048659
model,-0.040049,1.0,0.001784,0.041706,0.009617,-0.027031,0.00756,0.07699,-0.017556,-0.029865,-0.033289
model_year,-0.014605,0.001784,1.0,-0.669936,0.010908,0.043596,-0.032189,0.024833,0.256367,0.231795,0.351653
milage,-0.007286,0.041706,-0.669936,1.0,-0.105261,-0.041158,0.027763,-0.030846,-0.321356,-0.283067,-0.388762
fuel_type,0.043534,0.009617,0.010908,-0.105261,1.0,0.093713,-0.010751,0.004114,0.036566,0.015427,-0.013805
transmission,0.036984,-0.027031,0.043596,-0.041158,0.093713,1.0,0.005435,-0.002534,-0.012481,0.01456,0.015964
ext_col,0.006038,0.00756,-0.032189,0.027763,-0.010751,0.005435,1.0,0.067904,-0.020224,-0.017342,-0.033395
int_col,-0.004765,0.07699,0.024833,-0.030846,0.004114,-0.002534,0.067904,1.0,0.000719,0.035174,0.018071
accident,0.005564,-0.017556,0.256367,-0.321356,0.036566,-0.012481,-0.020224,0.000719,1.0,0.125122,0.183637
price,0.017657,-0.029865,0.231795,-0.283067,0.015427,0.01456,-0.017342,0.035174,0.125122,1.0,0.207837


In [300]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid; the 'model__' prefix routes each key to the pipeline
# step named 'model'.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5, 7],
}
    

In [301]:
# 5-fold grid search over param_grid, scored by negated RMSE and run on all
# available cores.
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [302]:
# Display the training target for a quick visual check.
y_train

184031     24000
173831      8250
183819     30000
85525      45999
41872      42000
           ...  
119879     10000
103694     35900
131932     99750
146867      6730
121958    179900
Name: price, Length: 150826, dtype: int64

In [303]:
# Run the grid search on the training split only.
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__max_depth': [3, 5, ...], 'model__n_estimators': [100, 200]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('numerical_features', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [304]:
# The scorer is negated RMSE, so flip the sign back before reporting.
best_cv_rmse = -grid_search.best_score_
holdout_rmse = -grid_search.score(x_test, y_test)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score (RMSE):", best_cv_rmse)
print("Test Score (RMSE):", holdout_rmse)

Best Parameters: {'model__max_depth': 3, 'model__n_estimators': 100}
Best CV Score (RMSE): 74805.821875
Test Score (RMSE): 69321.0859375


In [305]:
# Remove the same non-feature columns from both versions of the test frame.
unused_cols = ['id', 'engine', 'clean_title']
test_data_final = test_data.drop(columns=unused_cols)
test_data_2_final = test_data_2.drop(columns=unused_cols)

In [306]:
# Fit the pipeline on the full training data (x, y). This uses whatever
# hyperparameters `model` currently holds; nothing is read from grid_search here.
model_pipeline.fit(x, y)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numerical_features', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [307]:
# Predict prices for the test set with the numeric-only pipeline.
test_data_pred = model_pipeline.predict(test_data_final)

In [308]:
# Pair each test id with its predicted price.
submission_df_two = pd.DataFrame(
    {'id': test_data['id'], 'price': test_data_pred}
)

In [309]:
# Write the submission without the DataFrame index column.
submission_df_two.to_csv('submission_two.csv', index=False)

In [310]:
# Second preprocessor: scale the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets transform accept categories
# that were not present when the encoder was fitted.
preprocessor_2 = ColumnTransformer(
    transformers=[
        ('numeric_cols', StandardScaler(), numeric_cols),
        ('categroic_cols', OneHotEncoder(handle_unknown='ignore'), categoric_cols),
    ]
)

In [311]:
from sklearn.base import clone

# Second pipeline: preprocessor_2 followed by the regressor.
# It receives its own unfitted copy of the estimator (same hyperparameters).
# Passing `model` itself would make this pipeline and model_pipeline share one
# XGBRegressor object, so fitting either pipeline would overwrite the trained
# booster the other one relies on.
pipe_line_2 = Pipeline(
    steps=[
        ('preprocessing_2', preprocessor_2),
        ('model', clone(model)),
    ]
)

In [312]:
# Fit the second pipeline on the copied feature/target data.
pipe_line_2.fit(x_2, y_2)

0,1,2
,steps,"[('preprocessing_2', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric_cols', ...), ('categroic_cols', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [315]:
# Predict with the second pipeline on the copied test frame (test_data_2).
test_data_preds = pipe_line_2.predict(test_data_2)

In [316]:
# Pair each test id with the second pipeline's predicted price.
submission_df_three = pd.DataFrame(
    {'id': test_data['id'], 'price': test_data_preds}
)

In [317]:
# Write the submission without the DataFrame index column.
submission_df_three.to_csv('submission_three.csv', index=False)