In [232]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [192]:
# Read the train and test CSV files into DataFrames (same reader, two paths).
train_df, test_df = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))

In [193]:
# Keep the 'id' columns of both sets; the test ids are needed for the submission file.
train_ids, test_ids = train_df['id'], test_df['id']

In [194]:
def extract_engine_displacement(engine_str):
    """Extract the engine displacement in litres from an engine description.

    Looks for the first decimal number immediately followed by ``L``
    (e.g. ``"3.5L"``).

    Parameters
    ----------
    engine_str : str or missing value
        Raw engine description. Non-string values (``None``, ``NaN``) are
        treated as "no information" instead of raising ``TypeError``.

    Returns
    -------
    float or None
        The displacement as a float, or ``None`` when the input is not a
        string or contains no ``<digits>.<digits>L`` token.
    """
    # Series.apply passes NaN (a float) for missing cells; re.search would
    # raise TypeError on it, so bail out before touching the regex.
    if not isinstance(engine_str, str):
        return None

    match = re.search(r'(\d+\.\d+)L', engine_str)
    return float(match.group(1)) if match else None

def extract_engine_type(engine_str):
    """Extract the cylinder layout token from an engine description.

    Recognises, in order of appearance in the text, ``V<n>``,
    ``Straight <n>`` or ``<n> Cylinder``.

    Parameters
    ----------
    engine_str : str or missing value
        Raw engine description. Non-string values (``None``, ``NaN``) are
        mapped to ``'Unknown'`` instead of raising ``TypeError``.

    Returns
    -------
    str
        The matched token (e.g. ``'V6'``, ``'Straight 6'``, ``'8 Cylinder'``)
        or ``'Unknown'`` when nothing matches or the input is not a string.
    """
    # Missing cells arrive as NaN/None through Series.apply; treat them the
    # same as descriptions that contain no recognisable layout.
    if not isinstance(engine_str, str):
        return 'Unknown'

    match = re.search(r'(V\d+|Straight \d+|\d+ Cylinder)', engine_str)
    return match.group(1) if match else 'Unknown'

# Feature engineering on the raw 'engine' text: derive horsepower,
# displacement and cylinder layout. The same steps run on both frames so the
# train and test sets end up with the same engineered columns.
for frame in (train_df, test_df):
    engine_text = frame['engine']
    frame['engine_hp'] = engine_text.str.extract(r'(\d+\.\d+)HP').astype(float)
    frame['engine_displacement'] = engine_text.apply(extract_engine_displacement)
    frame['engine_type'] = engine_text.apply(extract_engine_type)

# The raw text column is dropped once the derived features exist.
train_df = train_df.drop(columns='engine')
test_df = test_df.drop(columns='engine')

In [195]:
train_df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,engine_hp,engine_displacement,engine_type
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,10-Speed A/T,Blue,Gray,None reported,Yes,11000,375.0,3.5,V6
1,1,BMW,335 i,2007,80000,Gasoline,6-Speed M/T,Black,Black,None reported,Yes,8250,300.0,3.0,Straight 6
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,6-Speed A/T,Purple,Beige,None reported,Yes,15000,300.0,4.2,8 Cylinder
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500,335.0,3.0,Straight 6
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,A/T,White,Black,None reported,Yes,7850,200.0,3.8,V6


In [196]:
test_df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,engine_hp,engine_displacement,engine_type
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,A/T,White,Beige,None reported,Yes,302.0,3.5,V6
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,8-Speed A/T,Silver,Black,None reported,Yes,275.0,3.5,V6
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,7-Speed A/T,Blue,White,None reported,Yes,241.0,2.0,4 Cylinder
3,54276,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,Transmission w/Dual Shift Mode,White,White,At least 1 accident or damage reported,Yes,518.0,5.0,8 Cylinder
4,54277,BMW,X6 xDrive40i,2020,90000,Gasoline,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes,335.0,3.0,Straight 6


In [197]:
# Simplify 'transmission' by keeping only its first whitespace-separated token
# (e.g. '10-Speed A/T' -> '10-Speed').
for frame in (train_df, test_df):
    frame['transmission'] = frame['transmission'].str.split().str.get(0)


In [198]:
# Reduce the cardinality of the colour columns: keep the five most frequent
# values seen in the training set and collapse everything else into 'Other'.
# The kept values come from train only and are reused unchanged for test.
top_ext_colors = train_df['ext_col'].value_counts().index[:5]
top_int_colors = train_df['int_col'].value_counts().index[:5]

for column, kept_values in (('ext_col', top_ext_colors), ('int_col', top_int_colors)):
    for frame in (train_df, test_df):
        is_kept = frame[column].isin(kept_values)
        frame[column] = frame[column].where(is_kept, 'Other')


In [199]:
train_df.ext_col.value_counts()

ext_col
Black     15078
White     13422
Other      8035
Gray       7909
Silver     5161
Blue       4668
Name: count, dtype: int64

In [200]:
train_df.int_col.value_counts()

int_col
Black    31820
Beige     7995
Gray      6252
Other     4468
Brown     2239
Red       1499
Name: count, dtype: int64

In [201]:
train_df.head(5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,engine_hp,engine_displacement,engine_type
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,10-Speed,Blue,Gray,None reported,Yes,11000,375.0,3.5,V6
1,1,BMW,335 i,2007,80000,Gasoline,6-Speed,Black,Black,None reported,Yes,8250,300.0,3.0,Straight 6
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,6-Speed,Other,Beige,None reported,Yes,15000,300.0,4.2,8 Cylinder
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,Transmission,Gray,Brown,None reported,Yes,63500,335.0,3.0,Straight 6
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,A/T,White,Black,None reported,Yes,7850,200.0,3.8,V6


In [202]:
# Inspect the distinct values of 'clean_title' in the training set
# (value_counts() leaves out missing entries by default).
train_df.clean_title.value_counts()

clean_title
Yes    54273
Name: count, dtype: int64

In [203]:
# Drop columns that are not used as model inputs. The ids were saved
# separately in train_ids / test_ids before this point.
unused_columns = ['clean_title', 'id']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [204]:
train_df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,price,engine_hp,engine_displacement,engine_type
0,Ford,F-150 Lariat,2018,74349,Gasoline,10-Speed,Blue,Gray,None reported,11000,375.0,3.5,V6
1,BMW,335 i,2007,80000,Gasoline,6-Speed,Black,Black,None reported,8250,300.0,3.0,Straight 6
2,Jaguar,XF Luxury,2009,91491,Gasoline,6-Speed,Other,Beige,None reported,15000,300.0,4.2,8 Cylinder
3,BMW,X7 xDrive40i,2022,2437,Hybrid,Transmission,Gray,Brown,None reported,63500,335.0,3.0,Straight 6
4,Pontiac,Firebird Base,2001,111000,Gasoline,A/T,White,Black,None reported,7850,200.0,3.8,V6


In [205]:
test_df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,engine_hp,engine_displacement,engine_type
0,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,A/T,White,Beige,None reported,302.0,3.5,V6
1,Lexus,RX 350 Base,2015,128032,Gasoline,8-Speed,Silver,Black,None reported,275.0,3.5,V6
2,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,7-Speed,Blue,Other,None reported,241.0,2.0,4 Cylinder
3,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,Transmission,White,Other,At least 1 accident or damage reported,518.0,5.0,8 Cylinder
4,BMW,X6 xDrive40i,2020,90000,Gasoline,8-Speed,White,Black,At least 1 accident or damage reported,335.0,3.0,Straight 6


In [206]:
# Separate the target ('price') from the feature columns.
y = train_df['price']
X = train_df.drop(columns='price')

In [207]:
X.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,engine_hp,engine_displacement,engine_type
0,Ford,F-150 Lariat,2018,74349,Gasoline,10-Speed,Blue,Gray,None reported,375.0,3.5,V6
1,BMW,335 i,2007,80000,Gasoline,6-Speed,Black,Black,None reported,300.0,3.0,Straight 6
2,Jaguar,XF Luxury,2009,91491,Gasoline,6-Speed,Other,Beige,None reported,300.0,4.2,8 Cylinder
3,BMW,X7 xDrive40i,2022,2437,Hybrid,Transmission,Gray,Brown,None reported,335.0,3.0,Straight 6
4,Pontiac,Firebird Base,2001,111000,Gasoline,A/T,White,Black,None reported,200.0,3.8,V6


In [208]:
y.head()

0    11000
1     8250
2    15000
3    63500
4     7850
Name: price, dtype: int64

In [209]:
X.shape

(54273, 12)

In [210]:
y.shape

(54273,)

In [218]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [219]:
# Group feature columns by dtype so each group can get its own preprocessing.
numerical_features = list(X.select_dtypes(include='number').columns)
categorical_features = list(X.select_dtypes(include='object').columns)

In [220]:
numerical_features

['model_year', 'milage', 'engine_hp', 'engine_displacement']

In [221]:
categorical_features

['brand',
 'model',
 'fuel_type',
 'transmission',
 'ext_col',
 'int_col',
 'accident',
 'engine_type']

In [222]:
# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; handle_unknown='ignore' lets categories absent from the fitted data
# pass through transform without raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [223]:
# Full model: preprocessing followed by a random forest, bundled in one
# pipeline so the same transformations are applied at fit and predict time.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

In [224]:
# Train the model: the whole pipeline (preprocessing + regressor) is fitted on
# the training split only; X_valid / y_valid are not passed here.
model.fit(X_train, y_train)

In [226]:
# Score the fitted pipeline on the held-out validation split.
y_pred = model.predict(X_valid)

validation_metrics = (
    ('Mean Absolute Error:', mean_absolute_error),
    ('Mean Squared Error:', mean_squared_error),
    ('R² Score:', r2_score),
)
for label, metric in validation_metrics:
    print(label, metric(y_valid, y_pred))


Mean Absolute Error: 17298.80989024259
Mean Squared Error: 2868861037.774498
R² Score: 0.02369900251671253


In [227]:
test_df.columns

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'transmission',
       'ext_col', 'int_col', 'accident', 'engine_hp', 'engine_displacement',
       'engine_type'],
      dtype='object')

In [228]:
# test_df now holds only the feature columns (see the column listing above),
# so it is used as the test feature matrix. This is an alias, not a copy.
X_test = test_df

In [230]:
# Predict prices for the test set with the fitted pipeline.
test_predictions = model.predict(X_test)

# Build the submission table: the ids saved before 'id' was dropped, paired
# with the predicted prices.
submission = test_ids.to_frame(name='id').assign(price=test_predictions)

# Write the file without the DataFrame index so it has exactly two columns.
submission.to_csv('Test_Predictions.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!


In [233]:

# Show where relative paths such as 'Test_Predictions.csv' resolve to.
working_dir = os.getcwd()
print("Current working directory:", working_dir)

Current working directory: /content
