In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
import pandas as pd
df=pd.read_csv(r"E:\Resume Machine Learning Projects\Car price prediction\notebook\cleaned_data.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Price,Manufacturer,Prod. year,Category,Leather interior,Fuel type,Mileage,Cylinders,Gear box type,Color,Airbags
0,0,13328,LEXUS,2010,Jeep,Yes,Hybrid,186005 km,6.0,Automatic,Silver,12
1,1,16621,CHEVROLET,2011,Jeep,No,Petrol,192000 km,6.0,Tiptronic,Black,8
2,2,8467,HONDA,2006,Hatchback,No,Petrol,200000 km,4.0,Variator,Black,2
3,3,3607,FORD,2011,Jeep,Yes,Hybrid,168966 km,4.0,Automatic,White,2
4,4,11726,HONDA,2014,Hatchback,Yes,Petrol,91901 km,4.0,Automatic,Silver,4


In [4]:
df.drop(columns="Unnamed: 0",inplace=True)

In [5]:
df.head()

Unnamed: 0,Price,Manufacturer,Prod. year,Category,Leather interior,Fuel type,Mileage,Cylinders,Gear box type,Color,Airbags
0,13328,LEXUS,2010,Jeep,Yes,Hybrid,186005 km,6.0,Automatic,Silver,12
1,16621,CHEVROLET,2011,Jeep,No,Petrol,192000 km,6.0,Tiptronic,Black,8
2,8467,HONDA,2006,Hatchback,No,Petrol,200000 km,4.0,Variator,Black,2
3,3607,FORD,2011,Jeep,Yes,Hybrid,168966 km,4.0,Automatic,White,2
4,11726,HONDA,2014,Hatchback,Yes,Petrol,91901 km,4.0,Automatic,Silver,4


In [6]:
numerical_cols=[ 'Prod. year', 'Cylinders', 'Airbags']
categorical_cols=['Manufacturer', 'Category', 'Leather interior', 'Fuel type', 'Mileage',
       'Gear box type', 'Color']

from sklearn.preprocessing import OneHotEncoder


num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

cat_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("OneHot_Encoder",OneHotEncoder(handle_unknown="ignore")),
        ("scaler",StandardScaler(with_mean=False))

    ]
)

preprocessor=ColumnTransformer(
    [
        ("num_pipeline",num_pipeline,numerical_cols),
        ("cat_pipeline",cat_pipeline,categorical_cols)
    ]
)

In [7]:
preprocessor

In [8]:
X=df.drop(columns="Price",axis=1)
Y=df["Price"]

In [9]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [10]:
print(X.shape,x_train.shape,x_test.shape)

(16294, 10) (13035, 10) (3259, 10)


In [11]:
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)


In [12]:
x_train.shape

(13035, 5888)

In [13]:
models={
    "LinearRegression":LinearRegression(),
    # "SVR":SVR(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
     "GradientBoostingRegressor":GradientBoostingRegressor(),
     "AdaBoostRegressor":AdaBoostRegressor()
}

In [14]:
def evalute_model(true,predict):
    mse=mean_squared_error(true,predict)
    mae=mean_absolute_error(true,predict)
    score=r2_score(true,predict)

    return mse,mae,score

In [None]:
r2_list=[]
model_list=[]
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)
    

    y_pred_test=model.predict(x_test)
    y_pred_train=model.predict(x_train)



    test_mse,test_mae,test_r2=evalute_model(y_test,y_pred_test)
    train_mse,train_mae,train_r2=evalute_model(y_train,y_pred_train)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(train_mse))
    print("- Mean Absolute Error: {:.4f}".format(train_mae))
    print("- R2 Score: {:.4f}".format(train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(test_mse))
    print("- Mean Absolute Error: {:.4f}".format(test_mae))
    print("- R2 Score: {:.4f}".format(test_r2))
    r2_list.append(test_r2)
    
    print('='*35)
    print('\n')

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 44519994.9697
- Mean Absolute Error: 3259.9189
- R2 Score: 0.8051
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 133254813.0396
- Mean Absolute Error: 7186.8133
- R2 Score: 0.4460


KNeighborsRegressor
Model performance for Training set
- Root Mean Squared Error: 118711836.0917
- Mean Absolute Error: 7363.7699
- R2 Score: 0.4804
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 192251017.0697
- Mean Absolute Error: 9411.4692
- R2 Score: 0.2008


DecisionTreeRegressor
Model performance for Training set
- Root Mean Squared Error: 2371453.7455
- Mean Absolute Error: 131.4325
- R2 Score: 0.9896
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 105326857.3713
- Mean Absolute Error: 5488.4085
- R2 Score: 0.5621




In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Define the model
model = RandomForestRegressor()

# Define hyperparameter grid
params = {
    "n_estimators": [50, 100, 200],  # Number of trees
    "max_depth": [None, 10, 20, 30],  # Depth of trees
    "min_samples_split": [2, 5, 10],  # Minimum samples required to split
    "min_samples_leaf": [1, 2, 4]  # Minimum samples per leaf
}

# Define RandomizedSearchCV
random = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    cv=5,  # 5-fold cross-validation
    n_iter=10,  # Number of parameter combinations to try
    n_jobs=-1,  # Use all available CPU cores for parallel computation
    verbose=2,  # Prints progress
    random_state=42  # For reproducibility
)

# Fit the model
random.fit(x_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
