In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd  
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import make_column_transformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, GridSearchCV

In [3]:
cars_df = pd.read_csv(r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Used Card Price\train.csv", index_col=0)
cars_df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [4]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         188533 non-null  object
 1   model         188533 non-null  object
 2   model_year    188533 non-null  int64 
 3   milage        188533 non-null  int64 
 4   fuel_type     183450 non-null  object
 5   engine        188533 non-null  object
 6   transmission  188533 non-null  object
 7   ext_col       188533 non-null  object
 8   int_col       188533 non-null  object
 9   accident      186081 non-null  object
 10  clean_title   167114 non-null  object
 11  price         188533 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 18.7+ MB


In [5]:
cars_df.isnull().sum()

brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [6]:
print(cars_df['fuel_type'].value_counts())

fuel_type
Gasoline          165940
Hybrid              6832
E85 Flex Fuel       5406
Diesel              3955
–                    781
Plug-In Hybrid       521
not supported         15
Name: count, dtype: int64


In [7]:
print(cars_df['accident'].value_counts())

accident
None reported                             144514
At least 1 accident or damage reported     41567
Name: count, dtype: int64


In [8]:
print(len(cars_df['engine'].unique()))

1117


In [9]:
print(cars_df['transmission'].value_counts())

transmission
A/T                                                   49904
8-Speed A/T                                           20645
Transmission w/Dual Shift Mode                        19255
6-Speed A/T                                           18044
6-Speed M/T                                           11998
7-Speed A/T                                           11124
Automatic                                             10691
8-Speed Automatic                                      8431
10-Speed A/T                                           8044
9-Speed A/T                                            3866
5-Speed A/T                                            3217
10-Speed Automatic                                     3164
6-Speed Automatic                                      2799
4-Speed A/T                                            2546
5-Speed M/T                                            2409
9-Speed Automatic                                      2325
CVT Transmission           

In [10]:
print(len(cars_df['clean_title']) - cars_df['clean_title'].value_counts()['Yes'])
print(cars_df['clean_title'].value_counts())

21419
clean_title
Yes    167114
Name: count, dtype: int64


In [11]:
X_train = cars_df.drop('price', axis=1)
y_train = cars_df['price']

In [12]:
y_test = pd.read_csv(r'C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Used Card Price\sample_submission.csv')

In [13]:
ohe = OneHotEncoder(
    sparse_output=False, 
    drop='first',
    handle_unknown='infrequent_if_exist',
    min_frequency=3000
).set_output(transform='pandas')



ct_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object)),
    verbose_feature_names_out=False
).set_output(transform='pandas')

In [14]:
X_test = pd.read_csv(r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Used Card Price\test.csv", index_col=0)

In [15]:
X_train_ohe = ct_ohe.fit_transform(X_train)
X_test_ohe = ct_ohe.transform(X_test)

In [16]:
X_train_ohe.isnull().sum().sum()

0

---

Linear Regression

In [17]:
lr = LinearRegression()
lr.fit(X_train_ohe, y_train)
y_pred = lr.predict(X_test_ohe)

In [18]:
y_pred[y_pred< 150] = 150

In [19]:
print('R2: ',r2_score(y_test['price'], y_pred))
print('MAE: ',mean_absolute_error(y_test['price'], y_pred))
print('MSE: ',mean_squared_error(y_test['price'], y_pred))
print('RMSE: ',root_mean_squared_error(y_test['price'], y_pred))

R2:  -1.1318248362524035e+30
MAE:  19457.81823605122
MSE:  539264771.3399001
RMSE:  23222.075086862933


---
Elastic Regression

In [20]:
en = ElasticNet()
 
kfold = KFold(n_splits=5, random_state=24, shuffle=True)
pipe = Pipeline(
    [
        ('CT', ct_ohe),
        ('EN' , en)
    ]
)

params = {
    'EN__alpha': np.linspace(0.001, 5, 3),
    'EN__l1_ratio': np.linspace(0.1, 1, 3)
}


gcv = GridSearchCV(
    pipe, 
    param_grid=params,
    scoring='r2',
    cv=kfold,
    verbose=2,
)

gcv.fit(X_train, y_train)

y_pred = gcv.best_estimator_.predict(X_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ..................EN__alpha=0.001, EN__l1_ratio=0.1; total time=   3.5s
[CV] END ..................EN__alpha=0.001, EN__l1_ratio=0.1; total time=   3.6s
[CV] END ..................EN__alpha=0.001, EN__l1_ratio=0.1; total time=   3.6s
[CV] END ..................EN__alpha=0.001, EN__l1_ratio=0.1; total time=   3.6s
[CV] END ..................EN__alpha=0.001, EN__l1_ratio=0.1; total time=   3.4s
[CV] END .................EN__alpha=0.001, EN__l1_ratio=0.55; total time=   3.4s
[CV] END .................EN__alpha=0.001, EN__l1_ratio=0.55; total time=   3.4s
[CV] END .................EN__alpha=0.001, EN__l1_ratio=0.55; total time=   3.6s
[CV] END .................EN__alpha=0.001, EN__l1_ratio=0.55; total time=   3.5s
[CV] END .................EN__alpha=0.001, EN__l1_ratio=0.55; total time=   3.6s
[CV] END ..................EN__alpha=0.001, EN__l1_ratio=1.0; total time=   3.7s
[CV] END ..................EN__alpha=0.001, EN__l

In [21]:
gcv.best_params_, gcv.best_score_

({'EN__alpha': 2.5004999999999997, 'EN__l1_ratio': 1.0}, 0.09453570194403726)

In [22]:
y_pred[y_pred< 150] = 150

In [23]:
print('R2: ',r2_score(y_test['price'], y_pred))
print('MAE: ',mean_absolute_error(y_test['price'], y_pred))
print('MSE: ',mean_squared_error(y_test['price'], y_pred))
print('RMSE: ',root_mean_squared_error(y_test['price'], y_pred))

R2:  -1.1292562662574492e+30
MAE:  19433.094424059476
MSE:  538040960.6700567
RMSE:  23195.709962621466


In [24]:
sanple_out = y_test.copy()
sanple_out['price'] = y_pred
sanple_out.to_csv('UsedCarsKnnRegression.csv', index=False)

---
KNN Regression

In [25]:
scaler_std = StandardScaler()
knnr = KNeighborsRegressor()

pipe = Pipeline(
    [
        ('CT', ct_ohe),
        ('SCL', scaler_std),
        ('KNN', knnr)
    ]
)

params = {
        'KNN__n_neighbors': np.arange(1, 50, 5),     
        # 'KNN__metric': ['cityblock','haversine', 'manhattan', 'minkowski'] # Calculate hote hote maut ho jayegi
}

gcv = GridSearchCV(
    pipe, 
    param_grid=params,
    scoring='r2',
    cv=5,
    verbose=2, 
)


gcv.fit(X_train, y_train)

# Print results for all metrics
print("\nBest Parameters:", gcv.best_params_)
print("\nBest neg_log_loss Score:", gcv.best_score_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .................................KNN__n_neighbors=1; total time=  19.0s
[CV] END .................................KNN__n_neighbors=1; total time=  18.4s
[CV] END .................................KNN__n_neighbors=1; total time=  18.4s
[CV] END .................................KNN__n_neighbors=1; total time=  18.0s
[CV] END .................................KNN__n_neighbors=1; total time=  18.4s
[CV] END .................................KNN__n_neighbors=6; total time=  18.2s
[CV] END .................................KNN__n_neighbors=6; total time=  19.1s
[CV] END .................................KNN__n_neighbors=6; total time=  18.3s
[CV] END .................................KNN__n_neighbors=6; total time=  18.6s
[CV] END .................................KNN__n_neighbors=6; total time=  19.3s
[CV] END ................................KNN__n_neighbors=11; total time=  18.9s
[CV] END ................................KNN__n_

In [26]:
y_pred = gcv.best_estimator_.predict(X_test)

In [27]:
y_pred[y_pred< 150] = 150

In [28]:
print('R2: ',r2_score(y_test['price'], y_pred))
print('MAE: ',mean_absolute_error(y_test['price'], y_pred))
print('MSE: ',mean_squared_error(y_test['price'], y_pred))
print('RMSE: ',root_mean_squared_error(y_test['price'], y_pred))

R2:  -1.495803547358748e+30
MAE:  20480.640085077503
MSE:  712684624.0684038
RMSE:  26696.153731734536
