In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
def handle_bad_lines(line):
    #print(f"Skipping bad line: {line}")
    # return None to skip the bad line
    return None


car_prices= pd.read_csv('car_prices.csv', engine='python', on_bad_lines=handle_bad_lines)
car_prices.drop(columns=['vin','saledate','seller','state','model','trim'], inplace=True)
print(car_prices.head())

   year   make   body transmission  condition  odometer  color interior   
0  2015    Kia    SUV    automatic        5.0   16639.0  white    black  \
1  2015    Kia    SUV    automatic        5.0    9393.0  white    beige   
2  2014    BMW  Sedan    automatic        4.5    1331.0   gray    black   
3  2015  Volvo  Sedan    automatic        4.1   14282.0  white    black   
4  2014    BMW  Sedan    automatic        4.3    2641.0   gray    black   

     mmr  sellingprice  
0  20500         21500  
1  20800         21500  
2  31900         30000  
3  27500         27750  
4  66000         67000  


In [3]:
car_prices.dropna(inplace=True)
print(car_prices.shape[0])

472419


In [4]:
car_prices_encoded = pd.get_dummies(car_prices, columns=['make', 'body', 'transmission', 'color', 'interior'], drop_first=True)

In [5]:
X = car_prices_encoded.drop('sellingprice', axis=1)
y = car_prices_encoded['sellingprice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
lin_reg = LinearRegression()

In [7]:
lin_reg.fit(X_train, y_train)

In [8]:
y_pred = lin_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 2323280.894261018
R-squared: 0.9744840908374447


In [9]:
test_data = car_prices_encoded.sample(n=10, random_state=100)
X_test = test_data.drop('sellingprice',axis=1)
y_pred = lin_reg.predict(X_test)
y_true=test_data['sellingprice']
print(test_data)

        year  condition  odometer    mmr  sellingprice  make_Aston Martin   
246     2013        4.1   29731.0  22100         21000              False  \
267869  2012        4.1   12800.0   9100          8100              False   
24267   2014        3.9   29360.0  12600         12500              False   
481962  2013        2.9   46950.0   8825          8700              False   
20358   2006        2.8  146696.0  14300         10500              False   
259852  2006        2.2  131034.0   2000          1900              False   
141934  2013        2.0   32019.0  11700          9600              False   
351650  2013        4.8   12749.0  13050         12400              False   
349308  2012        3.4   54272.0  25200         22600              False   
122220  2013        3.2   25705.0   8550          8400              False   

        make_Audi  make_BMW  make_Bentley  make_Buick  ...  interior_green   
246         False     False         False       False  ...           False

In [14]:
for i in range(len(y_true)):
    print(f"Predicted selling price: {y_pred[i]:.2f}")
    print(f"Actual selling price:    {y_true.iloc[i]}")

Predicted selling price: 22153.01
Actual selling price:    21000
Predicted selling price: 9623.92
Actual selling price:    8100
Predicted selling price: 12393.22
Actual selling price:    12500
Predicted selling price: 7997.72
Actual selling price:    8700
Predicted selling price: 13915.01
Actual selling price:    10500
Predicted selling price: 1377.32
Actual selling price:    1900
Predicted selling price: 10083.98
Actual selling price:    9600
Predicted selling price: 13898.08
Actual selling price:    12400
Predicted selling price: 24447.82
Actual selling price:    22600
Predicted selling price: 8165.15
Actual selling price:    8400
