In [27]:
# Step 1: Import libraries
import pandas as pd

# Step 2: Load the CSV file
df = pd.read_csv("cars.csv")  # Make sure the CSV is in the same folder, or provide full path

# Step 3: Check the first few rows
print(df.head())

# Step 4: Check for missing values
print(df.isnull().sum())


   CarName   Brand    Year  Mileage      Price
0    Civic   Honda  2018.0  45000.0  2200000.0
1  Corolla  Toyota  2019.0  38000.0  2400000.0
2     City   Honda  2017.0  60000.0  1800000.0
3   Accord   Honda  2020.0  30000.0  3500000.0
4    Camry  Toyota  2018.0  50000.0  3300000.0
CarName    0
Brand      0
Year       2
Mileage    2
Price      2
dtype: int64


In [28]:
# Step 1: Remove duplicate rows
df = df.drop_duplicates()

# Step 2: Handle missing values
# Option 1: Fill missing numerical values with median
df['Year'] = df['Year'].fillna(df['Year'].median())
df['Mileage'] = df['Mileage'].fillna(df['Mileage'].median())
df['Price'] = df['Price'].fillna(df['Price'].median())

# Step 3: Convert categorical columns to numeric using one-hot encoding
df = pd.get_dummies(df, columns=['CarName', 'Brand'], drop_first=True)

# Step 4: Check the cleaned dataset
print(df.head())
print(df.isnull().sum())


     Year  Mileage      Price  CarName_Alto  CarName_Camry  CarName_City  \
0  2018.0  45000.0  2200000.0         False          False         False   
1  2019.0  38000.0  2400000.0         False          False         False   
2  2017.0  60000.0  1800000.0         False          False          True   
3  2020.0  30000.0  3500000.0         False          False         False   
4  2018.0  50000.0  3300000.0         False           True         False   

   CarName_Civic  CarName_Corolla  CarName_Elantra  CarName_Mehran  \
0           True            False            False           False   
1          False             True            False           False   
2          False            False            False           False   
3          False            False            False           False   
4          False            False            False           False   

   CarName_Sonata  CarName_Swift  Brand_Hyundai  Brand_Suzuki  Brand_Toyota  
0           False          False          Fa

In [29]:
# Convert boolean columns to 0 and 1
df = df.astype(int)

# Check the first few rows
print(df.head())


   Year  Mileage    Price  CarName_Alto  CarName_Camry  CarName_City  \
0  2018    45000  2200000             0              0             0   
1  2019    38000  2400000             0              0             0   
2  2017    60000  1800000             0              0             1   
3  2020    30000  3500000             0              0             0   
4  2018    50000  3300000             0              1             0   

   CarName_Civic  CarName_Corolla  CarName_Elantra  CarName_Mehran  \
0              1                0                0               0   
1              0                1                0               0   
2              0                0                0               0   
3              0                0                0               0   
4              0                0                0               0   

   CarName_Sonata  CarName_Swift  Brand_Hyundai  Brand_Suzuki  Brand_Toyota  
0               0              0              0             0       

In [30]:
# Step 1: Import libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Split dataset into features and target
X = df.drop('Price', axis=1)  # Features
y = df['Price']               # Target

# Step 3: Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 5: Predict on test set
y_pred = model.predict(X_test)

# Step 6: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Step 7: Example: predict price for first row in test set
print(f"Predicted Price: {model.predict([X_test.iloc[0]])[0]}")
print(f"Actual Price: {y_test.iloc[0]}")


Mean Squared Error: 95662671918.76831
R^2 Score: 0.9267296455799977
Predicted Price: 1595795.8474308252
Actual Price: 1500000




In [31]:
import joblib

# Save the model
joblib.dump(model, "car_price_model.pkl")

# Save the columns used in training (important for input consistency)
joblib.dump(X.columns.tolist(), "model_columns.pkl")


['model_columns.pkl']