In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('Cars.csv')

# Data Cleaning and Preprocessing

## Clean 'mileage', 'engine' and 'max_power' columns
## Each column is text carrying a unit suffix: strip the suffix, then parse.
## errors='coerce' turns anything unparseable (e.g. a bare ' bhp') into NaN
## instead of raising, so all three columns are treated the same way.
## NOTE(review): 'mileage' mixes kmpl and km/kg readings on one numeric
## scale -- confirm that this is intended.
unit_suffixes = {
    'mileage': (' kmpl', ' km/kg'),
    'engine': (' CC',),
    'max_power': (' bhp',),
}
for column, suffixes in unit_suffixes.items():
    stripped = df[column]
    for suffix in suffixes:
        # Literal match: the suffixes are plain text, not patterns.
        stripped = stripped.str.replace(suffix, '', regex=False)
    df[column] = pd.to_numeric(stripped, errors='coerce').astype(float)

## Convert 'owner' column to numerical
owner_mapping = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 0
}
# Series.map yields NaN for labels missing from the mapping; fail loudly
# here rather than letting the NaN surface later as an opaque fit error.
unknown_owners = set(df['owner'].dropna()) - set(owner_mapping)
if unknown_owners:
    raise ValueError(
        f"Unmapped 'owner' categories: {sorted(map(str, unknown_owners))}"
    )
df['owner'] = df['owner'].map(owner_mapping)

## Handle missing values
## Drop every row lacking one of the numeric inputs (not only 'seats' and
## 'max_power'): LinearRegression.fit rejects NaN in any feature column.
df = df.dropna(subset=['seats', 'mileage', 'engine', 'max_power', 'owner'])

## Convert categorical columns to numerical using one-hot encoding
## drop_first=True removes the first level of each column, which then acts
## as the baseline category.
df = pd.get_dummies(df, columns=['fuel', 'seller_type', 'transmission'], drop_first=True)

## Define features and target
numeric_features = ['year', 'km_driven', 'owner', 'mileage', 'engine', 'max_power', 'seats']
dummy_features = ['fuel_Diesel', 'fuel_Petrol', 'fuel_LPG',
                  'seller_type_Individual', 'seller_type_Trustmark Dealer',
                  'transmission_Manual']
features = numeric_features + dummy_features
target = 'selling_price'

# Handle missing columns in one-hot encoding
# A level that is absent from the data (or that became the dropped
# baseline) produces no indicator column. Add it as all zeros for every
# one-hot feature -- not just the fuel ones -- so df[features] cannot
# raise KeyError.
for feature in dummy_features:
    if feature not in df.columns:
        df[feature] = 0

X = df[features]
# The model is trained on the natural log of the price.
y = np.log(df[target])

# Model Training (example with Linear Regression)
# Baseline: ordinary least squares on an 80/20 hold-out split with a fixed
# seed. The score is computed against y as passed in, i.e. on whatever
# scale the target was transformed to before this step.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2}')


R^2 Score: 0.8584464768106042
