In [3]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('CarPrice_Assignment.csv')  # Rename file if needed

# Check basic info
df.info()
df.describe()

# Preprocessing steps
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

# Example: Extract company name from car name
df['CarCompany'] = df['CarName'].apply(lambda x: x.split()[0].lower())

# Encode categorical columns
df = pd.get_dummies(df, drop_first=True)

# Feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = df.drop('price', axis=1)
X_scaled = scaler.fit_transform(X)
y = df['price']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

trained_models = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model


In [7]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

results = []
for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    results.append({
        "Model": name,
        "R2 Score": r2_score(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
print(results_df.sort_values(by="R2 Score", ascending=False))


                      Model      R2 Score           MSE           MAE
2             Random Forest  9.512947e-01  3.844992e+06  1.394482e+03
3         Gradient Boosting  9.342308e-01  5.192085e+06  1.647032e+03
1             Decision Tree  8.591294e-01  1.112089e+07  2.038711e+03
4  Support Vector Regressor -1.020652e-01  8.700146e+07  5.707064e+03
0         Linear Regression -3.499440e+27  2.762598e+35  3.186273e+17


In [None]:
# For Random Forest
import matplotlib.pyplot as plt
import seaborn as sns

importances = trained_models['Random Forest'].feature_importances_
features = X.columns
feat_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feat_importance_df = feat_importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feat_importance_df.head(10), x='Importance', y='Feature')
plt.title("Top 10 Important Features Affecting Car Price")
plt.show()
