<a href="https://colab.research.google.com/github/saanidhi-git/OIBSIP_datascience_3/blob/car-colab-notebook/Car_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

#  1: UPLOAD AND UNZIP PROJECT FILES
#
# Asks for the project archive through the Colab upload widget, extracts it
# into the current working directory, then checks that the dataset CSV is
# where the rest of the notebook expects it.

import os
import zipfile

from google.colab import files

# 1. Define folder name and expected CSV path
FOLDER_NAME = 'car prediction' # Must match the folder created by your zip
ZIP_FILE = 'car prediction.zip'
DATASET_PATH = f'{FOLDER_NAME}/car data.csv'

# 2. Upload the zip file
print(f"Please upload your '{ZIP_FILE}' file now:")
uploaded = files.upload()  # keys are the names the uploaded files were saved under

# 3. Locate the uploaded archive
# Prefer the exact expected name. On a re-run Colab may save the upload under
# a suffixed name (e.g. 'car prediction (1).zip') because the original file
# already exists, so fall back to the first uploaded '.zip' file.
if ZIP_FILE in uploaded:
    archive_name = ZIP_FILE
else:
    archive_name = next(
        (name for name in uploaded if name.lower().endswith('.zip')),
        None,
    )

# 4. Unzip the file
if archive_name is not None:
    print(f"\nUnzipping the project folder '{archive_name}'...")
    # extractall() overwrites files that already exist, so re-running is safe
    with zipfile.ZipFile(archive_name) as archive:
        archive.extractall()
        for member in archive.namelist():
            print(f"  extracted: {member}")
else:
    print(f"\nERROR: Did not find '{ZIP_FILE}' in uploaded files.")
    print("Please ensure the zip file is uploaded correctly before proceeding.")

# 5. Verify file path (Optional)
print(f"\nVerifying if 'car data.csv' is at: {DATASET_PATH}")
if os.path.exists(DATASET_PATH):
    print("Success! File found.")
else:
    print("ERROR: File not found at the expected path. Check your zip contents.")

Please upload your 'car prediction.zip' file now:


Saving car prediction.zip to car prediction.zip

Unzipping the project folder 'car prediction.zip'...
Archive:  car prediction.zip
   creating: car prediction/
  inflating: car prediction/app.py   
  inflating: car prediction/car data.csv  
  inflating: car prediction/car_prediction.py  
  inflating: car prediction/car_price_model.pkl  

Verifying if 'car data.csv' is at: car prediction/car data.csv
Success! File found.


In [2]:

#  2: IMPORTS AND SETUP


!pip install joblib -qq

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

DATASET_PATH = 'car prediction/car data.csv'

print("All dependencies are ready!")

All dependencies are ready!


In [3]:

# CELL 3: PIPELINE EXECUTION (Load, Transform, Train)
#
# Reads the CSV at DATASET_PATH, prints a quick overview, splits the rows into
# train/test sets, fits a preprocessing + RandomForest pipeline on the training
# rows, and saves the fitted pipeline with joblib.

import os

#  LOAD THE DATASET
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"CRITICAL ERROR: File not found at {DATASET_PATH}. Please check Cell 1.")
    # Re-raise so this cell (and a "Run all") stops with a normal traceback.
    # exit() acts on the whole kernel session instead of just failing the cell.
    raise

print("✅ Data Loaded Successfully\n")
print("Shape of dataset:", df.shape)
print("\nFirst 8 rows of data:\n", df.head(8))
print("\nDescription of data:\n", df.describe(include='all'))
print("\nMissing values in each column:\n", df.isnull().sum())


#  DEFINE FEATURES (X) AND TARGET (y)
# Car_Name is dropped from the inputs; Selling_Price is the value to predict.
X = df.drop(columns=['Car_Name', 'Selling_Price'])
y = df['Selling_Price']
print("\nX shape:", X.shape, "| y shape:", y.shape)


#  SPLIT INTO TRAINING AND TESTING DATA
# 20% of the rows are held out; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"\nTrain set: {X_train.shape}, Test set: {X_test.shape}")


#  DEFINE CATEGORICAL & NUMERICAL COLUMNS
categorical_cols = ['Fuel_Type', 'Selling_type', 'Transmission']
numeric_cols = ['Year', 'Present_Price', 'Driven_kms', 'Owner']


#  CREATE AND COMBINE TRANSFORMERS (Preprocessor)
# handle_unknown='ignore' keeps prediction from failing on a category that was
# not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_transformer = StandardScaler()

# Output column order is categorical one-hot columns first, then numeric columns.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols)
])


#  DEFINE MODEL AND CREATE THE PIPELINE
model = RandomForestRegressor(random_state=42)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])


# TRAIN (FIT) THE MODEL
# The preprocessor is fitted inside the pipeline, on the training rows only.
pipe.fit(X_train, y_train)
print("\n🚀 Model training complete!")


#  SAVE TRAINED MODEL
# The model is written next to the dataset. The folder is derived from
# DATASET_PATH so this cell does not rely on FOLDER_NAME, which only Cell 1
# defines.
MODEL_FILE_PATH = os.path.join(os.path.dirname(DATASET_PATH), 'car_price_model.pkl')
joblib.dump(pipe, MODEL_FILE_PATH)
print(f"\n💾 Model saved successfully as '{MODEL_FILE_PATH}'!")

✅ Data Loaded Successfully

Shape of dataset: (301, 9)

First 8 rows of data:
         Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0           ritz  2014           3.35           5.59       27000    Petrol   
1            sx4  2013           4.75           9.54       43000    Diesel   
2           ciaz  2017           7.25           9.85        6900    Petrol   
3        wagon r  2011           2.85           4.15        5200    Petrol   
4          swift  2014           4.60           6.87       42450    Diesel   
5  vitara brezza  2018           9.25           9.83        2071    Diesel   
6           ciaz  2015           6.75           8.12       18796    Petrol   
7        s cross  2015           6.50           8.61       33429    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      

In [4]:

# 4: EVALUATION AND FEATURE IMPORTANCE
#
# Scores the fitted pipeline on the held-out test rows and lists which input
# features the random forest relied on most.

# MAKE PREDICTIONS
y_pred = pipe.predict(X_test)


#  EVALUATE THE MODEL
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5  # square root puts the error back in the target's own units
r2 = r2_score(y_test, y_pred)

print("✅ Model Evaluation Results:")
# RMSE is expressed in whatever unit Selling_Price uses. The sample values
# (e.g. 3.35 for a 2014 car) are far too small to be thousands of rupees.
# NOTE(review): the column is presumably in lakhs of rupees — confirm against
# the dataset's documentation before naming a unit here.
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} (Error in the same units as Selling_Price)")
print(f"R² Score: {r2:.2f} (Closeness of fit to the data)")


#  CHECK FEATURE IMPORTANCE
#  Get feature names from the OneHotEncoder step
# We access the named_transformers_ in the preprocessor step of the pipeline
ohe_features = pipe.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_cols)
# Order must mirror the ColumnTransformer definition: 'cat' columns first, then 'num'.
all_features = list(ohe_features) + numeric_cols

# Get importances from the model step
importances = pipe.named_steps['model'].feature_importances_

feature_importances = pd.DataFrame({
    'Feature': all_features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\n📊 Top 10 Important Features Affecting Car Price:")
print(feature_importances.head(10))

✅ Model Evaluation Results:
Root Mean Squared Error (RMSE): 0.93 (Error in thousands of Rupees)
R² Score: 0.96 (Closeness of fit to the data)

📊 Top 10 Important Features Affecting Car Price:
                    Feature  Importance
8             Present_Price    0.876006
7                      Year    0.056619
9                Driven_kms    0.039627
6       Transmission_Manual    0.008432
5    Transmission_Automatic    0.008201
1          Fuel_Type_Diesel    0.004780
4   Selling_type_Individual    0.002107
2          Fuel_Type_Petrol    0.001980
3       Selling_type_Dealer    0.001459
10                    Owner    0.000768
