## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Load Dataset

In [2]:
# Read the raw car-price CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where the file exists at exactly this location.
csv_path = r'C:\Users\User\PROJECTS\car price prediction\data\car_price_dataset.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867


In [3]:
# Preview the last five rows too, to see how the file ends.
data.tail(n=5)

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
9995,Kia,Optima,2004,3.7,Diesel,Semi-Automatic,5794,2,4,8884
9996,Chevrolet,Impala,2002,1.4,Electric,Automatic,168000,2,1,6240
9997,BMW,3 Series,2010,3.0,Petrol,Automatic,86664,5,1,9866
9998,Ford,Explorer,2002,1.4,Hybrid,Automatic,225772,4,1,4084
9999,Volkswagen,Tiguan,2001,2.1,Diesel,Manual,157882,3,3,3342


## Splitting of Data

- The data is split first, before any cleaning or exploration, so that the test set stays completely untouched and can be treated like unseen real-world data.

In [None]:
# Separate the target (Price) from the feature columns.
y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Rebuild a single training frame (features + Price) for the checks that follow.
# assign() returns a new frame, so X_train itself is left unchanged.
df = X_train.assign(Price=y_train)

## Remove Unwanted Columns

- In this project, the dataset has no unnecessary columns, so we keep all of them as they are.

## Handle Missing Values

In [5]:
# For every column of the training frame, print the number of missing values
# and the percentage of rows they represent.
row_count = df.shape[0]
for column in df.columns:
    missing_count = df[column].isna().sum()
    missing_pct = missing_count / row_count * 100
    print('{} - {} - %{}'.format(column, missing_count, missing_pct))

Brand - 0 - %0.0
Model - 0 - %0.0
Year - 0 - %0.0
Engine_Size - 0 - %0.0
Fuel_Type - 0 - %0.0
Transmission - 0 - %0.0
Mileage - 0 - %0.0
Doors - 0 - %0.0
Owner_Count - 0 - %0.0
Price - 0 - %0.0


- The result shows that there are no missing values in our training data.

## Handle Duplicate Rows

In [6]:
# Boolean mask marking every row that repeats an earlier row of the training frame.
duplicate_rows = df.duplicated()

# Report whether at least one such repeated row exists.
has_duplicates = duplicate_rows.any()
print('Duplicate present' if has_duplicates else 'No Duplicates')

No Duplicates


## Numerical Sanity Check

- Select all columns that have a numerical data type.

In [7]:
# Collect the names of all numeric columns of the training frame.
# 'number' matches every numeric dtype (any integer or float width), so a column
# is not silently missed when pandas infers something other than int64/float64.
# Note: the target column Price is numeric and is therefore part of this list.
numerical_columns = df.select_dtypes(include='number').columns.tolist()

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, used to spot implausible values.
df.loc[:, numerical_columns].describe()

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,2011.520375,2.996887,149477.00325,3.503625,2.991875,8828.74525
std,6.897792,1.153237,86613.284937,1.111482,1.420849,3132.164285
min,2000.0,1.0,25.0,2.0,1.0,2000.0
25%,2006.0,2.0,74192.25,3.0,2.0,6604.75
50%,2012.0,3.0,150069.5,4.0,3.0,8829.0
75%,2017.0,4.0,224292.75,4.0,4.0,11071.0
max,2023.0,5.0,299947.0,5.0,5.0,18301.0


## Categorical Sanity Check

- Select all columns that have a categorical (non-numerical) data type.

In [9]:
# Collect the names of all non-numeric columns of the training frame.
# Excluding 'number' removes every numeric dtype (any integer or float width),
# so a numeric column stored as e.g. int32 is not misclassified as categorical.
categorical_columns = df.select_dtypes(exclude='number').columns.tolist()

In [10]:
# Count, number of distinct values, most frequent value and its frequency for
# each object-typed categorical column.
df.loc[:, categorical_columns].describe(include='O')

Unnamed: 0,Brand,Model,Fuel_Type,Transmission
count,8000,8000,8000,8000
unique,10,30,4,3
top,Audi,Accord,Electric,Manual
freq,851,302,2074,2707


- Now we list the possible categories of each categorical column to make sure they contain no unexpected or inconsistent values.

In [11]:
# Print the distinct values of each categorical column, followed by a separator
# line, so misspelled or inconsistent labels are easy to spot.
for column in categorical_columns:
    distinct_values = df[column].unique()
    print(column, distinct_values)
    print('----------')

Brand ['Volkswagen' 'BMW' 'Hyundai' 'Honda' 'Mercedes' 'Audi' 'Ford' 'Kia'
 'Chevrolet' 'Toyota']
----------
Model ['Golf' '5 Series' 'Sonata' 'CR-V' 'E-Class' 'GLA' 'Tiguan' 'A3' 'Focus'
 'Civic' 'Sportage' 'A4' 'Fiesta' 'X5' 'Malibu' 'Explorer' '3 Series' 'Q5'
 'Elantra' 'Camry' 'C-Class' 'Passat' 'Impala' 'Accord' 'Equinox' 'Optima'
 'Rio' 'Tucson' 'Corolla' 'RAV4']
----------
Fuel_Type ['Hybrid' 'Electric' 'Diesel' 'Petrol']
----------
Transmission ['Semi-Automatic' 'Automatic' 'Manual']
----------


## Checkpoint

In [12]:
# Checkpoint: persist the training frame, the held-out test split and the two
# column-name lists with IPython's %store magic so they can be reloaded with
# `%store -r` (presumably by the next notebook of the project — confirm).
# Keep comments on their own lines; %store parses the rest of its line as arguments.
%store df
%store X_test
%store y_test
%store numerical_columns
%store categorical_columns

Stored 'df' (DataFrame)
Stored 'X_test' (DataFrame)
Stored 'y_test' (Series)
Stored 'numerical_columns' (list)
Stored 'categorical_columns' (list)
