# CAR PRICE PREDICTION WITH MACHINE LEARNING

## Importing Libraries

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Loading Dataset

In [27]:
#Load the dataset
data = pd.read_csv("C:/B.Tech/V SEM/OASIS internship/Task-3/archive (10)/car data.csv")

## EDA

In [28]:
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [30]:
data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.642584,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [32]:
data['Car_Name'].value_counts()

city                        26
corolla altis               16
verna                       14
fortuner                    11
brio                        10
                            ..
Honda CB Trigger             1
Yamaha FZ S                  1
Bajaj Pulsar 135 LS          1
Activa 4g                    1
Bajaj Avenger Street 220     1
Name: Car_Name, Length: 98, dtype: int64

In [33]:
data['Fuel_Type'].value_counts()

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64

In [15]:
data['Transmission'].value_counts()

Manual       261
Automatic     40
Name: Transmission, dtype: int64

In [35]:
data['Owner'].value_counts()

0    290
1     10
3      1
Name: Owner, dtype: int64

In [34]:
data['Year'].value_counts()

2015    61
2016    50
2014    38
2017    35
2013    33
2012    23
2011    19
2010    15
2008     7
2009     6
2006     4
2005     4
2003     2
2007     2
2018     1
2004     1
Name: Year, dtype: int64

In [20]:
data['Selling_type'].value_counts()

Dealer        195
Individual    106
Name: Selling_type, dtype: int64

In [36]:
data.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

## Data preprocessing

In [37]:
# Encode categorical variables (e.g., Fuel_Type, Transmission)
label_encoder = LabelEncoder()
data['Fuel_Type'] = label_encoder.fit_transform(data['Fuel_Type'])
data['Transmission'] = label_encoder.fit_transform(data['Transmission'])

In [38]:
# Handle missing values (if any)
data.dropna(inplace=True)

In [39]:
# Split the dataset into features (X) and target variable (y)
X = data[['Year', 'Present_Price', 'Driven_kms', 'Fuel_Type', 'Transmission', 'Owner']]
y = data['Selling_Price']


In [42]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:

# Choose a regression algorithm (Linear Regression in this example)
model = LinearRegression()

In [43]:
# Train the model
model.fit(X_train, y_train)

In [44]:
# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [45]:
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error: 1.2974909172280398
Mean Squared Error: 3.5539292843772103
R-squared: 0.8457200301886487


In [74]:
# Make predictions for new data points
new_data = pd.DataFrame({
    'Year': [2006],
    'Present_Price': [10.0],
    'Driven_kms': [50000],
    'Fuel_Type': ['Diesel'],  # Use the same label encoding as above
    'Transmission': ['Manual'],  # Use the same label encoding as above
    'Owner': [1]
})

In [75]:
# One-hot encode categorical variables
new_data = pd.get_dummies(new_data, columns=['Fuel_Type', 'Transmission'])

# Ensure that the new_data DataFrame has the same columns as the training data
# You may need to add missing columns if necessary.
# For example, if 'Fuel_Type_Diesel' and 'Fuel_Type_CNG' are missing:
if 'Fuel_Type_Diesel' not in new_data.columns:
    new_data['Fuel_Type_Diesel'] = 0
if 'Fuel_Type_CNG' not in new_data.columns:
    new_data['Fuel_Type_CNG'] = 0


In [76]:
# Ensure 'new_data' includes all one-hot encoded columns from training data
expected_columns = set(X.columns)
missing_columns = expected_columns - set(new_data.columns)

# Add missing columns and set them to 0
for col in missing_columns:
    new_data[col] = 0

# Ensure the order of columns matches the training data
new_data = new_data[X.columns]

# Now you can make predictions
predicted_price = model.predict(new_data)
print(f'Predicted Price: {predicted_price[0]}')


Predicted Price: 5.6533316633339155
