In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

Data Collection and Processing:

In [2]:
# Loading the data from .csv to pandas dataframe
car_dataset = pd.read_csv('car data.csv')

In [3]:
# Checking the first 5 rows of the dataframe:
car_dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# Checking the number of rows and columns:
car_dataset.shape

(301, 9)

In [5]:
# Getting some information about dataset:
car_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
# Checking the number of null values:
car_dataset.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [7]:
# Checking the distribution of categorical data:
print(car_dataset.Fuel_Type.value_counts())
print(car_dataset.Seller_Type.value_counts())
print(car_dataset.Transmission.value_counts())

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64
Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64
Transmission
Manual       261
Automatic     40
Name: count, dtype: int64


Handling the Categorical data:

In [8]:
def ApplyLabelEncoder(car_dataset, col_name):
    le= LabelEncoder()
    car_dataset[col_name] = le.fit_transform(car_dataset[col_name])
    return car_dataset
car_dataset = ApplyLabelEncoder(car_dataset, 'Fuel_Type')
car_dataset = ApplyLabelEncoder(car_dataset, 'Seller_Type')
car_dataset = ApplyLabelEncoder(car_dataset, 'Transmission')

In [9]:
car_dataset.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,2,0,1,0
1,sx4,2013,4.75,9.54,43000,1,0,1,0
2,ciaz,2017,7.25,9.85,6900,2,0,1,0
3,wagon r,2011,2.85,4.15,5200,2,0,1,0
4,swift,2014,4.6,6.87,42450,1,0,1,0


Spliting the data and the Target:

In [10]:
X = car_dataset.drop(['Car_Name', 'Selling_Price'], axis= 1)
Y = car_dataset['Selling_Price']

In [11]:
print(X)

     Year  Present_Price  Kms_Driven  Fuel_Type  Seller_Type  Transmission   
0    2014           5.59       27000          2            0             1  \
1    2013           9.54       43000          1            0             1   
2    2017           9.85        6900          2            0             1   
3    2011           4.15        5200          2            0             1   
4    2014           6.87       42450          1            0             1   
..    ...            ...         ...        ...          ...           ...   
296  2016          11.60       33988          1            0             1   
297  2015           5.90       60000          2            0             1   
298  2009          11.00       87934          2            0             1   
299  2017          12.50        9000          1            0             1   
300  2016           5.90        5464          2            0             1   

     Owner  
0        0  
1        0  
2        0  
3        0 

In [12]:
print(Y)

0       3.35
1       4.75
2       7.25
3       2.85
4       4.60
       ...  
296     9.50
297     4.00
298     3.35
299    11.50
300     5.30
Name: Selling_Price, Length: 301, dtype: float64


Spliting Training and Test data

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 2)

Model Training

In [14]:
model = RandomForestRegressor(min_samples_split=5, n_jobs= -1, oob_score = True)

In [15]:
model.fit(X_train, Y_train)

In [16]:
print('Training Accuracy:', model.score(X,Y))
print('Testing Accuracy:', model.oob_score_)

Training Accuracy: 0.9779316699215433
Testing Accuracy: 0.8929939087057032


In [17]:
model1 = GradientBoostingRegressor()
model1.fit(X_train,Y_train)

In [18]:
print("Training Accuracy:", model1.score(X_train,Y_train))
print("Testing Accuracy:", model1.score(X_test, Y_test))

Training Accuracy: 0.9956442958815374
Testing Accuracy: 0.9808777091798009


By seeing the Training and Testing Accuracy of the models we can visualize that Gradient Boosting is better model but it overfits a lot, so we will go with RandomForestRegressor.

End