In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.simplefilter('ignore')

In [2]:
# Loading data
df = pd.read_csv('/content/car data.csv')

In [3]:
# Preview data
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# Checking dimension of datatse
print(f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns")

Dataset has 301 rows and 9 columns


In [5]:
# Checking basic information of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
# Categorical features
cat_features = [cat for cat in df.columns if df[cat].dtype == 'O']
print(f"The dataset has total {len(cat_features)} categorical attributes")

The dataset has total 4 categorical attributes


In [7]:
# Numerical features
num_features = [num for num in df.columns if df[num].dtype != 'O']
print(f"The dataset has total {len(num_features)} numerical attributes")

The dataset has total 5 numerical attributes


In [8]:
# Descriptive statistic analysis
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,301.0,2013.627907,2.891554,2003.0,2012.0,2014.0,2016.0,2018.0
Selling_Price,301.0,4.661296,5.082812,0.1,0.9,3.6,6.0,35.0
Present_Price,301.0,7.628472,8.642584,0.32,1.2,6.4,9.9,92.6
Driven_kms,301.0,36947.20598,38886.883882,500.0,15000.0,32000.0,48767.0,500000.0
Owner,301.0,0.043189,0.247915,0.0,0.0,0.0,0.0,3.0


In [9]:
# Checking fro missing values in dataset
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

In [10]:
# Checking for duplicate values
df[df.duplicated()].sum()

Car_Name          ertigafortuner
Year                        4031
Selling_Price              30.75
Present_Price               41.4
Driven_kms                 83000
Fuel_Type           DieselDiesel
Selling_type        DealerDealer
Transmission     ManualAutomatic
Owner                          0
dtype: object

In [11]:
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [15]:
# Checking for unique car company
df["CompanyName"].unique()

KeyError: ignored

In [14]:
# Creating function to replace value in carname attributes
def replace(a,b):
    df['CompanyName'].replace(a,b,inplace=True)
replace('maxda','mazda')
replace('porcshce','porsche')
replace('toyouta','toyota')
replace('vokswagen','volkswagen')
replace('vw','volkswagen')

KeyError: ignored

In [None]:
# Checking for unique car company
df["CompanyName"].unique()

In [None]:
# Checking for outliers
plt.figure(figsize=(15,10))
sns.boxplot(df,palette="Set2",width=0.8,fliersize=10)
plt.title('Boxplot of features', pad=10, fontweight="black", fontsize=20)
plt.xticks(rotation=45)
plt.show()

In [None]:
z = round(df.groupby(["CompanyName"])["price"].agg(["mean"]),2).T
z

In [None]:
df = df.merge(z.T,how="left",on="CompanyName")

In [None]:
bins = [0,10000,20000,40000]
cars_bin=['Budget','Medium','Highend']
df['CarsRange'] = pd.cut(df['mean'],bins,right=False,labels=cars_bin)
df.head()

In [None]:
new_df = df[['fueltype','aspiration','doornumber','carbody','drivewheel','enginetype','cylindernumber','fuelsystem'
             ,'wheelbase','carlength','carwidth','curbweight','enginesize','boreratio','horsepower','citympg','highwaympg',
             'price','CarsRange']]

In [None]:
new_df

In [None]:
# Creating Dummies Variables for all the Categorical Features.
new_df = pd.get_dummies(columns=["fueltype","aspiration","doornumber","carbody","drivewheel","enginetype",
                                "cylindernumber","fuelsystem","CarsRange"],data=new_df)

In [None]:
new_df.head()

In [None]:
# As we know that our dataset has some outliers so we do scaling of numerical data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_cols = ['wheelbase','carlength','carwidth','curbweight','enginesize','boreratio','horsepower',
            'citympg','highwaympg']

new_df[num_cols] = scaler.fit_transform(new_df[num_cols])

In [None]:
new_df.head()

In [None]:
# Segregating data into independent and dependent features
X = new_df.drop(columns=["price"])
y = new_df["price"]

In [None]:
# Train test and split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# model training
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor()

In [None]:
random_forest.fit(X_train,y_train)

In [None]:
y_pred = random_forest.predict(X_test)

In [None]:
# Model Evaluation
from sklearn.metrics import r2_score
print('R2 score: ',r2_score(y_test,y_pred)*100)

In [None]:
# Model Testing
new_data = X_test.loc[100].values

In [None]:
random_forest.predict([new_data])[0]