In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

sns.set()
warnings.filterwarnings("ignore")

In [2]:
# Reading the dataset
data = pd.read_csv("/content/Data_Test.xlsx - Sheet1.csv")

In [3]:
# Checking for the shape of the data
data.shape

(1234, 12)

In [4]:
# Understand the data
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Mercedes-Benz SLC 43 AMG,Coimbatore,2018,18338,Petrol,Automatic,First,19.0 kmpl,2996 CC,362.07 bhp,2.0,1.06 Cr
1,BMW 7 Series 730Ld DPE Signature,Chennai,2017,16000,Diesel,Automatic,First,16.77 kmpl,2993 CC,261.49 bhp,5.0,1.58 Cr
2,Volkswagen Ameo 1.5 TDI Highline,Coimbatore,2017,23389,Diesel,Manual,First,21.66 kmpl,1498 CC,108.62 bhp,5.0,10.13 Lakh
3,Volkswagen Vento 1.6 Trendline,Mumbai,2011,34948,Petrol,Manual,First,16.09 kmpl,1598 CC,103.5 bhp,5.0,10.36 Lakh
4,Mahindra Bolero Power Plus ZLX,Coimbatore,2018,19764,Diesel,Manual,First,16.5 kmpl,1493 CC,70 bhp,7.0,10.51 Lakh


In [5]:
# Check for null values
data.isna().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 0
Engine                 10
Power                  10
Seats                  11
New_Price            1052
dtype: int64

In [6]:
# Take actions on null values
data.drop(columns=['New_Price'], inplace=True)
data.dropna(inplace=True)

**Extract Numeric Values** from Mileage, Engine & Power Colns

In [None]:
def extract_numeric_value(value: str) -> float:
    try:
        strRep = str(value)
        floatRep = ""
        for char in strRep:
            if not char.isalpha() and not char.isspace() and char != '/':
                floatRep += char

        return float(floatRep)
    except:
        return None

In [None]:
X['Mileage'] = X['Mileage'].apply(extract_numeric_value)

In [None]:
X['Mileage'].head(3)

0    26.60
1    19.67
2    18.20
Name: Mileage, dtype: float64

In [None]:
X['Engine'] = X['Engine'].apply(extract_numeric_value)

In [None]:
X['Engine'].head(3)

0     998.0
1    1582.0
2    1199.0
Name: Engine, dtype: float64

In [None]:
X['Power'] = X['Power'].apply(extract_numeric_value)

In [None]:
X['Power'].head(3)

0     58.16
1    126.20
2     88.70
Name: Power, dtype: float64

In [None]:
X.isna().sum()

Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                0
Engine                 0
Power                103
Seats                  0
dtype: int64

In [None]:
# filling the null values in `power` coln
X['Power'] = X['Power'].fillna(X['Power'].median())
X.isna().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
dtype: int64

In [None]:
# filling the null values in `engine` coln
X['Engine'] = X['Engine'].fillna(X['Engine'].median())
X.isna().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
dtype: int64

In [None]:
# filling the null values in `mileage` coln
X['Mileage'] = X['Mileage'].fillna(X['Mileage'].median())
X.isna().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
dtype: int64

###**Time to build Models**

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, r2_score

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4780, 10), (1195, 10), (4780,), (1195,))

In [None]:
# Creating ColumnTransformer to encode and scale values of data
clf1 = ColumnTransformer([
    ('encode', OneHotEncoder(drop="first", sparse_output=True, handle_unknown="ignore"), ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type']),
    ('scaling', StandardScaler(), ['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats'])
], remainder="passthrough")

Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Using LinearRegression
clf2 = LinearRegression()

In [None]:
pipe = Pipeline([
    ('ColumnTransformer', clf1),
    ('Model', clf2)
])

In [None]:
pipe.fit(X, y)

In [None]:
pipe.named_steps

{'ColumnTransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('encode',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore'),
                                  ['Location', 'Fuel_Type', 'Transmission',
                                   'Owner_Type']),
                                 ('scaling', StandardScaler(),
                                  ['Year', 'Kilometers_Driven', 'Mileage',
                                   'Engine', 'Power', 'Seats'])]),
 'Model': LinearRegression()}

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
print(f"The r2_score by LinearRegression Model is {r2_score(y_test, y_pred)}")
print(f"The Mean Squared Error by LinearRegression Model is {mse(y_test, y_pred)}")

The r2_score by LinearRegression Model is 0.6415590180270132
The Mean Squared Error by LinearRegression Model is 52.180657998160285


**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor

clf3 = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
pipe2 = Pipeline([
    ('clf1', clf1),
    ('clf3', clf3)
])

In [None]:
pipe2.fit(X_train, y_train)

In [None]:
y_pred = pipe2.predict(X_test)

In [None]:
print(f"The r2_score by Random Forest Regressor Model is {r2_score(y_test, y_pred)}")
print(f"The Mean Squared Error by Random Forest Regressor Model is {mse(y_test, y_pred)}")

The r2_score by Random Forest Regressor Model is 0.8403879723043517
The Mean Squared Error by Random Forest Regressor Model is 23.235793473546472


**SGDReressor**

In [None]:
from sklearn.linear_model import SGDRegressor

clf4 = SGDRegressor(max_iter=500, random_state=42)

In [None]:
pipe3 = Pipeline([
    ('clf1', clf1),
    ('clf4', clf4)
])

In [None]:
pipe3.fit(X_train, y_train)

In [None]:
y_pred = pipe3.predict(X_test)

In [None]:
print(f"The r2_score by SGDRegressor Model is {r2_score(y_test, y_pred)}")
print(f"The Mean Squared Error by SGDRegressor Model is {mse(y_test, y_pred)}")

The r2_score by SGDRegressor Model is 0.42923550588988835
The Mean Squared Error by SGDRegressor Model is 83.09001582552642
