In [1]:
# importing the necessary libraries . 
import pandas as pd 
from sklearn.model_selection import train_test_split

### Data Preparation

In [2]:
# importing the data frame 
car_df = pd.read_csv('quikr_car.csv')
car_df.head(5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [3]:
# checking for the null values in the dataset . 

car_df.isna().sum() # <-- This will give the null counts in the column

name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64

In [4]:
# checking the data types of the data

car_df.info() 

""" 
    The observations from the .info()
    1. year is in object .
    2. price is in object .
    3. kms driven also in object . 
"""

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


' \n    The observations from the .info()\n    1. year is in object .\n    2. price is in object .\n    3. kms driven also in object . \n'

In [5]:
car_df.head()
"""
    Observation :
        1. name is too long . fix : spice 0:3 words from string 
"""

'\n    Observation :\n        1. name is too long . fix : spice 0:3 words from string \n'

### Data Cleaning .

In [6]:
# Removing the null from kms_driven col and fuel_type col

new_df = car_df[car_df['kms_driven'].notna()]
new_df = car_df[car_df['fuel_type'].notna()]

car_df = new_df # <-- Updating the original dataframe  
del new_df # <-- free the space 

car_df.isna().sum() # <-- This will give the null counts in the column


name          0
company       0
year          0
Price         0
kms_driven    0
fuel_type     0
dtype: int64

In [7]:
# fixing the data-types .

# 1. fixing year column data type

new_df = car_df # <-- making the copy if the data-frame
print(new_df['year'].unique())

"""
    The column contain some strings . 
"""
new_df = new_df[new_df['year'].str.isdigit()] # <-- filtering all the numeric values
new_df['year'] = new_df['year'].astype('int') # <-- changing the data-type to int
car_df = new_df # <-- updating the dataframe

del new_df # <-- free space

car_df.head(5) # <-- getting top 5 records from the dataframe

['2007' '2006' '2018' '2014' '2015' '2012' '2013' '2016' '2010' '2017'
 '2008' '2011' '2019' '2009' '2005' '2000' '2003' '2004' '1995' '2002'
 '2001']


Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [8]:
# 2 . Fixing the Price Column

def convert_to_numeric(num : str) -> str :
    """
        aim : Function takes the string value and remove ',' from the string . 
        parameter : num(string)
        return : num(string)
    """
    num = num.split(',')
    return "".join(num)

new_df = car_df # <-- make a copy 
# print(new_df['Price'].unique()) # <-- getting all the unique values 

"""
    Observation :
        1. There is a string 'Ask For Price'
        2. There are comma's in the number Values .
"""

new_df = new_df[new_df['Price'] != 'Ask For Price'] # <-- remove string "Ask For Price"
new_df['Price'] = new_df['Price'].apply(convert_to_numeric) # <-- Removing the comman from number
new_df['Price'] = new_df['Price'].astype('int') # <-- Changing the datatype to int
car_df = new_df # <-- Updating the original Dataframe

del new_df # <-- free space 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Price'] = new_df['Price'].apply(convert_to_numeric) # <-- Removing the comman from number
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Price'] = new_df['Price'].astype('int') # <-- Changing the datatype to int


In [9]:
# 3. Fixing the kms_driven Column

def fix_kms_driven(num : str ) -> str :
    """
        aim : Function takes the string value and remove ',' from the string . 
        parameter : num(string)
        return : num(string)
    """
    num = [x if x.isdigit() else '' for x in num]
    return "".join(num)

new_df = car_df
#  print(new_df['kms_driven'].unique()) # <-- getting the unique values
"""
    Observation :
        1. kms substring attack to every row . 
        2. "," is in the string . 
"""
new_df['kms_driven'] = new_df['kms_driven'].apply(fix_kms_driven)
new_df['kms_driven'] = new_df['kms_driven'].astype('int')
car_df = new_df # <-- updating dataframe
del new_df # <-- free space 



In [10]:
# 4. Fixing the car name 

new_df = car_df # <-- copy the dataframe
new_df['name'] = new_df['name'].str.split(" ").str.slice(0,3).str.join(' ') # <-- getting the first three words from the string .
car_df = new_df # <-- updating the original dataframe
del new_df # <-- free space 

# car_df.reset_index(drop=True) # <-- reseting the index and drop the previous index 
car_df.head()


Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [11]:
# outlier detection 

car_df.describe() # <-- getting data overall description 
car_df = car_df[car_df['Price'] < 6e6] # <-- removing the outliers 


# Model Preparation 

In [12]:
car_df.to_csv('clean_data.csv') # <-- exporting the data for the backup file .

In [13]:
x = car_df.drop(columns='Price') # <-- extracting the features columns
y = car_df['Price'] # <-- extracting the target column

# printing the data . 
print(x)
print("-"*50)
print(y)

                       name   company  year  kms_driven fuel_type
0       Hyundai Santro Xing   Hyundai  2007       45000    Petrol
1       Mahindra Jeep CL550  Mahindra  2006          40    Diesel
3         Hyundai Grand i10   Hyundai  2014       28000    Petrol
4    Ford EcoSport Titanium      Ford  2014       36000    Diesel
6                 Ford Figo      Ford  2012       41000    Diesel
..                      ...       ...   ...         ...       ...
883      Maruti Suzuki Ritz    Maruti  2011       50000    Petrol
885          Tata Indica V2      Tata  2009       30000    Diesel
886    Toyota Corolla Altis    Toyota  2009      132000    Petrol
888            Tata Zest XM      Tata  2018       27000    Diesel
889      Mahindra Quanto C8  Mahindra  2013       40000    Diesel

[815 rows x 5 columns]
--------------------------------------------------
0       80000
1      425000
3      325000
4      575000
6      175000
        ...  
883    270000
885    110000
886    300000
888    

In [14]:
# splitting the data into the train and test split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [15]:
# make import for the different model and performance test .  

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer # removing the nan values from the data
import numpy as np 

In [16]:
# apply the one hot encoder on the different columns . 

ohe = OneHotEncoder()
ohe.fit(x[['name','company','fuel_type']])


In [17]:
# apply the column transformer

column_tansformer = make_column_transformer((
    OneHotEncoder(
        categories=ohe.categories_ # <-- passing all the value within the data 
    ),
    ['name','company','fuel_type']
), remainder='passthrough')  # <-- means apply one hot encoder on 'name' ,
# 'company', 'fuel_type' and passthrought the rest of column and SimpleImputer Remove the nan value From the columns 


In [18]:
""" 
    We are having less data for model , picking best fit model on different train test splits .
"""

score = []

for i in range(0,1000):

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
    lr = LinearRegression()
    pipe = make_pipeline(column_tansformer, lr)
    pipe.fit(x_train, y_train)

    y_pred = pipe.predict(x_test)
    score.append(r2_score(y_pred, y_test))



In [19]:
print(score[np.argmax(score)]) # <-- getting the max score and index of the max score

0.8340842588574909


In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=np.argmax(score))
lr = LinearRegression()
pipe = make_pipeline(column_tansformer, lr)
pipe.fit(x_train, y_train)

y_pred = pipe.predict(x_test)
r2_score(y_pred, y_test)

0.8340842588574909

In [21]:
# Now by using pickle we'll dump the entire model . 
import pickle

pickle.dump(pipe, open("LinearRegressionModel.pkl",'wb')) # <-- dumping the entire pipe line using the pickle library . 

In [22]:
data = [
        ["Maruti Suzuki Swift", "Maruti", 2019, 100, 'Petrol']
    ] 

pipe.predict(pd.DataFrame(data, columns=['name','company','year','kms_driven','fuel_type']))

array([438697.15090741])