#### Importing dependencies

In [125]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import metrics

#### Data Collection and Processing

In [126]:
car_data = pd.read_csv("C:/Users/USER/Desktop/Datasets/cars_data.csv")

# renaming column names
car_data = car_data.rename(columns={
    'fuel': 'fuel_type',
    'transmission': 'transmission_type',
    'owner': 'ownership_type'
})

car_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel_type,seller_type,transmission_type,ownership_type
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [127]:
#Checking the number of rows and columns
car_data.shape

(4340, 8)

In [128]:
#Checking for missing values and datatypes
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               4340 non-null   object
 1   year               4340 non-null   int64 
 2   selling_price      4340 non-null   int64 
 3   km_driven          4340 non-null   int64 
 4   fuel_type          4340 non-null   object
 5   seller_type        4340 non-null   object
 6   transmission_type  4340 non-null   object
 7   ownership_type     4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


##### Checking categories distribution

In [129]:
print("",car_data.fuel_type.value_counts())

 fuel_type
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64


In [130]:
print("",car_data.seller_type.value_counts())

 seller_type
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: count, dtype: int64


In [131]:
print("",car_data.transmission_type.value_counts())

 transmission_type
Manual       3892
Automatic     448
Name: count, dtype: int64


In [132]:
print("",car_data.ownership_type.value_counts())

 ownership_type
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64


##### Encoding the Categorical data 

In [133]:
# encoding the fuel type 
car_data.replace(
    {'fuel_type': 
         {'Diesel':0,
          'Petrol':1,
          'CNG':2,
          'LPG':3,
          'Electric':4
     }
    }, inplace=True)

# encoding the seller type 
car_data.replace(
    {'seller_type': 
         {'Individual':0, 
          'Dealer':1, 
          'Trustmark Dealer':2
     }
    }, inplace=True)

# encoding the transmission type 
car_data.replace(
    {'transmission_type':
        {'Manual':0,
         'Automatic':1
      }
    }, inplace=True)

# encoding the transmission type 
car_data.replace({
    'ownership_type': {
        'First Owner': 0,
        'Second Owner': 1,
        'Third Owner': 2,
        'Fourth & Above Owner': 3,
        'Test Drive Car': 4
    }
}, inplace=True)


# Scale down selling_price by dividing by 10,000 (e.g., from dollars to units of $1000)
car_data['selling_price'] = car_data['selling_price'] / 10000
# Ensure the column is of float type for modeling precision
car_data['selling_price'] = car_data['selling_price'].astype(float)

# Explicitly convert to integer
car_data['fuel_type'] = car_data['fuel_type'].astype('int64')
car_data['seller_type'] = car_data['seller_type'].astype('int64') 
car_data['transmission_type'] = car_data['transmission_type'].astype('int64')
car_data['transmission_type'] = car_data['ownership_type'].astype('int64')

In [134]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               4340 non-null   object 
 1   year               4340 non-null   int64  
 2   selling_price      4340 non-null   float64
 3   km_driven          4340 non-null   int64  
 4   fuel_type          4340 non-null   int64  
 5   seller_type        4340 non-null   int64  
 6   transmission_type  4340 non-null   int64  
 7   ownership_type     4340 non-null   object 
dtypes: float64(1), int64(5), object(2)
memory usage: 271.4+ KB


In [135]:
car_data.head()
print(car_data['selling_price'].min())

2.0


#### Splitting the data and label

In [136]:
X = car_data.drop(['name', 'selling_price'], axis=1)
Y = car_data['selling_price']

#### Splitting the data into Training and Test data

In [137]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=13)

####  1. Linear Regression

In [138]:
lin_reg_model = LinearRegression()

In [139]:
lin_reg_model.fit(X_train, Y_train)

#### Model Evaluation

In [140]:
# Prediction on training data 
training_data_prediction = lin_reg_model.predict(X_train)

In [141]:
# R2 error
R2_score = metrics.r2_score(Y_train, training_data_prediction)
# RMSE error
RMSE_score = metrics.root_mean_squared_error(Y_train, training_data_prediction)

In [142]:
print('R squared error :', R2_score)
print('Root mean squared error :', RMSE_score)

R squared error : 0.25009436017004993
Root mean squared error : 50.737492381486106
