
This dataset contains information about used cars.
It can be used for many purposes, such as price prediction; this notebook uses it to illustrate linear regression in machine learning.
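The columns in the given dataset are as follows (note: the CSV loaded below uses capitalised variants of these names, e.g. `Car_Name`, `Selling_Price`, `Fuel_Type`, `Seller_Type` and `Transmission`):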

* name
* year
* selling_price
* km_driven
* fuel
* seller_type
* transmission
* Owner

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics

# Load Data

In [None]:
# Read the used-car listings into a DataFrame. The relative path follows the
# Kaggle "../input/<dataset>/" layout -- presumably run in a Kaggle kernel.
df = pd.read_csv(filepath_or_buffer="../input/vehicle-dataset-from-cardekho/car data.csv")

# Data PreProcessing

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Distribution of the categorical features: number of rows per fuel type.
df['Fuel_Type'].value_counts()

In [None]:
# Number of rows per seller type.
df['Seller_Type'].value_counts()

In [None]:
# Number of rows per transmission type.
df['Transmission'].value_counts()

# Convert Categorical to Numeric (Encoding)

In [None]:
# Encode "Fuel_Type" as integer codes, modifying df in place. Values that are
# not keys of the mapping are left untouched by replace().
# NOTE(review): codes 0/1/2 impose an artificial order on fuel types for a
# linear model; one-hot encoding may be preferable -- confirm intent.
fuel_type_codes = {'Petrol': 0, 'Diesel': 1, 'CNG': 2}
df.replace({'Fuel_Type': fuel_type_codes}, inplace=True)

In [None]:
# Preview the first five rows again to inspect the encoded "Fuel_Type" column.
df.head(n=5)

In [None]:
# Encode "Seller_Type" as integer codes, modifying df in place. Values that
# are not keys of the mapping are left untouched by replace().
seller_type_codes = {'Dealer': 0, 'Individual': 1}
df.replace({'Seller_Type': seller_type_codes}, inplace=True)

In [None]:
# Encode "Transmission" as integer codes, modifying df in place. Values that
# are not keys of the mapping are left untouched by replace().
transmission_codes = {'Manual': 0, 'Automatic': 1}
df.replace({'Transmission': transmission_codes}, inplace=True)

In [None]:
# Preview the first five rows to inspect the encoded categorical columns.
df.head(n=5)

In [None]:
# Separate the target from the predictors. 'Car_Name' is dropped along with
# the target so that it is not passed to the models as a feature.
y = df['Selling_Price']
X = df.drop(columns=['Car_Name', 'Selling_Price'])

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=99,
)

# Model
## 1. Linear Regression

In [None]:
# Linear regression estimator created with scikit-learn's default settings.
model_lr = LinearRegression()

In [None]:
# Fit the linear model on the training split only.
model_lr.fit(X=X_train, y=y_train)

# Model Evaluation

In [None]:
# Predicted selling prices for the validation rows; reused by the plots below.
pred_lr = model_lr.predict(X=X_valid)

In [None]:
# R^2 (coefficient of determination) of the linear model on each split.
# Despite the "error_score" variable names, this is a goodness-of-fit score:
# higher is better.
error_score_train = metrics.r2_score(y_true=y_train, y_pred=model_lr.predict(X_train))
error_score_valid = metrics.r2_score(y_true=y_valid, y_pred=model_lr.predict(X_valid))

# Last expression of the cell, so the notebook displays (train, valid).
error_score_train, error_score_valid

# Visualize actual price and predicted price

In [None]:
# Validation targets (x-axis) against the linear model's predictions (y-axis).
plt.scatter(x=y_valid, y=pred_lr)
plt.title("Actual Vs Predicted Car Price")
plt.ylabel("Predicted Price")
plt.xlabel("Actual Price")
plt.show()

In [None]:
# Overlay validation and training predictions of the linear model on one set
# of axes so the fit on both splits can be compared visually.
plt.scatter(y_valid, pred_lr, label='Valid')
plt.scatter(y_train, model_lr.predict(X_train), label='Train')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual Vs Predicted Car Price")
# The label= arguments above are only drawn once legend() is called; without
# it the two series cannot be told apart.
plt.legend()
plt.show()

## 2. LASSO Regression

In [None]:
# Lasso (L1-regularised linear regression) estimator created with
# scikit-learn's default hyperparameters.
model_lasso = Lasso()

In [None]:
# Fit the Lasso model on the same training split used for linear regression.
model_lasso.fit(X=X_train, y=y_train)

In [None]:
# R^2 (coefficient of determination) of the Lasso model; higher is better.
# Printed with labels and train-first, matching the (train, valid) order used
# for the linear-regression scores, so the two numbers cannot be mixed up.
print(f"Lasso R^2 (train): {metrics.r2_score(y_train, model_lasso.predict(X_train))}")
print(f"Lasso R^2 (valid): {metrics.r2_score(y_valid, model_lasso.predict(X_valid))}")

In [None]:
# Validation targets (x-axis) against the Lasso model's predictions (y-axis).
lasso_pred_valid = model_lasso.predict(X_valid)
plt.scatter(x=y_valid, y=lasso_pred_valid)
plt.title("Actual Vs Predicted Car Price")
plt.ylabel("Predicted Price")
plt.xlabel("Actual Price")
plt.show()