In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Getting Started**

In [None]:
#Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import metrics

In [None]:
# Read the car listing CSV into a DataFrame and preview its first rows.
DATA_PATH = "/kaggle/input/vehicle-dataset-from-cardekho/car data.csv"
data = pd.read_csv(DATA_PATH)
data.head()

# **Exploratory data analysis**

In [None]:
# Print the frame's dimensions as (rows, columns), then the dtype of every column.
for summary in (data.shape, data.dtypes):
    print(summary)

In [None]:
# Info of data: DataFrame.info() prints the index range, each column's
# non-null count and dtype, and the frame's memory usage.
data.info()

In [None]:
# value_counts: number of rows for each distinct value of the "Owner" column,
# listed from the most to the least frequent value.
data["Owner"].value_counts()

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Print how many rows fall into each category of the three categorical columns.
for column in ("Fuel_Type", "Seller_Type", "Transmission"):
    print(data[column].value_counts())

# **Data Transformation**

In [None]:
# Encode the three categorical columns as integer codes with one replace() call.
# The nested mapping has the form {column: {category: code}}; any value that is
# not listed in a column's mapping is left untouched.
category_codes = {
    'Fuel_Type': {'Petrol': 0, 'Diesel': 1, 'CNG': 2},
    'Seller_Type': {'Dealer': 0, 'Individual': 1},
    'Transmission': {'Manual': 0, 'Automatic': 1},
}
data = data.replace(category_codes)

In [None]:
# Preview the first rows of the frame after the encoding step.
data.head()

# **Visualization for dataset**

In [None]:
# Distribution of the regression target: histogram of Selling_Price.
# Selling_Price is a numeric target, so a count plot would draw one bar per
# distinct price; a binned histogram shows the shape of the distribution instead.
# displot is figure-level (returns a FacetGrid), like the catplot it replaces.
sns.displot(data=data, x='Selling_Price', kind='hist')

In [None]:
# Mean Selling_Price for each value of Owner.
# Owner is the grouping variable and goes on the x-axis; with the axes the other
# way round seaborn treats every distinct Selling_Price as its own category.
plot = plt.figure(figsize=(5,5))
sns.barplot(x='Owner', y='Selling_Price', data=data)
plt.title("Mean Selling_Price by Owner")
plt.show()

In [None]:
# describe data: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the frame.
data.describe()

# **Checking for outliers**

In [None]:
# Checking for outliers: box plots of the frame's columns on one shared axis.
fig, ax = plt.subplots(figsize=(15, 10))
box_style = {"width": 0.5, "fliersize": 3}
sns.boxplot(data=data, ax=ax, **box_style)
plt.show()

In [None]:
# Removing outliers: use the 95th percentile of Kms_Driven as the cut-off and
# keep only the rows strictly below it (rows equal to the cut-off are dropped too).
# NOTE: this rebinds `data`, so re-running the cell trims the frame further.
outlier = data['Kms_Driven'].quantile(0.95)
kms_below_cutoff = data['Kms_Driven'].lt(outlier)
data = data[kms_below_cutoff]

In [None]:
# Redraw the box plots after the Kms_Driven filter to see which outliers remain.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(ax=ax, data=data, fliersize=3, width=0.5)
plt.show()

# **Model Preparation**

In [None]:
# Target: Selling_Price. Features: every column except the target and Car_Name.
y = data['Selling_Price']
X = data.drop(columns=['Car_Name', 'Selling_Price'])

In [None]:
# Print the feature-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep="\n")

In [None]:
# Print the feature frame, then the target series (pandas truncates long output).
for part in (X, y):
    print(part)

In [None]:
# train_test_split: hold out 20% of the rows for testing; random_state=42 makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shape of each of the four resulting pieces.
for message, part in (
    ("The shape of X_train is", X_train),
    ("The shape of X_test is", X_test),
    ("The shape of y_train is", y_train),
    ("The shape of y_test is", y_test),
):
    print(message, part.shape)

# **Model Training**

We train several different models; after evaluating each one, we will select the best model for production.

1.   Linear Regression
2.   Polynomial Regression
3.   Random Forest Regressor
4.   XgBoost
5.   SVM
6.   Lasso Regression

# **Linear Regression**

In [None]:
# Create a linear regression model and fit it on the training split.
# fit() returns the estimator itself, which is shown as the cell output.
lin_reg_model = LinearRegression().fit(X_train, y_train)
lin_reg_model

# **Model Evaluation of Linear Regression**

### **Training Data**

In [None]:
# Predict on the training split and score the predictions with R^2.
# NOTE: despite the printed label, r2_score is a goodness-of-fit score
# (higher is better, 1.0 is a perfect fit), not an error.
training_data_prediction = lin_reg_model.predict(X_train)
error_score = metrics.r2_score(y_true=y_train, y_pred=training_data_prediction)
print("R squared Error of train : ", error_score)

In [None]:
# Scatter of actual vs predicted prices on the training set.
scatter_ax = plt.gca()
scatter_ax.scatter(y_train, training_data_prediction)
scatter_ax.set_xlabel("Actual Price")
scatter_ax.set_ylabel("Predicted Price")
scatter_ax.set_title(" Actual Prices vs Predicted Prices")
plt.show()

In [None]:
# Checking normality of errors: histogram with a KDE overlay of the training
# residuals (actual minus predicted).
# sns.distplot is deprecated; histplot(kde=True) is its replacement, and its
# y-axis shows counts, which matches the "Frequency" label set below.
train_residuals = y_train - training_data_prediction
sns.histplot(train_residuals, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

### Here the residuals appear approximately normally distributed, so the normality assumption of linear regression is reasonably satisfied.

## **Test Data**

In [None]:
# Predict on the held-out test split and score the predictions with R^2.
# NOTE: despite the printed label, r2_score is a score (higher is better), not an error.
test_data_prediction = lin_reg_model.predict(X_test)
error_score = metrics.r2_score(y_true=y_test, y_pred=test_data_prediction)
print("R squared Error Testing Data : ", error_score)

In [None]:
# Scatter of actual vs predicted prices on the test set.
plt.scatter(x=y_test, y=test_data_prediction)
plt.gca().set(
    xlabel="Actual Price",
    ylabel="Predicted Price",
    title=" Actual Prices vs Predicted Prices",
)
plt.show()

In [None]:
# Checking normality of errors: histogram with a KDE overlay of the test
# residuals (actual minus predicted).
# sns.distplot is deprecated; histplot(kde=True) is its replacement, and its
# y-axis shows counts, which matches the "Frequency" label set below.
test_residuals = y_test - test_data_prediction
sns.histplot(test_residuals, kde=True)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

# **Polynomial Regression**

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features with all degree-2 polynomial and interaction terms.
# The transformer is fitted on X_train only. It must not be re-fitted on x_poly:
# that would make it expect the expanded feature count as input, and a later
# poly.transform() on data shaped like X_train (e.g. X_test) would fail.
poly = PolynomialFeatures(degree = 2)
x_poly = poly.fit_transform(X_train)
# Fit an ordinary linear regression on the expanded features.
model_polynomial = LinearRegression()
model_polynomial.fit(x_poly,y_train)

# **Model Evaluation of Polynomial Regression**

### **Training Data**

In [None]:
# Score the polynomial model on the training split with R^2.
# NOTE(review): fit_transform re-fits `poly` on X_train before transforming it;
# this leaves `poly` fitted on the raw feature columns.
train_features_poly = poly.fit_transform(X_train)
x_traing_poly_data = model_polynomial.predict(train_features_poly)
error_score_poly_train = metrics.r2_score(y_true=y_train, y_pred=x_traing_poly_data)
print("R squared Error Traning Data : ", error_score_poly_train)

# **Random Forest Regressor**

In [None]:
# Import Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest regressor on the training split.
# Fixing random_state makes the forest's random choices, and therefore every
# metric computed from this model, reproducible across runs; 42 is the same
# seed the notebook passes to train_test_split.
model_reg = RandomForestRegressor(random_state=42)
model_reg.fit(X_train, y_train)

# **Model Evaluation of Random Forest Regression**
## **Training Data**

In [None]:
# Evaluate the random forest on the training split.
y_random_forest_pred = model_reg.predict(X_train)
train_r2 = metrics.r2_score(y_train, y_random_forest_pred)
train_mse = metrics.mean_squared_error(y_train, y_random_forest_pred)
# Adjusted R^2 penalises R^2 for the number of features relative to the row count.
n_rows, n_features = len(y_train), X_train.shape[1]
print('R^2:', train_r2)
print('Adjusted R^2:', 1 - (1 - train_r2) * (n_rows - 1) / (n_rows - n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_random_forest_pred))
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))

In [None]:
# Scatter of actual training prices against the random forest's predictions.
rf_train_ax = plt.gca()
rf_train_ax.scatter(y_train, y_random_forest_pred)
rf_train_ax.set(
    xlabel="Prices",
    ylabel="Predicted prices",
    title="Prices vs Predicted prices",
)
plt.show()

In [None]:
# Evaluate the random forest on the held-out test split.
y_test_pred = model_reg.predict(X_test)
acc_rf = metrics.r2_score(y_test, y_test_pred)
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
# Adjusted R^2 penalises R^2 for the number of features relative to the row count.
n_test_rows, n_test_features = len(y_test), X_test.shape[1]
print('R^2:', acc_rf)
print('Adjusted R^2:', 1 - (1 - acc_rf) * (n_test_rows - 1) / (n_test_rows - n_test_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

In [None]:
# Scatter of actual test prices against the random forest's predictions.
plt.scatter(x=y_test, y=y_test_pred)
rf_test_ax = plt.gca()
rf_test_ax.set_xlabel("Prices")
rf_test_ax.set_ylabel("Predicted prices")
rf_test_ax.set_title("Prices vs Predicted prices")
plt.show()

# **Lasso Regression**

In [None]:
# Create a Lasso (L1-regularised linear) regression model with default settings
# and fit it on the training split. fit() returns the estimator itself, which is
# shown as the cell output.
lass_reg_model = Lasso().fit(X_train, y_train)
lass_reg_model

In [None]:
# Predict on the training split with the Lasso model and score with R^2.
# NOTE: this overwrites training_data_prediction and error_score from the
# linear regression section; the printed value is a score, not an error.
training_data_prediction = lass_reg_model.predict(X_train)
error_score = metrics.r2_score(y_true=y_train, y_pred=training_data_prediction)
print("R squared Error : ", error_score)

In [None]:
# Scatter of actual vs predicted prices on the training set.
lasso_train_ax = plt.gca()
lasso_train_ax.scatter(y_train, training_data_prediction)
lasso_train_ax.set_xlabel("Actual Price")
lasso_train_ax.set_ylabel("Predicted Price")
lasso_train_ax.set_title(" Actual Prices vs Predicted Prices")
plt.show()

In [None]:
# Predict on the held-out test split with the Lasso model and score with R^2.
# NOTE: this overwrites test_data_prediction and error_score from the
# linear regression section; the printed value is a score, not an error.
test_data_prediction = lass_reg_model.predict(X_test)
error_score = metrics.r2_score(y_true=y_test, y_pred=test_data_prediction)
print("R squared Error : ", error_score)

In [None]:
# Scatter of actual vs predicted prices on the test set.
plt.scatter(x=y_test, y=test_data_prediction)
plt.gca().set(
    xlabel="Actual Price",
    ylabel="Predicted Price",
    title=" Actual Prices vs Predicted Prices",
)
plt.show()