In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, then print each one
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## This project is mainly focussed on Car Price Prediction.
## 1)Introduction
## 2)Loading Data, and importing Libraries.
## 3)Exploratory Data Analysis(EDA)
## 4)Applying Regression Models
## 5)Conclusion

# Introduction

#### CarDekho.com is India's leading car search venture that helps users buy cars that are right for them. Its website and app carry rich automotive content such as expert reviews, detailed specs and prices,  comparisons as well as videos and pictures of all car brands and models available in India. The company has tie-ups with many auto manufacturers, more than 4000 car dealers and numerous financial institutions to facilitate the purchase of vehicles.

#### In this dataset we have Car_Name, Year, Kms_Driven, Selling_Price, Present_Price, Fuel_Type, Seller_Type, Transmission, and Owner.
#### We are going to predict the price of the Cars using Regression Models.
#### Let us extract vital information from data.

### Please do upvote if you like it. 

In [None]:
# Importing the required libraries
import pandas as pd             # Data processing (e.g. read, merge)
import numpy as np              # For mathematical calculations
import seaborn as sns           # For data visualization
import matplotlib.pyplot as plt # For plotting the graphs

In [None]:
## Read the car listings CSV into the raw DataFrame
df = pd.read_csv(filepath_or_buffer='../input/vehicle-dataset-from-cardekho/car data.csv')

In [None]:
df.head(5)  # Preview the first 5 rows of the data

In [None]:
df.info()  # Print column names, non-null counts, dtypes and memory usage

In [None]:
df.shape   # (number of rows, number of columns)

In [None]:
df.nunique()     # Number of distinct values in each column

In [None]:
df.isnull().sum()   # Count of missing values per column

In [None]:
df.describe().T.drop(columns='count')  # Summary statistics, one row per numeric column, without the count

In [None]:
df1 = df.copy(deep=True)          # Work on an independent copy so df keeps the raw data

In [None]:
df1['Car_Name'].nunique(dropna=True)  # Number of distinct car names (98 in this dataset)

In [None]:
# Separate the categorical (object-dtype) and numerical columns.
# The mask is built from df1's own dtypes so it always has the same length as
# df1.columns, even when this cell is re-run after columns were added to or
# dropped from df1 (masking df1.columns with df.dtypes would then fail).
is_categorical = df1.dtypes == 'object'
categorical_columns = df1.columns[is_categorical]
numerical_columns = df1.columns[~is_categorical]
print(numerical_columns)
print(categorical_columns)

In [None]:
# Frequency of each category; the first two tables are followed by an extra blank line
for column, trailer in (('Seller_Type', ('\n',)), ('Transmission', ('\n',)), ('Owner', ())):
    print(df1[column].value_counts(), *trailer)

In [None]:
## The 'Owner' column holds three categories: 0, 1 and 3. Fold the value 3 into category 1.
df1['Owner'] = df1['Owner'].replace({3: 1})


In [None]:
# Re-check the Owner category counts after replacing 3 with 1
print(df1['Owner'].value_counts())

In [None]:
# Price difference between Present_Price and Selling_Price.
# Computed from df1 (the working copy) rather than the raw df, so the new column
# is always derived from the same frame it is stored in.
# NOTE: this column is derived from the target (Selling_Price); using it as a
# model feature together with Present_Price leaks the target.
df1['Price_diff'] = df1['Present_Price'] - df1['Selling_Price']

In [None]:
df1.head(n=5)  # Preview the working copy with the new Price_diff column

#### Data Visualization

In [None]:
# Pairwise scatter plots of the numeric columns of df1
sns.pairplot(data=df1)

In [None]:
# Bar chart of how many cars fall in each Year
plt.figure(figsize=(13, 7))
df1['Year'].value_counts().plot(kind='bar')

In [None]:
# Average Selling_Price for each Year
plt.figure(figsize=(15, 8))
sns.barplot(data=df1, x='Year', y='Selling_Price')

In [None]:
# From the above graph it is observed that the average Selling_Price is low for Year 2007 and high for Year 2018

#### Histograms represent the data distribution by forming bins along the range of the data and then drawing bars to show the number of observations that fall in each bin.

In [None]:
# The simple way is to use for loop to go through all the columns

In [None]:
# Plot a histogram for every numerical column, two panels per row.
n_grid_cols = 2
# Keep the original 4-row layout, but grow the grid when there are more than
# 8 numerical columns (a fixed 4x2 grid would raise for the 9th panel).
n_grid_rows = max(4, (len(numerical_columns) + n_grid_cols - 1) // n_grid_cols)
plt.figure(figsize=(20,20))
for position, column in enumerate(numerical_columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    plt.hist(df1[column], bins=30)
    plt.title(column)  # label each panel so the histograms can be told apart

In [None]:
# Plot the distribution (histogram + KDE curve) of every numerical column.
# sns.distplot and the `bw` KDE keyword are deprecated in seaborn; histplot with
# kde=True, stat='density' and `bw_method` is the supported replacement.
plt.figure(figsize=(20,20))
for position, column in enumerate(numerical_columns, start=1):
    ax = plt.subplot(4, 2, position)
    sns.histplot(df1[column], kde=True, stat='density', kde_kws={'bw_method': 1}, ax=ax)

In [None]:
# Average Selling_Price for each Fuel_Type
plt.figure(figsize=(13, 7))
sns.barplot(data=df1, x='Fuel_Type', y='Selling_Price', palette="gist_rainbow_r")

In [None]:
# From the above graph it is observed that Diesel cars were more expensive than Petrol and CNG

In [None]:
# Average Selling_Price for each Fuel_Type, split by Transmission
plt.figure(figsize=(13, 7))
sns.barplot(data=df1, x='Fuel_Type', y='Selling_Price', hue='Transmission', palette="BuGn")

In [None]:
# From the above graph it is observed that CNG does not have any Transmission i.e., Manual and Automatic
## Fuel_Type Diesel has more Automatic transmission than Petrol

In [None]:
# Average Selling_Price for each Fuel_Type, split by Owner
plt.figure(figsize=(13, 7))
sns.barplot(data=df1, x='Fuel_Type', y='Selling_Price', hue='Owner', palette="nipy_spectral")

In [None]:
# Correlate only the numeric columns: df1 still holds text columns at this point,
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
df1.select_dtypes(include='number').corr()

In [None]:
# There is a correlation between Selling price and Present price. 

In [None]:
plt.figure(figsize=(15,7))
# Correlate only the numeric columns: df1 still holds text columns at this point,
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(df1.select_dtypes(include='number').corr(), annot=True, cmap='Blues')

In [None]:
# Drop the columns not used for modelling. `columns=` replaces the positional
# axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
df1 = df1.drop(columns=['Year', 'Car_Name'])

In [None]:
df1.head(n=5)  # Preview df1 after dropping Year and Car_Name

In [None]:
# One-hot encode the remaining categorical columns, dropping the first level of each
df1 = pd.get_dummies(data=df1, drop_first=True)

In [None]:
df1.head(n=5)  # Preview df1 after one-hot encoding

In [None]:
df1.corr(method='pearson')  # Pairwise Pearson correlation of the encoded columns

In [None]:
# There is a strong correlation between Selling price and Fuel_Type_Diesel.
# There is a correlation between Price diff and Selling Price.


In [None]:
# Annotated heatmap of the correlation matrix of the encoded data
plt.figure(figsize=(13, 7))
sns.heatmap(data=df1.corr(), annot=True)

In [None]:
# Features: every column except the target and Price_diff.
# Price_diff = Present_Price - Selling_Price, so keeping it next to Present_Price
# would let any model reconstruct the target exactly (target leakage).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = df1.drop(columns=['Selling_Price', 'Price_diff'])
# Target: the price the car was sold for
y = df1['Selling_Price']

In [None]:
### Feature Importance

from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
# Fixed seed (same value as the train/test split) so the importances are
# identical on every run of the notebook.
model = ExtraTreesRegressor(random_state=0)
model.fit(X,y)

In [None]:
# One importance score per feature, in the same order as the columns of X
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the five highest feature importances
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
feat_importances.nlargest(n=5).plot.barh()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares linear regression with an intercept term
lr = LinearRegression(fit_intercept=True)

In [None]:
# Fit the linear model on the training split
lr.fit(X=X_train, y=y_train)

In [None]:
# Linear-regression predictions for the held-out rows
pred = lr.predict(X=X_test)

In [None]:
# Actual (x axis) vs predicted (y axis) selling price
plt.scatter(x=y_test, y=pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor

#### A random forest is a meta estimator that fits a number of decision tree regressors on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

In [None]:
# Fixed seed (same value as the train/test split) so the forest, and therefore
# its predictions and error metrics, are identical on every run.
regressor = RandomForestRegressor(random_state=0)

In [None]:
# Fit the random forest on the training split
regressor.fit(X=X_train, y=y_train)

In [None]:
# Random-forest predictions for the held-out rows
pred = regressor.predict(X=X_test)

In [None]:
# Distribution of the residuals (actual - predicted).
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' is the supported replacement.
sns.histplot(y_test - pred, kde=True, stat='density')


In [None]:
# Actual (x axis) vs predicted (y axis) selling price
plt.scatter(x=y_test, y=pred)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fixed seed (same value as the train/test split) so tie-breaking between
# equally good splits, and therefore the fitted tree, is repeatable.
dt = DecisionTreeRegressor(random_state=0)

In [None]:
# Fit the decision tree on the training split
dt.fit(X=X_train, y=y_train)

In [None]:
# Predict with the decision tree that was just fitted (`dt`), not with the
# random forest (`regressor`), so the plot and metrics that follow describe this model.
pred = dt.predict(X_test)

In [None]:
# Actual (x axis) vs predicted (y axis) selling price
plt.scatter(x=y_test, y=pred)

In [None]:
from sklearn import metrics

# Report the error metrics for every fitted model. Predictions are recomputed per
# model here, so the numbers cannot silently refer to whichever model last
# overwrote the shared `pred` variable.
for model_name, fitted_model in (('Linear Regression', lr),
                                 ('Random Forest', regressor),
                                 ('Decision Tree', dt)):
    model_pred = fitted_model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, model_pred)  # computed once, reused for RMSE
    print(model_name)
    print('MAE:', metrics.mean_absolute_error(y_test, model_pred))
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))

#### MSE: the average squared difference between the estimated values and the actual value. MSE is a risk function, corresponding to the expected value of the squared error loss.

#### MAE: mean absolute error is a measure of errors between paired observations expressing the same phenomenon. 

#### RMSE: The root-mean-square deviation or root-mean-square error is a frequently used measure of the differences between values predicted by a model or an estimator and the values observed.

### Don't forget to Vote. Thank you. Please drop your suggestions if any.