In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # Setting a default seaborn setting in plots
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _, files in os.walk('/kaggle/input'):
    # Print the full path of every file found under the input directory
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**1. Introduction: (#1)**

**2. Importing the Dataset and Explanation of Features: (#2)**

**3. Visualization of features for better understanding: (#3)**

**4. Build ML Regression Models: (#4)**

**5. CONCLUSION: (#5)**

**1) Description** of Data Set:

This dataset contains information about used cars listed on www.cardekho.com
This data can be used for a lot of purposes such as price prediction to exemplify the use of linear regression in Machine Learning.
The columns in the given dataset are as follows:

1. name : Car_Name(name of the car)
2. year : Year(built year)
3. selling_price : Selling_Price(Current selling price of car; this is the dependent variable)
4. Present_Price : Present_Price(Current price of car)
5. km_driven : Kms_Driven(Total km driven)
6. fuel  : Fuel_Type(Fuel Type)
7. seller_type : Seller_Type(Type of seller)
8. transmission : Transmission
9. Owner : Owner(one of the input features, not the dependent variable)

						

# 2) Importing the Dataset

In [None]:
# Load the CarDekho used-car listings and preview the first rows
csv_path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Shape of the overall dataset as a (rows, columns) tuple
df.shape

In [None]:
# Summary of the dataset: column names, non-null counts and dtypes

df.info()

In [None]:
# Checking whether the dataset contains any null values
# (.mean() of the boolean mask gives the fraction of nulls per column; 0.0 means none)

df.isnull().mean()

# Statistical Summary

In [None]:

# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

In [None]:
# Summary statistics restricted to the object (string) columns
df.describe(include='object')

**We can see that Fuel_Type has three categories, while Seller_Type and Transmission have two categories each.**

In [None]:
# Print the distinct values of each low-cardinality column, one column per line
for column in ('Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'):
    print(df[column].unique())

# 3) Visualization of features for better understanding

In [None]:
# Column names available for plotting
df.columns

In [None]:
# Distribution plot of Year
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot — confirm the installed seaborn version before migrating.

sns.distplot(df.Year)

In [None]:
# Histogram of each numeric feature (frequency counts: a rough view of each distribution, not a normalized PDF)
# NOTE(review): DataFrame.hist returns the Axes, not a Figure, so the name `fig` is a misnomer.
fig = df.hist(figsize=(18,18))

# **1) Seller Type**

In [None]:
# Mean Selling_Price per Seller_Type.
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.barplot(x='Seller_Type', y='Selling_Price', data=df, palette='twilight')

**Selling Price of cars seems to have higher prices when sold by Dealers when compared to Individuals.**


# 2) Fuel Type # 

In [None]:
# Mean Selling_Price per Fuel_Type.
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.barplot(x='Fuel_Type', y='Selling_Price', data=df, palette='twilight')

**Selling Price of cars with Fuel Type of Diesel is higher than Petrol and CNG.**

# 3) Present_Price


In [None]:
# Scatter plus fitted regression line of Present_Price (y) against Selling_Price (x).
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.regplot(x='Selling_Price', y='Present_Price', data=df)

**Here we can see that Selling Price tends to increase with increase in the Present Price of cars.**

# 4) Kms Driven

In [None]:
# Scatter plus fitted regression line of Kms_Driven (y) against Selling_Price (x).
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.regplot(x='Selling_Price', y='Kms_Driven', data=df)

# 5) Transmission

In [None]:
# Mean Selling_Price per Transmission type.
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.barplot(x='Transmission', y='Selling_Price', data=df, palette='spring')

**We can observe that the Selling Price is higher for cars with Automatic transmission.**


# 6) Owner

In [None]:
# Mean Selling_Price per Owner value.
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.barplot(x='Owner', y='Selling_Price', data=df, palette='ocean')

**We can observe that cars with fewer previous owners have a higher Selling Price.**

In [None]:
# Box plot of every numeric column on one explicitly created 15x8 axes
box_fig, box_ax = plt.subplots(figsize=(15, 8))
df.boxplot(ax=box_ax)
plt.show()

In [None]:
# Selling_Price trend over build Year.
plt.figure(figsize=(10,5))
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.lineplot(x=df['Year'], y=df['Selling_Price'])

# Plot categorical features

In [None]:
# Creating a function to make a countplot

def plot_categorical(feature , dataset):
    """Show a horizontal count plot of one categorical column.

    Parameters
    ----------
    feature : str
        Name of the column in ``dataset`` to count.
    dataset : DataFrame
        Frame holding the column.

    Each bar is annotated with its share of all rows as a percentage
    (one decimal place). The figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    axes = sns.countplot(y=feature, data=dataset)
    plt.title('Distribution of ' + feature)
    plt.xlabel('Count')

    n_rows = len(dataset[feature])
    for bar in axes.patches:
        # For a horizontal bar the width is the count for that category
        bar_length = bar.get_width()
        label = f'{100 * bar_length/n_rows:.1f}%'
        # Place the label just past the end of the bar, vertically centred on it
        anchor = (bar.get_x() + bar_length + 0.02, bar.get_y() + bar.get_height()/2)
        axes.annotate(label, anchor)

    plt.show()

In [None]:
# Share of each fuel type among the listed cars
plot_categorical('Fuel_Type' , df)

**79.4% of the cars use petrol as their fuel type.**

In [None]:
# Share of each seller type among the listed cars
plot_categorical('Seller_Type' , df)

**64.8% of the cars are sold by dealers.**

In [None]:
# Share of each transmission type among the listed cars
plot_categorical('Transmission' , df)

In [None]:
# Selection of our final dataset

# .copy() makes final_set an independent frame, so the column assignments and drops
# in later cells neither trigger SettingWithCopyWarning nor depend on df.
final_set=df[[ 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven','Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

# We have dropped Car_Name from the dataset

In [None]:
# It's important to know how many years old the car is.

# assign() returns a new frame instead of writing into what may be a slice of df,
# which avoids SettingWithCopyWarning.
final_set = final_set.assign(Current_Year=2020)  # reference year used to compute the car's age

final_set = final_set.assign(No_of_total_years=final_set['Current_Year'] - final_set['Year'])

final_set.head()

In [None]:
# It's time to drop the Year column after the needed info is derived.
# Rebinding (instead of inplace=True) avoids mutating a possible slice of df in place.

final_set = final_set.drop(['Year','Current_Year'], axis=1)

In [None]:
# Mean Selling_Price per car age (No_of_total_years).
plt.figure(figsize=(10,5))
# x and y are passed by keyword: seaborn >= 0.12 raises TypeError for positional x/y.
sns.barplot(x='No_of_total_years', y='Selling_Price', data=final_set)

In [None]:
# Preview the frame after the Year columns were dropped
final_set.head()

In [None]:
# Pairwise relationships between the columns of final_set
sns.pairplot(final_set)

# Converting Categorical variable into numerical 

**Here we are using get dummies method to whole dataset.**

In [None]:
final_set=pd.get_dummies(final_set,drop_first=True) # drop_first omits the first level of each encoded categorical column
final_set.head()

#  Correlation Matrix

Call the corr() function which will return the correlation matrix of numeric variables

In [None]:
# Correlation map: annotated heatmap of the pairwise correlations in final_set
corr_matrix = final_set.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', linewidths=0.5, ax=ax)
plt.show()

# Split data into train and test

In [None]:
# Target is Selling_Price; every remaining column is a feature
y = final_set['Selling_Price']
X = final_set.drop(columns=['Selling_Price'])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)
for label, part in (
    ("X train: ", X_train),
    ("X test: ", X_test),
    ("Y train: ", y_train),
    ("Y test: ", y_test),
):
    print(label, part.shape)

# 4) Build ML Regression Models

In [None]:
#Thanks to : https://www.kaggle.com/cagkanbay/car-price-prediction

r_2 = [] # List for r 2 score
MSE = [] # list for mean_squared_error scores mean

# Main function for models
def model(algorithm,X_train,y_train,X_test,y_test):
    """Fit ``algorithm``, score it on the test split, then record and plot the result.

    Parameters
    ----------
    algorithm : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and target passed to ``fit``
    X_test, y_test : held-out features and target; ``y_test`` must be a
        pandas Series because ``reset_index`` is called on it

    Side effects
    ------------
    Appends the R^2 score to the module-level list ``r_2`` and the mean
    squared error to ``MSE`` (one entry per call), prints both, and shows a
    line plot of actual versus predicted values. Returns None.
    """
    algorithm.fit(X_train,y_train)
    predicts=algorithm.predict(X_test)
    prediction=pd.DataFrame(predicts)
    R_2=r2_score(y_test,prediction)
    error=mean_squared_error(y_test,prediction)

    # Appending results to Lists
    r_2.append(R_2)
    MSE.append(error)

    # Printing results
    print(algorithm,"\n")
    print("r_2 score :",R_2,"\n")
    print("MSE:",error)

    # Plot for prediction vs originals.
    # drop=True keeps only the values, re-indexed 0..n-1 to line up with the
    # predictions, so the helper no longer depends on the target being named
    # "Selling_Price".
    test_index=y_test.reset_index(drop=True)
    ax=test_index.plot(label="originals",figsize=(12,6),linewidth=2,color="r")
    ax=prediction[0].plot(label = "predictions",figsize=(12,6),linewidth=2,color="g")
    plt.legend(loc='upper right')
    plt.title("ORIGINALS VS PREDICTIONS")
    plt.xlabel("index")
    plt.ylabel("values")
    plt.show()

# 1. Linear Regression

In [None]:
# Ordinary least squares baseline; model() appends its scores to r_2 and MSE
lr = LinearRegression()
model(lr,X_train,y_train,X_test,y_test)

# 2. Ridge Regression

In [None]:
# Randomized search over Ridge's regularization strength and the normalize switch.
alpha=[0.001,0.1,1,10,100,1000]
# Real booleans: the strings 'True' and "False" are both truthy, so the search
# never actually compared normalized against un-normalized fits.
# NOTE(review): `normalize` was removed from Ridge in scikit-learn 1.2 — confirm
# the installed version still accepts it.
normalize=[True,False]

parameters={
    'alpha':alpha,
    'normalize':normalize
}

rv_rid=RandomizedSearchCV(Ridge(),parameters,cv=6,
                       n_iter=10,scoring='neg_mean_squared_error',random_state=5,n_jobs=1)
rv_rid.fit(X_train,y_train)


print(rv_rid.best_estimator_)

In [None]:
# Rebuild Ridge from the parameters the randomized search actually selected,
# instead of a hand-typed alpha (0.01) that is not even in the searched grid.
ridge = Ridge(**rv_rid.best_params_)
model(ridge,X_train,y_train,X_test,y_test)

# 3. Lasso


In [None]:
# Randomized search over 14 log-spaced alpha values (1e-3 .. 1e3) for Lasso
parameters = dict(alpha=np.logspace(-3, 3, num=14))

rv_rid = RandomizedSearchCV(
    Lasso(),
    parameters,
    cv=6,
    n_iter=10,
    scoring='neg_mean_squared_error',
    random_state=5,
    n_jobs=1,
)
rv_rid.fit(X_train, y_train)

print(rv_rid.best_estimator_)

In [None]:
# Use exactly the configuration that was cross-validated. The search tuned alpha
# on a plain Lasso(), so adding normalize=True here would evaluate a
# configuration that was never tuned.
ls = Lasso(**rv_rid.best_params_)
model(ls,X_train,y_train,X_test,y_test)

# 4. Random Forest Regressor

In [None]:
#Randomized Search CV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for regressors; the
# 'auto' string was removed in scikit-learn 1.3, while the float form works everywhere.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space handed to RandomizedSearchCV for the random forest
rf_para = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
)

In [None]:
# Seed the forest as well: RandomizedSearchCV's random_state only fixes which
# parameter combinations are sampled, not the randomness inside each forest.
rf_reg=RandomForestRegressor(random_state=42)
rf_rand=RandomizedSearchCV(estimator=rf_reg,param_distributions=rf_para,cv=6,
                       n_iter=10,scoring='neg_mean_squared_error',random_state=5,n_jobs=1)

rf_rand.fit(X_train,y_train)


In [None]:
# Show the forest configuration chosen by the randomized search
print(rf_rand.best_estimator_)

In [None]:
# Build the final forest from the searched parameters rather than hand-copied
# values, so it cannot drift from what rf_rand actually selected.
rf = RandomForestRegressor(**rf_rand.best_params_, random_state = 42)
model(rf,X_train,y_train,X_test,y_test)

# 5. Decision Tree Regressor

In [None]:
# Fixed seed so the tree, and therefore its row in the results table, is reproducible
dtr = DecisionTreeRegressor(random_state=42)
model(dtr,X_train,y_train,X_test,y_test)

In [None]:
# Labels must stay in the same order as the model(...) calls above, because r_2 and MSE
# are filled by appending one entry per call.
# NOTE(review): re-running any model cell appends another entry, so the three lists
# stop lining up — restart and run all before trusting this table.
Model = ["LinearRegression","Ridge","Lasso","RandomForestRegressor","DecisionTreeRegressor"]
results=pd.DataFrame({'Model': Model,'R Squared': r_2,'MSE': MSE})
results

# 5) Conclusion. 
 
**All the models performed well, but RandomForestRegressor achieved the highest score. By using RandomForestRegressor we can predict Selling_Price more accurately.**

**Thankyou!!!**