# Task : Predict the price of the car(bmw). 

# Given: A csv file with all the required data so that a prediction can be made. 

# Hope you like the notebook. Please comment and upvote it if you like it.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# FOR EDA --------------- # 
import matplotlib.pyplot as plt 
%matplotlib inline 
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
# ----------------------- #

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load bmw.csv from the Kaggle input directory into the `data` DataFrame used by every later cell.
data = pd.read_csv('/kaggle/input/used-car-dataset-ford-and-mercedes/bmw.csv')

Let's start by reading the data and finding more about the data in the csv file. 

# Let's Begin.

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# on its own instead of being handed to display() (which would otherwise be
# given a None to render alongside the preview).
data.info()
display(data.head())

### Here we have 9 columns and 10780 rows of data. and no null values currently. 

### But before making any prediction we have to do the following things --

### 1. Convert all the data into some numerical format for a proper evaluation. 

### 2. Take a look at the data provided and categorise each column as valuable or not for the prediction.


# Task 1: Encoding Data

## We will be encoding columns -- model,transmission and fuelType 

### But lets see what kind of data is there in these columns so as to decide which type of encoding will be better for the data. 

In [None]:
def target_count(data,column):
    """Draw a horizontal bar chart of how often each value of `column` occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains `column`.
    column : str
        Name of the column whose value frequencies are plotted.

    The figure is rendered inline through plotly; nothing is returned.
    """
    # Labels and counts are both taken from the SAME value_counts() result so
    # that every bar is paired with its own category. unique() lists values in
    # order of first appearance while value_counts() sorts by frequency, so
    # mixing the two attaches counts to the wrong labels.
    counts = data[column].value_counts()
    frequencies = counts.values.tolist()

    trace = go.Bar(
        x=frequencies,
        y=counts.index.tolist(),
        orientation='h',
        text=frequencies,
        textfont=dict(size=20),
        textposition='auto',
        opacity=0.5,
        # NOTE(review): `colorsrc` is kept exactly as in the original; it does
        # not look like a colour setting - confirm whether a colour scale was intended.
        marker=dict(colorsrc='tealrose',
                    line=dict(color='#000000', width=1.5)),
    )
    layout = dict(title="EDA of {} column".format(column),
                  autosize=True, height=800)
    fig = dict(data=[trace], layout=layout)

    py.iplot(fig)

# --------------- donut chart to show their percentage -------------------- # 

def target_pie(data,column):
    """Draw a donut chart showing the share of each value of `column`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains `column`.
    column : str
        Name of the column whose value shares are plotted.

    The figure is rendered inline through plotly; nothing is returned.
    """
    # Use one value_counts() result for both labels and values. unique() is in
    # order of first appearance and value_counts() is sorted by frequency, so
    # combining them would label slices with the wrong category.
    counts = data[column].value_counts()

    trace = go.Pie(
        labels=counts.index.tolist(),
        values=counts.values.tolist(),
        textfont=dict(size=15),
        opacity=0.5,
        # NOTE(review): `colorssrc` is kept exactly as in the original; it does
        # not look like a colour setting - confirm whether a colour scale was intended.
        marker=dict(colorssrc='tealrose',
                    line=dict(color='#000000', width=1.5)),
        hole=0.6,  # hollow centre turns the pie into a donut
    )

    layout = dict(title="Dounat chart to see %age of individual elements")
    fig = dict(data=[trace], layout=layout)
    py.iplot(fig)

# 1. *Model* 

In [None]:
# `model` column: bar chart of the counts first, then the donut chart of the shares.
for draw_chart in (target_count, target_pie):
    draw_chart(data, 'model')


### Here we can see that all 'Series'(5,6,1,7,2,4) type model were sold more than other models followed by X type models.

# 2. *Transmission*

In [None]:
# `transmission` column: bar chart of the counts first, then the donut chart of the shares.
for draw_chart in (target_count, target_pie):
    draw_chart(data, 'transmission')

### There are 3 types of transmission, and most of the cars are automatic.

# 3. *fuelType*

In [None]:
# `fuelType` column: bar chart of the counts first, then the donut chart of the shares.
for draw_chart in (target_count, target_pie):
    draw_chart(data, 'fuelType')

### Most of the cars are of Diesel fuelType followed by Petrol fuelType rest make a very small portion. 

Seeing the type of data I am going to use label encoding for encoding the data 

## Label-Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text columns that are turned into integer codes; each gets an "LE_<name>" twin.
categorical_columns = ["model", "transmission", "fuelType"]

# One encoder per column. Refitting a single shared encoder overwrites what it
# learned for the previous column, so the code -> label mapping of every column
# but the last would be lost. Keeping them here allows decoding later.
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    data["LE_" + col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

# Backward compatibility: the original cell left `lb_make` fitted on fuelType.
lb_make = label_encoders["fuelType"]

# Show each original column next to its encoded counterpart.
data[["model","LE_model","transmission","LE_transmission","fuelType","LE_fuelType"]].head(11)

### Here we can easily see that our data has been label-encoded. We will be dropping the original columns later, after the EDA. 

# Feature Engineering

# Re-Checking few columns for data-discrepancies

In [None]:
# Print, for engineSize and then tax, how many rows hold a value of exactly 0.
for column_name in ('engineSize', 'tax'):
    print(sum(data[column_name] == 0))

## Though no null value is present however some data discrepancies can be seen 
## engineSize and tax cannot be 0. Thus we are going to consider them null values and find suitable values to fill in

In [None]:
# Zeros in engineSize and tax are treated as missing, so replace them with NaN.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0
# and raises AttributeError there.
data[["engineSize","tax"]] = data[["engineSize","tax"]].replace(0,np.nan)

# Per-column count of missing values after the replacement.
data.isnull().sum()

### Now lets find what values can be filled in place of null.<br>
### Well we can assume that both tax and engineSize depends on the model of the car so we are going to group them and replace the null values with the median of tax and size of that model.

In [None]:
def find_median(var, df=None):
    """Return the median of column `var` for each car model.

    Parameters
    ----------
    var : str
        Name of the numeric column to summarise (e.g. 'tax').
    df : pandas.DataFrame, optional
        Frame holding a 'model' column and `var`. When omitted, the
        notebook-level `data` frame is used, as in the original version.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'model' and `var`, one row per model that has at least
        one non-missing value of `var`.
    """
    if df is None:
        df = data  # fall back to the notebook's global frame

    # Actually use the non-null subset: the original computed this filter and
    # then threw it away by grouping the unfiltered frame.
    known = df.loc[df[var].notnull(), ['model', var]]
    return known.groupby('model')[[var]].median().reset_index()

In [None]:
# Show the median tax per model (candidate values for filling the missing entries).

find_median('tax')

## For now I think it's better to drop these rows: out of a dataset of 10,000+ rows, a few hundred won't matter. 
## Note: I am making an exception for this dataset only...

In [None]:
# Discard every row that still contains a missing value, then renumber the index from 0.
data = data.dropna().reset_index(drop=True)

In [None]:
# Call info() so the summary is printed; without the parentheses only the
# bound method object itself was displayed. info() prints on its own, so no
# display() wrapper is needed.
data.info()

## Now that we are finished with preparing data now we can finally start finding correlation.

# Heatmap(correlation)

In [None]:
def correlation_plot():
    """Render a heatmap of the pairwise correlations between the numeric columns of `data`.

    Reads the notebook-level `data` frame, draws the figure inline through
    plotly and returns nothing.
    """
    # Correlate numeric columns only. `data` still holds the original text
    # columns (model, transmission, fuelType), and DataFrame.corr() raises on
    # non-numeric data in pandas >= 2.0 instead of silently skipping it.
    correlation = data.select_dtypes(include='number').corr()
    # Column names double as the tick labels on both axes.
    matrix_cols = correlation.columns.tolist()
    # Plain 2-D array of the coefficients for the heatmap's z values.
    corr_array = np.array(correlation)
    trace = go.Heatmap(z = corr_array,
                      x=matrix_cols,
                      y=matrix_cols,
                      colorscale='Viridis',
                      colorbar = dict()
                      )
    layout = go.Layout(dict(title='Correlation Matrix for variables provided.',
                          margin = dict(r=0,l=100,
                                       t = 0, b =100,),
                          yaxis = dict(tickfont = dict(size = 9)),
                          xaxis = dict(tickfont = dict(size = 9)),
                          )
                      )
    fig = go.Figure(data = [trace], layout = layout)
    py.iplot(fig)

In [None]:
# Render the correlation heatmap for the columns of `data`.

correlation_plot()


From the above correlation map the following information can be derived. 

1. Price & year are directly related -- the older the car, the lower the price.
2. Price & engineSize are also correlated -- the bigger the engine, the higher the price.
3. Price & LE_model are also correlated -- the better the model, the higher the price.
4. Price & tax also show some correlation -- a lower price means less tax has to be paid.

5. engineSize & tax are also correlated and indirectly affect the price.

In the following process we will try to find more about it..

 # Prediction model

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Features: the three label-encoded columns plus the numeric columns; target: price.
reg = linear_model.LinearRegression()
X = data[['LE_model', 'LE_transmission', 'LE_fuelType', 'engineSize','year', 'tax','mileage','mpg']]
Y = data['price']

# Hold out 20% of the rows for testing. The seed is fixed so the split - and
# every score computed from it in the later model cells - is the same on each
# run, matching the random_state=0 already used by the other models.
train_X,test_X,train_Y,test_Y = train_test_split(X,Y,test_size=0.2,random_state=0)
reg.fit(train_X,train_Y)

# For a regressor, score() is the R^2 on the given data (not a classification
# accuracy). The label now says LR; it previously read "GB" for this model.
print('Performance Score(LR): %.1f ' %(reg.score(test_X,test_Y)*100))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a seeded gradient-boosting regressor on the training split, then report
# its score on the test split as a percentage.
GB = GradientBoostingRegressor(random_state=0)
GB.fit(train_X, train_Y)
gb_score_pct = GB.score(test_X, test_Y) * 100
print('Performance Score(GB): %.1f ' % gb_score_pct)

In [None]:
from xgboost import XGBRegressor

# Fit a seeded XGBoost regressor on the training split, then report its score
# on the test split as a percentage.
XGB = XGBRegressor(random_state=0)
XGB.fit(train_X, train_Y)
xgb_score_pct = XGB.score(test_X, test_Y) * 100
print('Performance score(XGB): %.1f ' % xgb_score_pct)

# Finally we have reached the end!!! 
# So I conclude that there are many more ways to predict the price of a BMW car; meanwhile, I have found that XGBoost was the most accurate predictor, with a score of 95.6% (R² × 100) on the held-out test set. 

# Thank You for going through the notebook. Hope you found it interesting and helpful. 
# Please comment and upvote this notebook. Thank you again..