# **Sri Lanka Vehicle Prices Dataset**
<img src = 'https://www.thecoolector.com/wp-content/uploads/2018/10/charger-1050x700.jpg'>



## **Contents in this Data Analysis**

1.   Data Loading
2.   Data Cleaning
3.   EDA and Data Transformation
4.   Dashboard (Get Car of Your Own Choice using Filters)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import timeit
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

In [None]:
# Read the vehicle listings CSV into a DataFrame and preview its first rows.
df = pd.read_csv(r"../input/sri-lanka-vehicle-prices-dataset/vehicle_data.csv")
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
#Data Transformation Needed to Convert Object Columns into Numerical Columns
%timeit df['Price'] = df['Price'].str.replace("Rs","")
%timeit df['Price'] = df['Price'].apply(lambda a: str(a).replace("Rs",""))

In [None]:
# Strip the unit text and the thousands separators from each raw text column,
# then cast the column to its numeric type (Price -> float, the rest -> int).
for column, unit, numeric_type in (
    ('Price', "Rs", float),
    ('Capacity', "cc", int),
    ('Mileage', "km", int),
):
    stripped = df[column].apply(
        lambda raw, unit=unit: str(raw).replace(unit, "").replace(",", "")
    )
    df[column] = stripped.astype(numeric_type)

# Rename the cleaned columns so that each name carries its unit.
df = df.rename(
    columns={
        'Price': 'Price_rs',
        'Capacity': 'Capacity_cc',
        'Mileage': 'Mileage_km',
    }
)
df.head()

In [None]:
# Summary statistics of the numeric columns after cleaning.
df.describe()

In [None]:
# Keep only the columns used in the analysis: Sub_title and Edition are dropped.
data = df.drop(columns=['Sub_title', 'Edition'])

In [None]:
# Combine brand and model into a single "Brand Model" label, then remove the
# two source columns.
data['Brand_Model'] = data['Brand'] + " " + data['Model']
data = data.drop(columns=['Brand', 'Model'])

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Fill the missing values of the Body column with its most frequent value
# (the mode). Only the Body column is filled here.
data['Body'] = data['Body'].fillna(data['Body'].mode()[0])

In [None]:
# Preview one row of the cleaned frame.
data.head(1)

In [None]:
# Print the distinct values of every categorical column, with a row of
# asterisks between consecutive columns. Analysis ideas noted per column:
#   Brand_Model  - Dash app to be uploaded soon; a model could predict the top
#                  10 brand/models on the basis of the specifications given
#   Condition    - price distribution per condition and its impact on price
#   Transmission - which transmission is used more, why, and its impact on price
#   Body         - how body type impacts price; which body types are recent or old
#   Fuel         - which fuel gives more mileage
#   Seller_name  - which seller is most successful or has the most cars
#   Seller_type  - a model could predict the seller membership from this data
categorical_columns = [
    'Brand_Model',
    'Condition',
    'Transmission',
    'Body',
    'Fuel',
    'Seller_name',
    'Seller_type',
]
for position, column in enumerate(categorical_columns):
    if position > 0:
        print("*"*50)
    print(data[column].unique())

# **Car BRAND MODEL Prediction**

In [None]:
# Working frame for the brand/model analysis. An explicit copy is taken because
# columns are added to and overwritten on `car` in later cells; writing to a
# bare column selection of `data` raises pandas' SettingWithCopyWarning.
car = data[['Brand_Model','Price_rs','Year','Condition','Transmission','Body','Fuel','Capacity_cc','Mileage_km','Seller_name','Seller_type']].copy()

In [None]:
# Add each row's index label as a leading 'serial' column; the groupby helpers
# below count this column. assign() builds a new frame instead of writing into
# a column selection, and the final copy() keeps later column assignments on
# `car` free of SettingWithCopyWarning.
car = car.assign(serial=car.index)[['serial','Brand_Model','Price_rs','Year','Condition','Transmission','Body','Fuel','Capacity_cc','Mileage_km','Seller_name','Seller_type']].copy()
car.head(1)

In [None]:
# Parse the Year values (via their string form) into datetimes. `.values`
# hands back a plain array, so the result is written back by position.
car['Year'] = pd.to_datetime(car['Year'].astype(str)).values
car.head(1)

In [None]:
# Number of listings in each condition: New, Reconditioned and Used.
for label in ('New', 'Reconditioned', 'Used'):
    matching_rows = int((car['Condition'] == label).sum())
    print(f"{label} Cars :", matching_rows)

In [None]:
# Scatter plot of price per Brand_Model: marker size follows the price and
# marker colour follows the condition. Leaving `fig1` as the last expression
# renders the figure in the notebook.
fig1 = px.scatter(car, x=car['Brand_Model'], y=car['Price_rs'], size='Price_rs',color='Condition')
fig1

In [None]:
def carPriceData(data):
    """Count listings per (price, brand/model, condition), dearest first.

    Args:
        data: DataFrame with 'Price_rs', 'Brand_Model', 'Condition' and
            'serial' columns.

    Returns:
        DataFrame with columns 'Price_rs', 'Brand_Model', 'Condition' and
        'count', restricted to combinations that occur at least once and
        sorted by 'Price_rs' in descending order.
    """
    group_keys = ['Price_rs', 'Brand_Model', 'Condition']
    tally = data.groupby(group_keys)['serial'].count().reset_index(name='count')
    # Grouping can emit combinations with no rows; keep only observed ones.
    observed = tally[tally['count'] != 0]
    return observed.sort_values('Price_rs', ascending=False)

In [None]:
# Bar chart of the 20 highest-priced rows of the price table, coloured by
# condition.
carPrice = carPriceData(car)
fig = px.bar(
    carPrice.head(20),
    x='Brand_Model',
    y='Price_rs',
    color='Condition',
)
fig.show()

In [None]:
# Transmission check in All Conditions
def carConditionData(data):
    """Count listings per (brand/model, condition, transmission).

    Args:
        data: DataFrame with 'Brand_Model', 'Condition', 'Transmission' and
            'serial' columns.

    Returns:
        DataFrame with columns 'Brand_Model', 'Condition', 'Transmission' and
        'count', restricted to combinations that occur at least once and
        sorted by 'count' in descending order.
    """
    group_keys = ['Brand_Model', 'Condition', 'Transmission']
    tally = data.groupby(group_keys)['serial'].count().reset_index(name='count')
    # Grouping can emit combinations with no rows; keep only observed ones.
    observed = tally[tally['count'] != 0]
    return observed.sort_values('count', ascending=False)

# Split the listings by condition, then build the per-condition
# brand/model x transmission count table for each subset.
car_new, car_re, car_use = (
    car[car['Condition'] == label]
    for label in ('New', 'Reconditioned', 'Used')
)

car_newdf, car_redf, car_usedf = (
    carConditionData(subset) for subset in (car_new, car_re, car_use)
)

In [None]:
  # One donut chart per condition showing the transmission split, side by side.
  fig = make_subplots(rows=1, cols=3, specs=[[{"type": "pie"} for _ in range(3)]])

  condition_tables = (car_newdf, car_redf, car_usedf)
  for column_number, table in enumerate(condition_tables, start=1):
      fig.add_trace(
          go.Pie(labels=table['Transmission'], values=table['count']),
          row=1, col=column_number
      )

  fig.update_traces(textposition='outside', hole=.4, hoverinfo="label+percent")

  # Labels drawn in the centre of each donut, given as (text, x position).
  centre_labels = [('New', 0.11), ('Reconditioned', 0.50), ('Used', 0.89)]
  fig.update_layout(
      title_text="Transmission in Each Conditioned DataFrame",
      annotations=[
          dict(text=label, x=x_position, y=0.5, font_size=12, showarrow=False)
          for label, x_position in centre_labels
      ])
  fig.show()

In [None]:
# Word cloud of the Brand_Model labels, to see which names occur most often.
from wordcloud import WordCloud

# Join the labels with spaces directly. Building the text by stripping
# characters from str(list(...)) would mangle any name that itself contains a
# quote, comma or bracket.
text = " ".join(car['Brand_Model'].astype(str))

plt.rcParams['figure.figsize'] = (15, 15)
wordcloud = WordCloud(background_color = 'white', width = 1200,  height = 1200, max_words = 121).generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()


In [None]:
# Model mileage vs Price Details on the Basis of Seller Type
def carSelect(carModel, seller, frame=None):
    """Return the listings of one brand/model sold by one seller type.

    Args:
        carModel: value to match in the 'Brand_Model' column.
        seller: value to match in the 'Seller_type' column.
        frame: optional DataFrame to search. When omitted, the notebook-level
            `car` frame is used, as before.

    Returns:
        The matching rows, sorted by 'Mileage_km' in descending order.
    """
    listings = car if frame is None else frame
    is_match = (listings['Brand_Model'] == carModel) & (listings['Seller_type'] == seller)
    return listings[is_match].sort_values(by=['Mileage_km'], ascending=False)

In [None]:
# Example: 'Land Rover Range Rover' listings from sellers of type 'Member';
# show the five with the highest mileage.
carSelected = carSelect('Land Rover Range Rover','Member')
carSelected.head(5)

In [None]:
# Histogram over mileage for the selected listings, coloured by condition.
fig = px.histogram(carSelected, x='Mileage_km', y='Brand_Model', color='Condition')
fig.show()

In [None]:
# Best seller: sum of the listed prices for each seller.

data2 = car.groupby('Seller_name')[['Price_rs']].sum()

In [None]:
# Move Seller_name out of the index into an ordinary column.
data2 = data2.reset_index()

In [None]:
# Order the sellers by their summed price, largest first.
data2 = data2.sort_values(by=['Price_rs'], ascending=False)

In [None]:
# Keep the first ten rows (the ten largest totals once sorted) and display them.
data2 = data2[:10]
data2

In [None]:
# Bar chart of summed price per seller, with the bars coloured by that total.
px.bar(data2,x = data2['Seller_name'], y=data2['Price_rs'],color='Price_rs')

## **Price Distribution by Condition and How Condition Affects Price**

In [None]:
# Preview one row of `data` to recall the available columns.
data.head(1)

In [None]:
# Columns needed to compare prices across conditions.
cond = data[['Brand_Model','Price_rs','Year','Condition','Mileage_km']]

In [None]:
# Preview one row of the condition-comparison frame.
cond.head(1)

In [None]:
def conditionPriceCompare(model, frame=None):
    """Pick the highest-mileage listing of ``model`` for each condition.

    The conditions are visited in the order 'New', 'Reconditioned', 'Used'.
    For a condition with no listing of the model, a message is printed and a
    placeholder row ``[model, 0, 0, condition, 0]`` is used instead.

    Args:
        model: value to match in the 'Brand_Model' column.
        frame: optional DataFrame to search. When omitted, the notebook-level
            `cond` frame is used, as before. The placeholder rows assume the
            column order Brand_Model, Price_rs, Year, Condition, Mileage_km.

    Returns:
        A list with three entries, one per condition. An entry for an
        available condition is the row's values as a NumPy array; a
        placeholder entry is a plain list.
    """
    listings = cond if frame is None else frame
    # Build every mask from the frame being filtered. The previous version
    # filtered `cond` with a mask taken from `car`, which is only correct
    # while both frames happen to share exactly the same index.
    of_model = listings[listings['Brand_Model'] == model]

    result = []
    for condition in ('New', 'Reconditioned', 'Used'):
        matches = of_model[of_model['Condition'] == condition]
        if len(matches) > 0:
            by_mileage = matches.sort_values(by='Mileage_km', ascending=False)
            result.append(by_mileage[:1].values[0])
        else:
            print("Car in this Condition Not Available")
            result.append([model, 0, 0, condition, 0])
    return result

In [None]:
# Example: compare 'Nissan Dayz' across the three conditions and tabulate the
# three resulting rows.
res = conditionPriceCompare('Nissan Dayz')
df_cond = pd.DataFrame(res,columns =['Brand_Model', 'Price_rs','Year','Condition','Mileage_km'])
df_cond

In [None]:
# Bar chart of price against mileage for the per-condition rows, coloured by condition.
px.bar(df_cond,x = df_cond['Mileage_km'], y=df_cond['Price_rs'],color='Condition')

In [None]:
# Preview one row of `car` to recall the available columns.
car.head(1)

In [None]:
# Which sellers list the most cars per condition? This cell handles 'New':
# count the listings per seller and keep the five largest counts.
new = car[car['Condition'] == 'New']
new_2 = new.groupby('Seller_name')[['Condition']].count().reset_index()
new_cars_seller = new_2.sort_values(by='Condition', ascending=False).head(5).values
newdf = pd.DataFrame(new_cars_seller, columns=['Seller_name', 'Count']).assign(Condition='New')
newdf

In [None]:
# Same ranking for 'Used' listings: count per seller, keep the five largest.
old = car[car['Condition'] == 'Used']
old_2 = old.groupby('Seller_name')[['Condition']].count().reset_index()
old_cars_seller = old_2.sort_values(by='Condition', ascending=False).head(5).values
olddf = pd.DataFrame(old_cars_seller, columns=['Seller_name', 'Count']).assign(Condition='Used')
olddf

In [None]:
# Same ranking for 'Reconditioned' listings: count per seller, keep the five
# largest. `recon` ends up holding the per-seller count table.
recon = car[car['Condition'] == 'Reconditioned']
recon = recon.groupby('Seller_name')[['Condition']].count().reset_index()
recon_cars_seller = recon.sort_values(by='Condition', ascending=False).head(5).values
recondf = pd.DataFrame(recon_cars_seller, columns=['Seller_name', 'Count']).assign(Condition='Reconditioned')
recondf

In [None]:
# Stack the three per-condition seller tables into one frame with a fresh
# 0..n-1 index, then inspect the resulting dtypes.
seller_con = pd.concat([newdf, olddf,recondf]).reset_index(drop=True)
seller_con.info()

In [None]:
# Cast Count to int; it arrives as a generic object column because the tables
# above were rebuilt from mixed-type `.values` arrays.
seller_con['Count'] = seller_con['Count'].astype(int)

In [None]:
# Bar chart of listing count per seller, coloured by condition.
px.bar(seller_con, x=seller_con['Seller_name'], y=seller_con['Count'],color='Condition')

## **Transmission and Body Based Analysis**

In [None]:
# Columns needed for the transmission / body-type analysis; preview one row.
tran = data[['Brand_Model','Condition','Transmission','Body','Mileage_km','Price_rs']]
tran.head(1)

In [None]:
def TranModel(trantype, bodytype, frame=None):
    """Return the listings with a given transmission and body type.

    Args:
        trantype: value to match in the 'Transmission' column.
        bodytype: value to match in the 'Body' column.
        frame: optional DataFrame to search. When omitted, the notebook-level
            `tran` frame is used, as before.

    Returns:
        The matching rows, sorted by 'Mileage_km' in descending order.
    """
    listings = tran if frame is None else frame
    is_match = (listings['Transmission'] == trantype) & (listings['Body'] == bodytype)
    return listings[is_match].sort_values(by='Mileage_km', ascending=False)

In [None]:
# Ten highest-mileage automatic hatchbacks.
t = TranModel('Automatic', 'Hatchback').head(10)

In [None]:
# Bar chart of price per Brand_Model for the selected rows, coloured by condition.
px.bar(t, x=t['Brand_Model'], y=t['Price_rs'],color='Condition')

In [None]:
# Number of listings per transmission type, split by condition.
px.histogram(tran, x=tran['Transmission'],color='Condition')

In [None]:
# Number of listings per body type, split by condition.
px.histogram(tran, x=tran['Body'],color='Condition')

# For a dashboard built with Dash and Python, leave a comment below
## FOLLOW ME ON GITHUB - SIMRANJEET97 (https://github.com/simranjeet97)
## YOUTUBE - FREEBIRDS CREW (https://www.youtube.com/channel/UC4RZP6hNT5gMlWCm0NDzUWg?view_as=subscriber?sub_confirmation=1)