![vehicle_image](https://www.thebangoraye.com/wp-content/uploads/2017/09/bcfc-15.jpg)

# **1- Importing required libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, filename) for filename in filenames):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **2- Reading file**

In [None]:
# Load the vehicle listings CSV from the read-only Kaggle input directory into a DataFrame.
vehicle_data = pd.read_csv("/kaggle/input/sri-lanka-vehicle-prices-dataset/vehicle_data.csv")

# **3- Display top 5 records**

In [None]:
# Peek at the first rows of the freshly loaded frame (head() defaults to 5 rows).
vehicle_data.head()

# **4- Meta information about dataframe**

## **4.1- Shape of dataframe**

In [None]:
# (number of rows, number of columns) of the loaded frame.
vehicle_data.shape

## **4.2- Datatype of Each Column**

In [None]:
# Per-column dtypes as inferred by read_csv (no explicit dtypes were passed when loading).
vehicle_data.dtypes

# **5. Data Cleaning**

## **5.1- Finding NaN values**

In [None]:
# Heatmap of the boolean null mask: one column per feature, one row per record.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Missing values in Each Column\n", size=15)
sns.heatmap(vehicle_data.isnull(), yticklabels=False, cbar=False, ax=ax);



## **Edition** and **Body** have missing values

In [None]:
# Report, for each column known to contain nulls, the share of rows that are missing.
total_rows = vehicle_data.shape[0]
for col in ('Edition', 'Body'):
    missing_count = vehicle_data[col].isnull().sum()
    missing_percent = np.round((missing_count * 100) / total_rows, 3)
    print(f'{missing_percent} % data is miss in `{col}` column')

 ## **5.2- Imputing missing values using mode**

In [None]:
# Fill the missing values of each column with that column's most frequent value (mode).
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and does not update the DataFrame under Copy-on-Write.
for col in ['Edition', 'Body']:
    vehicle_data[col] = vehicle_data[col].fillna(vehicle_data[col].mode()[0])

In [None]:
# Count nulls per column to confirm the imputation left none in Edition/Body.
vehicle_data.isnull().sum()

#### **All missing values are fixed**

## **5.3- Convert dtype of Price feature**

In [None]:
# Put the currency unit in the column name so the values can become plain numbers.
vehicle_data.rename(columns={"Price": "Price (Rs)"}, inplace=True)

In [None]:
# Strip the 'Rs' marker and thousands separators, then parse the remainder as float.
vehicle_data['Price (Rs)'] = (
    vehicle_data['Price (Rs)']
    .str.replace('Rs', '')
    .str.replace(',', '')
    .astype(float)
)

## **5.4- Convert dtype of Capacity**

In [None]:
# Put the unit in the column name so the values can become plain integers.
vehicle_data.rename(columns={"Capacity": "Capacity(cc)"}, inplace=True)

In [None]:
# Strip the 'cc' suffix and thousands separators, then parse the remainder as int64.
vehicle_data['Capacity(cc)'] = (
    vehicle_data['Capacity(cc)']
    .str.replace('cc', '')
    .str.replace(',', '')
    .astype('int64')
)

## **5.5- Convert dtype of Mileage**

In [None]:
# Put the unit in the column name so the values can become plain integers.
vehicle_data.rename(columns={"Mileage": "Mileage(km)"}, inplace=True)

In [None]:
# Strip the 'km' suffix and thousands separators, then parse the remainder as int64.
vehicle_data['Mileage(km)'] = (
    vehicle_data['Mileage(km)']
    .str.replace('km', '')
    .str.replace(',', '')
    .astype('int64')
)

In [None]:
# Preview the frame after the Price, Capacity and Mileage conversions.
vehicle_data.head()

## **5.6- Shorten title**

In [None]:
# Set Title to "<Brand> <Model>"; this overwrites any existing Title column.
# A row where either part is missing yields a missing Title (NaN propagates through +).
vehicle_data['Title'] = vehicle_data['Brand'] + " "+ vehicle_data['Model']

## **5.7- Convert published_date feature to datetime**

In [None]:
# Parse published_date into datetime values. No explicit format is given, so
# pandas infers it; unparseable entries raise (errors defaults to 'raise').
vehicle_data['published_date'] = pd.to_datetime(vehicle_data['published_date'])

## **5.8- Separate `year`, `month`, `day` from `published_date`**

In [None]:
# Break the parsed publication date into its calendar components via the
# datetime accessor; each part becomes its own column.
published = vehicle_data['published_date']

vehicle_data['published_year'] = published.dt.year
vehicle_data['published_month'] = published.dt.month
vehicle_data['published_day'] = published.dt.day

In [None]:
# List the columns now present, including the new published_year/month/day fields.
vehicle_data.columns

## **5.9- Dropping unnecessary features**

In [None]:
# Drop Sub_title and Description, plus published_date, whose year/month/day
# parts were extracted into separate columns.
vehicle_data.drop(columns=["Sub_title", 'Description', 'published_date'], inplace=True)

# **Checking statistical description of data**

In [None]:
# Transposed summary statistics (one row per numeric feature) with visual cues:
# in-cell bars for the mean, colour gradients for std and the median (50%).
(
    vehicle_data.describe().T.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

**`Price (Rs)` has the highest standard deviation among the features**



In [None]:
# Preview the frame after the cleaning steps and column drops.
vehicle_data.head()

# **Declaring a function that converts links into clickable anchors**

In [None]:
def clickable(path):
    """Render a URL as an HTML anchor labelled with the URL's last path segment.

    Used as a per-cell formatter for ``Styler.format`` so that link columns
    become clickable in the rendered table.

    Parameters
    ----------
    path : str
        URL (or path) the anchor should point to.

    Returns
    -------
    str
        ``<a href="...">label</a>`` markup. The label is the last path segment,
        or the whole URL when no segment can be extracted.
    """
    import html  # local import keeps this cell runnable on its own

    url = os.fspath(path)

    # A trailing slash makes os.path.basename() return '', which would render an
    # empty, invisible link. Strip it first and fall back to the full URL.
    final_url = os.path.basename(url.rstrip('/')) or url

    # Escape both parts so characters such as & or " cannot break the markup.
    return f'<a href="{html.escape(url, quote=True)}">{html.escape(final_url)}</a>'

# **6- Visualization**

## **6.1- Analyze the top 20 vehicles with the highest price**

In [None]:
# Order listings from most to least expensive and keep the first 20.
# reset_index() renumbers them 0-19 and keeps the original row labels in `index`.
top_highest_price = (
    vehicle_data
    .sort_values(by="Price (Rs)", ascending=False)
    .head(20)
    .reset_index()
)

In [None]:
# Bubble chart of the 20 priciest listings: marker size tracks price,
# colour encodes the transmission type.
px.scatter(
    data_frame=top_highest_price,
    x="Title",
    y="Price (Rs)",
    size="Price (Rs)",
    color="Transmission",
    hover_data=['Condition', 'Transmission', 'Fuel', 'Price (Rs)'],
)

**`Mitsubishi Montero` is more expensive than the other vehicles (I consider it an outlier). Also, most of the top 20 most expensive vehicles are `Land Rover Range Rover`**

In [None]:
# Show the key columns as a styled table, rendering each Post_URL through clickable().
(
    top_highest_price[['Title', 'Price (Rs)', 'Capacity(cc)', 'Post_URL']]
    .style
    .format(formatter={'Post_URL': clickable})
)

### **Click on the link to go to original post**

## **6.2- Analyze the top 20 vehicles with the lowest price**

In [None]:
# Order listings from least to most expensive and keep the first 20.
# reset_index() renumbers them 0-19 and keeps the original row labels in `index`.
lowest_price_vehicle = (
    vehicle_data
    .sort_values(by="Price (Rs)", ascending=True)
    .head(20)
    .reset_index()
)

In [None]:
# Bubble chart of the 20 cheapest listings: marker size tracks price,
# colour encodes the transmission type.
px.scatter(
    data_frame=lowest_price_vehicle,
    x="Title",
    y="Price (Rs)",
    size="Price (Rs)",
    color="Transmission",
    hover_data=['Condition', 'Transmission', 'Fuel', 'Price (Rs)'],
)

In [None]:
# Show the key columns as a styled table, rendering each Post_URL through clickable().
(
    lowest_price_vehicle[['Title', 'Price (Rs)', 'Capacity(cc)', 'Post_URL']]
    .style
    .format(formatter={'Post_URL': clickable})
)

### **Click on the link to go to original post**

## **6.3- Analyze vehicle_data, Condition wise** 

In [None]:
# Listing counts per Condition, with side-by-side bars for each transmission type.
px.histogram(
    data_frame=vehicle_data,
    x='Condition',
    color="Transmission",
    barmode='group',
)

**So most of the vehicles are in Used condition and have Automatic transmission**

## **6.4- Analyze data by Fuel type**

In [None]:
# Listing counts per Fuel type, with side-by-side bars for each transmission type.
px.histogram(
    data_frame=vehicle_data,
    x='Fuel',
    color="Transmission",
    barmode='group',
)

## **6.5- Analyze Year and published_year**

In [None]:
# Full frame ordered from most to least expensive, renumbered 0..n-1
# (the old row labels are discarded rather than kept as a column).
desending_price = (
    vehicle_data
    .sort_values(by="Price (Rs)", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Full frame ordered from least to most expensive, renumbered 0..n-1
# (the old row labels are discarded rather than kept as a column).
ascending_price = (
    vehicle_data
    .sort_values(by="Price (Rs)", ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Compare manufacture year with ad-publication year for the 100 highest-priced listings.
# desending_price is sorted by price (descending) with a fresh 0..n-1 index, so each
# point's x position is that listing's price rank.
fig, ax = plt.subplots(figsize=(20, 8))

ax.set_title("Difference in Manufactured Year & Selling Year of Top 100 Expensive Vehicles\n", size=22)

ax.plot(desending_price['Year'].head(100), marker='o', label="Manufactured Year", markerfacecolor='red')
ax.plot(desending_price['published_year'].head(100), marker='o', markersize=4, label="Selling Year", markerfacecolor='red')

# Label the x-axis so the 0-99 positions are explained.
ax.set_xlabel("Price rank (0 = most expensive)")
ax.set_ylabel("Year")

ax.grid()
ax.legend(loc='right');  # same placement as numeric location code 5

In [None]:
# Compare manufacture year with ad-publication year for the 100 lowest-priced listings.
# ascending_price is sorted by price (ascending) with a fresh 0..n-1 index, so each
# point's x position is that listing's price rank.
fig, ax = plt.subplots(figsize=(20, 8))

ax.set_title("Difference in Manufactured Year & Selling Year of Top 100 Inexpensive Vehicles\n", size=22)

ax.plot(ascending_price['Year'].head(100), marker='o', label="Manufactured Year", markerfacecolor='red')
ax.plot(ascending_price['published_year'].head(100), marker='o', markersize=4, label="Selling Year", markerfacecolor='red')

# Label the x-axis so the 0-99 positions are explained.
ax.set_xlabel("Price rank (0 = least expensive)")
ax.set_ylabel("Year")

ax.grid()
ax.legend(loc='lower right');  # same placement as numeric location code 4



<div style="color:black;
           display:fill;
           border-radius:5px;
           background-color:#6cdcf5;
           font-size:100%;
           font-family:Verdana;
           letter-spacing:0.7px">


<h2 style="text-align:center"> The published year of all the vehicles is 2021. Some vehicles were manufactured before 1960, and those are very rare... </h2>


</div>