In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found under it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Importing the Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re
import warnings
# Suppress every warning for the rest of the session so library notices do not clutter cell output.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore") 

## **Importing the dataset**

In [None]:
# Read the car listings CSV from the Kaggle input mount and preview the first rows.
csv_path = '/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

## **Basic Data Wrangling**

In [None]:
# Report the (rows, columns) shape of the freshly loaded table.
print('The shape of the dataset is: ',dataset.shape)

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics for every column; include='all' covers the non-numeric columns as well.
dataset.describe(include = 'all')

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

## **Exploratory Data Analysis**

In [None]:
# Two donut charts side by side: ownership history on the left, seller type on the right.
# Each entry is (dataset column, trace name / centre caption, caption x position).
seller_panels = (
    ('owner', 'Ownership', 0.17),
    ('seller_type', 'Seller Type', 0.83),
)

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col_number, (column, panel_name, x_pos) in enumerate(seller_panels, start=1):
    pie = go.Pie(labels=dataset[column], name=panel_name, textinfo='label+percent')
    fig.add_trace(pie, row=1, col=col_number)

# Turn the pies into donuts and show label, share and trace name on hover.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name")

# One caption per donut, placed at the x position listed for that panel.
captions = [
    dict(text=panel_name, x=x_pos, y=0.5, font_size=15, showarrow=False)
    for _, panel_name, x_pos in seller_panels
]
fig.update_layout(title_text="Seller Profile", annotations=captions)
fig.show()

In [None]:
# Two donut charts side by side: fuel type on the left, transmission on the right.
# Each entry is (dataset column, trace name / centre caption, caption x position).
car_panels = (
    ('fuel', 'Fuel Type', 0.17),
    ('transmission', 'Transmission', 0.83),
)

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col_number, (column, panel_name, x_pos) in enumerate(car_panels, start=1):
    pie = go.Pie(labels=dataset[column], name=panel_name, textinfo='label+percent')
    fig.add_trace(pie, row=1, col=col_number)

# Turn the pies into donuts and show label, share and trace name on hover.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name")

# One caption per donut, placed at the x position listed for that panel.
captions = [
    dict(text=panel_name, x=x_pos, y=0.5, font_size=15, showarrow=False)
    for _, panel_name, x_pos in car_panels
]
fig.update_layout(title_text="Basic Car Information", annotations=captions)
fig.show()

In [None]:
# Ten most frequently listed car names.
most_sold = dataset['name'].value_counts().head(10)

# Build an explicit two-column frame so the axis titles come from the column names.
# Relying on the implicit names of the value_counts() result ('index' / 'y') is fragile:
# pandas 2.0 renamed both the resulting Series and its index, which silently breaks
# a `labels=` mapping keyed on the old names.
most_sold_table = most_sold.rename_axis('Car Name').reset_index(name='Quantity Sold')

px.bar(data_frame=most_sold_table, x='Car Name', y='Quantity Sold',
       title='Most sold cars over the past 20 years')

In [None]:
# Ten most common values of the `engine` column and how many listings carry each.
engine = dataset['engine'].value_counts().head(10)
engine_axis_titles = {'x': 'Engine Type', 'y': 'Count'}
px.bar(x=engine.index, y=engine, labels=engine_axis_titles, title='Most Popular Engine Types')

In [None]:
# Extract the numeric part of the text columns `mileage` and `max_power` (the regex
# keeps only digits and dots) and convert it to floats.
#
# The two columns are parsed into ONE frame so every point pairs the mileage and the
# power of the same car. Rows where either value is missing or contains no digits
# become NaN and are dropped together, so no manual length matching is needed and
# the cell does not depend on a hard-coded row count.
mileage_power = pd.DataFrame({
    target: pd.to_numeric(
        dataset[source].astype(str).str.replace('[^0-9.]', '', regex=True),
        errors='coerce',
    )
    for target, source in (('mileage', 'mileage'), ('power', 'max_power'))
}).dropna()

# A scatter plot is used because each row is an independent car, not a point on a series.
px.scatter(data_frame=mileage_power, x='mileage', y='power', title="Mileage vs. Power",
           labels={'mileage': 'Mileage in kmpl', 'power': 'Power in bhp'})

In [None]:
# Per-year average of every numeric column.
# numeric_only=True is needed because the frame still holds text columns at this point;
# on pandas >= 2.0 a bare .mean() raises TypeError instead of silently skipping them.
df = dataset.groupby('year').mean(numeric_only=True)

# Average selling price for each model year (x is the `year` group index).
px.line(data_frame = df, x = df.index, y = 'selling_price', labels = {'year': 'Year', 'selling_price': 'Average Selling Price'},
       title = 'Average Selling Price Per Year')

In [None]:
# Average kilometres driven for each model year, read from the per-year averages in `df`.
km_axis_titles = {'year': 'Year', 'km_driven': 'Average Distance Travelled in kms'}
px.line(df, x=df.index, y='km_driven', labels=km_axis_titles,
        title='Average Distance Travelled Per Year')

## **Feature Engineering**

### Label encoding all the necessary columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical text column with integer codes.
# The single encoder is refitted for every column, so afterwards it only remembers
# the classes of the last column processed.
labelEncoder = LabelEncoder()
for categorical_column in ('fuel', 'transmission', 'owner', 'seller_type'):
    dataset[categorical_column] = labelEncoder.fit_transform(dataset[categorical_column])

### Removing all the 'NaN' values and dropping useless columns

In [None]:
# Drop incomplete rows first, while `name` and `torque` are still present, so a missing
# value in either of them also removes the row; then renumber the index and discard the
# two columns that are not used as features.
# errors='ignore' keeps the cell re-runnable: on a second execution the columns are
# already gone and a plain drop would raise KeyError.
dataset = (
    dataset
    .dropna()
    .reset_index(drop=True)
    .drop(columns=['name', 'torque'], errors='ignore')
)

### Cleaning certain useful columns such as: mileage, engine, max_power

In [None]:
# Convert the text columns to floats by keeping only digits and dots.
# Done column-wise on the whole Series, so the cell works for any number of rows
# instead of relying on a hard-coded row count.
unit_columns = ['mileage', 'engine', 'max_power']
for unit_column in unit_columns:
    digits_only = dataset[unit_column].astype(str).str.replace('[^0-9.]', '', regex=True)
    # errors='coerce' turns a value with no usable number (e.g. an empty string after
    # stripping) into NaN instead of aborting the whole cell with a ValueError.
    dataset[unit_column] = pd.to_numeric(digits_only, errors='coerce')

# Remove any row whose value could not be parsed so later steps see only complete numeric data.
dataset = dataset.dropna(subset=unit_columns).reset_index(drop=True)
dataset.head()

### Pair plot showing relation between various columns

In [None]:
# Grid of pairwise plots for the columns of `dataset`, coloured by the encoded fuel type.
pair_grid = sns.pairplot(data=dataset, hue='fuel')
plt.show()

### Heatmap showing correlation between various columns

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
correlations = dataset.corr()

plt.figure(figsize=(18, 10))
sns.heatmap(
    correlations,
    annot=True,
    cmap='coolwarm',
    linewidths=1,
    linecolor='white',
)
plt.show()

## **Splitting the Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the selling price.
feature_matrix = np.array(dataset.drop(columns='selling_price'))
target_vector = np.array(dataset['selling_price'])

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.25,
    random_state=0,
)

## **Applying Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 400-tree random forest regressor.
# `criterion` is left at its default (squared error): the old spelling 'mse' was
# removed in scikit-learn 1.2 and raises there, while the default means the same
# thing on every version.
# random_state pins the bootstrap/feature sampling so the scores below are reproducible.
rf = RandomForestRegressor(n_estimators=400, random_state=0)
rf.fit(X_train, y_train)

In [None]:
# R^2 on the training split alone says how well the forest fits data it has already
# seen, which overstates model quality; the held-out split gives the honest estimate.
# Report both side by side.
pd.Series({
    'train R^2': rf.score(X_train, y_train),
    'test R^2': rf.score(X_test, y_test),
})

## **Making new Predictions**

In [None]:
# Predict prices for the held-out rows and put them next to the true values.
y_pred = rf.predict(X_test)
comparison = {"Original": y_test, "Predicted": y_pred}
df = pd.DataFrame(comparison)
df

In [None]:
# Actual vs. predicted selling price on the held-out rows.
# Points on the dashed diagonal are perfect predictions; distance from it is the error.
pred_fig, pred_ax = plt.subplots(figsize=(8, 8))
pred_ax.scatter(y_test, y_pred, alpha=0.5)

# Diagonal reference line spanning the full range of both axes.
bounds = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
pred_ax.plot(bounds, bounds, color='red', linestyle='--', label='Perfect prediction')

pred_ax.set(
    xlabel='Actual selling price',
    ylabel='Predicted selling price',
    title='Random Forest: Actual vs. Predicted Selling Price',
)
pred_ax.legend()
plt.show()

## End Notes:
As a next step, we can apply feature scaling with `StandardScaler()`.

### **Thank you :)**