In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction ##
It is very difficult to determine the best price when you're purchasing a new car. There are multiple features that you have to look at to determine its actual worth. Throughout this notebook, we will try to determine the best models to predict a car's price.

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import re
from nltk import word_tokenize

In [None]:
# Read the CarDekho listings CSV into the working frame `df`.
df = pd.read_csv(
    "/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv"
)

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Reorder the columns so that 'year' becomes the last one; all other columns
# keep their relative order.
trailing_cols = ['year']
leading_cols = [col for col in df.columns if col not in trailing_cols]
df = df[leading_cols + trailing_cols]

In [None]:
# Render floats in plain fixed-point notation instead of scientific notation.
# The fully qualified option name is used: the abbreviated 'float_format' is
# only resolved by pattern matching against all option names, which pandas
# documents as liable to break when a similarly named option is added.
pd.set_option('display.float_format', '{:f}'.format)
df.describe()

In [None]:
# For each column that may contain gaps, print how many rows are null (True)
# versus populated (False).
for null_check_col in ['seats', 'engine', 'mileage', 'max_power']:
    print(df[null_check_col].isnull().value_counts())

In [None]:
# Pattern for the first numeric literal in a cell such as "23.4 kmpl" or
# "1248 CC" (same expression the notebook has always used, wrapped in a
# capture group so it can be passed to Series.str.extract).
_NUMBER_PATTERN = r"([-+]?\d*\.\d+|\d+)"


def fix_null(column):
    """Convert a text column of the global ``df`` to floats and impute its gaps.

    Steps, all applied in place to ``df[column]``:

    1. Pull the first number out of every cell and cast it to float. Cells
       that are missing or contain no number become ``0.0``.
    2. Plot the density of the column at that stage, i.e. with the ``0.0``
       placeholders still present.
    3. Replace every ``0`` with the median of the strictly positive values.

    The conversion is vectorised and does not depend on the frame's index
    labels. A column that is already numeric is used as-is, so calling the
    function a second time on the same column leaves it unchanged instead of
    wiping it.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` to clean.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    raw = df[column]
    if pd.api.types.is_numeric_dtype(raw):
        # Already converted (e.g. the cell was re-run): nothing to parse.
        numeric = raw.astype('float')
    else:
        # astype(str) turns NaN into the text 'nan', which contains no digits,
        # so extract() yields NaN for it just as for any number-free cell.
        extracted = raw.astype(str).str.extract(_NUMBER_PATTERN, expand=False)
        numeric = extracted.astype('float')
    df[column] = numeric.fillna(0.0)

    # Draw on a figure created here so no empty figure is left behind.
    fig, ax = plt.subplots()
    sns.kdeplot(data=df, x=column, ax=ax)
    ax.set_title('Density of Column Prior to Adjustment')
    plt.show()

    # Zeros are placeholders for "unknown"; the median ignores them.
    av_calc = df.loc[df[column] > 0, column].median()
    df[column] = df[column].mask(df[column] == 0, av_calc)



In some of the columns, there were some missing values. To handle them, I replaced each missing value with the median of its column rather than the mean, because there are some outliers that would have an overly large effect on the mean of the columns.

In [None]:
# Clean each unit-suffixed measurement column in turn.
unit_columns = ['mileage', 'engine', 'max_power']
for column in unit_columns:
    fix_null(column)

In [None]:
# Impute missing seat counts with the rounded median of the known counts.
# NaN is first mapped to 0, and every 0 is then treated as "unknown", which
# matches the original two-step replace. `regex=True` is dropped: pandas
# documents that flag as requiring a string `to_replace`.
seats_filled = df['seats'].fillna(0)
av_seats = round(seats_filled[seats_filled > 0].median())
df['seats'] = seats_filled.mask(seats_filled == 0, av_seats)

The next area of preprocessing that I wanted to look at was the make of each car. To capture it, I found a listing of car makes and uploaded the list into the notebook, then tokenized the car names to find whether any of the words were included in my listing of makes. Adding the make as a feature should allow the model to produce a more accurate prediction of the price.

In [None]:
import json

# Load the uploaded list of car manufacturers and echo it for inspection.
makes_path = '/kaggle/input/car-makes/Car Manufacturers.json'
with open(makes_path) as makes_file:
    data = json.load(makes_file)

print(data)

In [None]:
def _make_from_name(car_name, known_makes):
    """Return the make found in a listing name, or '' when none is found.

    Each token of ``car_name`` is checked in order: a token present in
    ``known_makes`` is taken as the make, and the token 'Maruti' (when it is
    not itself in ``known_makes``) is mapped to 'Suzuki'. If several tokens
    match, the last one wins.

    ``known_makes`` only needs to support the ``in`` operator.
    """
    make = ''
    for token in word_tokenize(car_name):
        if token in known_makes:
            make = token
        elif token == 'Maruti':
            make = 'Suzuki'
    return make


# Series.map keeps the result aligned with df's own index, so the assignment
# no longer assumes the row labels are 0..n-1 and avoids one .loc write per
# matching token.
df['make'] = df['name'].map(lambda car: _make_from_name(car, data))

In [None]:
# Display the column labels of df at this point in the notebook.
df.columns

Work on the selling price of the car was the last part of the preprocessing that we needed to look at. In the chart below, we see the mean prices of the cars that were sold. There is quite a wide variety in the mean selling price among the car makes. Many of the mean prices make sense at first glance (e.g. Lexus, BMW and Audi have some of the higher resale values), but I would like to see if there are any significant outliers in these prices.

In [None]:
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter
# Mean selling price per make. The groupby result is sorted by make, and its
# index is used for the bar labels so that every label and its value come from
# the same Series (previously labels came from unique(), in order of
# appearance, and were then overwritten with a sorted list).
mean_price_by_make = df.groupby('make')['selling_price'].mean()

fig, ax = plt.subplots(figsize=(15, 10))
ax.barh(mean_price_by_make.index.tolist(), mean_price_by_make.values, align='center')

# Plain integer tick labels every 500000 across the plotted price range.
start, end = ax.get_xlim()
ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax.xaxis.set_ticks(np.arange(start, end, 500000))

ax.invert_yaxis()  # labels read top-to-bottom
ax.set_ylabel('Make')
ax.set_xlabel('Price')
plt.xticks(rotation=70)
ax.set_title('Mean Price by Make')

plt.show()

In [None]:
# Summary statistics for the target column.
df['selling_price'].describe()

Below we see a box plot showing the breakdown of prices over the overall datasets. I could drop some of the major outliers based on this graph, but there is a chance that a car make may be worth much more than the majority of the dataset which would lead me to completely cut out a certain make. In order to adjust for this, I will look at box plot by each of the makes to determine the outliers in the dataset.

In [None]:
# Horizontal box plot of every selling price, with a tick every 500000.
price_ticks = np.arange(0, df['selling_price'].max(), 500000)

fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(df['selling_price'], vert=False)
ax.set_title('Price Breakdown')
ax.set_xlabel('Price')
ax.tick_params(labelleft=False)  # a single box needs no y tick label
ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
ax.xaxis.set_ticks(price_ticks)
plt.xticks(rotation=70)
plt.show()

We see below that there are a few major outliers, but the majority are not massively affecting the mean of the overall dataset. To clean up the larger outliers, I decided to drop, within each make that has more than 10 listings, the prices that fall at or below the 1st percentile or at or above the 99th percentile.

In [None]:
# One box per make, in the order the makes first appear in df.
makes_in_order = df['make'].unique()
df_list = [df.loc[df['make'] == make, 'selling_price'] for make in makes_in_order]

fig, ax = plt.subplots(figsize=(12, 15))
ax.boxplot(df_list)

ax.set_xticklabels(makes_in_order)
ax.yaxis.set_ticks(np.arange(0, df['selling_price'].max(), 500000))
plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45, ha="left")
ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

plt.show()

In [None]:
# Trim selling-price outliers make by make. For a make with more than 10
# listings, only rows strictly between its 1st and 99th price percentiles are
# kept; smaller groups are kept whole. The pieces are collected in a list and
# concatenated once: DataFrame.append no longer exists in pandas 2.x, and
# appending inside the loop copied the growing frame on every iteration.
trimmed_frames = []

for make in df['make'].unique():
    df_make = df[df['make'] == make]
    low = np.percentile(df_make.selling_price, 1)
    high = np.percentile(df_make.selling_price, 99)
    print(make, low, high, df_make.selling_price.count())
    if df_make.selling_price.count() > 10:
        in_range = (df_make['selling_price'] > low) & (df_make['selling_price'] < high)
        trimmed_frames.append(df_make[in_range])
    else:
        trimmed_frames.append(df_make)

# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when df has no rows.
if trimmed_frames:
    adj_df = pd.concat(trimmed_frames, ignore_index=True)
else:
    adj_df = df.iloc[0:0].copy()

In [None]:
# Display the outlier-trimmed frame built in the previous cell.
adj_df

### Visualization ###
The first area that I wanted to look at was how the data was broken down by categorical value. To do this, I ran some bar graphs. In most cases, two values of each categorical feature accounted for the majority of cars.

fuel: Diesel and Petrol

seller_type: Individual and Dealer

transmission: Manual and Automatic

owner: First and Second

In [None]:
# One bar chart per categorical feature, sharing the y axis, with the count
# written above each bar.
cat_col = ['fuel', 'seller_type', 'transmission', 'owner']
fig, axs = plt.subplots(1, 4, figsize=(15, 8), sharey=True)

for panel, cat in zip(axs, cat_col):
    counts = adj_df[cat].value_counts()
    names = list(counts.keys())
    values = list(counts.values)
    panel.bar(names, values)
    for position, count in enumerate(values):
        panel.text(position, count, str(count), fontweight='bold', ha='center', rotation=70)
    panel.set_title(cat)
    panel.tick_params(axis='x', labelrotation=45)

Next, I ran a pair plot to quickly see if there were any noticeably linear relationships in our data. To add additional information, I coloured the points by seller_type to see if this would help in finding any interesting relationships. At first glance, it seems like there could be a relationship between max_power and selling_price, and between engine and selling_price.

In [None]:
# pairplot is a figure-level function that builds its own grid of axes, so no
# plt.figure() call precedes it (that call only produced an empty figure).
sns.pairplot(
    adj_df,
    hue="seller_type",
    vars=['selling_price', 'km_driven', 'mileage', 'engine', 'max_power', 'year'],
    diag_kind="hist",
)
# Title the whole grid; plt.title would label only the last subplot, and the
# previous text ("Fuel Type of Car Per Year") did not describe this plot.
plt.suptitle("Pairwise Relationships by Seller Type", y=1.02)
plt.show()

After looking at the pair plot, I ran a couple of scatterplots with the line of best fit. The first was a comparison between the selling price and max power, with the transmission categorical feature added for colouring. The second comparison depicted the selling price to the engine, with the owner categorical feature as an added variable. Out of the two, it seems like the max_power and selling price for automatic vehicles have a pretty positive relationship.

In [None]:
# lmplot is figure-level and creates its own figure (sized through `height`),
# so the separate plt.figure() call is removed; it only rendered a blank
# figure above the plot.
sns.lmplot(data=adj_df, x="max_power", y="selling_price", hue="transmission", height=5)
plt.title("Max Power to Price")
plt.show()

In [None]:
# lmplot is figure-level and creates its own figure (sized through `height`),
# so the separate plt.figure() call is removed; it only rendered a blank
# figure above the plot. A title is added so the chart stands on its own.
sns.lmplot(data=adj_df, x="engine", y="selling_price", hue="owner", height=5)
plt.title("Engine to Price")
plt.show()

### Additional Preprocessing for Models ###
When running models it is always smart to normalize the data to ensure that a certain feature is not given a higher significance purely because it is a higher value than the other features. In addition, to include the categorical variables in the models, I needed to add dummies to provide them with binary values. 

In [None]:
from sklearn import preprocessing


# Min-max scale every int64/float64 column from the third column onwards,
# each with its own scaler, writing the result back into adj_df.
# NOTE(review): the scalers are fitted on all rows before the train/test
# split further down; confirm whether that is acceptable for this analysis.
numeric_cols = [
    col for col in adj_df.columns[2:]
    if adj_df[col].dtype in ('int64', 'float64')
]
for col in numeric_cols:
    column_values = adj_df[col].to_numpy().reshape(-1, 1)  # scaler expects 2-D input
    adj_df[col] = preprocessing.MinMaxScaler().fit_transform(column_values)

In [None]:
# Remove the 'torque' and 'name' columns so the remaining frame can be fed to the models.
norm_df = adj_df.drop(columns=['torque', 'name'])

In [None]:
# One-hot encode every object-typed column after the first column, using the
# column's own name as the prefix of its indicator columns.
object_cols = [col for col in norm_df.columns[1:] if norm_df[col].dtype == 'object']
if object_cols:
    norm_df = pd.get_dummies(norm_df, columns=object_cols, prefix=object_cols)

In [None]:
# Allow up to 101 columns to be rendered so the encoded frame is not truncated
# horizontally, then display it.
pd.set_option("display.max_columns", 101)
norm_df

In [None]:
#Use the train_test_split to split the data into a training and testing dataset.
from sklearn.model_selection import train_test_split

# First column is the target (kept as a one-column frame); everything after it is a feature.
features = norm_df.iloc[:, 1:]
target = norm_df.iloc[:, :1]
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=.2, random_state=10)

In [None]:
#I decided to use the linear, Decision Tree and KNeighbors regression models. 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Fit ordinary least squares on the training split and predict the test split.
linreg = LinearRegression().fit(x_train, y_train)
pre_linear = linreg.predict(x_test)

In [None]:
# Score on the held-out split. The decision-tree and k-NN scores reported
# later are computed on x_test / y_test, so the linear model is now measured
# on the same data instead of on the rows it was trained on.
r_sq = linreg.score(x_test, y_test)
print('Coefficient of determination:', r_sq)

For the Decision Tree and K Nearest Neighbour, I wanted to run a for loop to determine the best tradeoff between depth/number of neighbors and speed of the model. To determine this number, I ran a couple of line plots and used the elbow method to determine the best value for each.

In [None]:
# Sweep max_depth from 2 to 19 and record the test-split R-squared of each tree.
# random_state is fixed because scikit-learn permutes features at every split,
# so an unseeded tree can differ between runs. The unused predict() call that
# ran in every iteration has been removed.
# NOTE(review): the depth is chosen using the test split; consider a
# validation split or cross-validation for the selection.
score_list = []

for i in range(2,20):
    decreg = DecisionTreeRegressor(max_depth = i, random_state = 10)
    decreg.fit(x_train, y_train)
    r_sq = decreg.score(x_test, y_test)
    score_list.append(r_sq)

In [None]:
# Line plot of test R-squared against tree depth, one tick per depth tried.
depths = list(range(2, 20))

fig, ax = plt.subplots()
ax.plot(depths, score_list)
ax.set_title("Best Depth For The Tree")
ax.set_xticks(depths)
ax.set_ylabel("R-Squared Score")
ax.set_xlabel("Depth of Tree")
ax.grid()
plt.show()

In [None]:
# Final decision tree at the depth picked from the sweep. random_state is
# fixed so that the reported score is reproducible on re-run (an unseeded
# tree can differ between fits on the same data).
decreg = DecisionTreeRegressor(max_depth = 11, random_state = 10)
decreg.fit(x_train, y_train)
pre_tree = decreg.predict(x_test)
r_sq = decreg.score(x_test, y_test)
print('coefficient of determination:', r_sq)

In [None]:
# Sweep n_neighbors from 2 to 19 and record the test-split R-squared of each
# model. score() already predicts internally, so the separate predict() call
# whose result was never used has been removed.
# NOTE(review): k is chosen using the test split; consider a validation split
# or cross-validation for the selection.
score_list = []

for i in range(2,20):
    KNreg = KNeighborsRegressor(n_neighbors = i)
    KNreg.fit(x_train, y_train)
    r_sq = KNreg.score(x_test, y_test)
    score_list.append(r_sq)

In [None]:
# Line plot of test R-squared against the number of neighbours tried.
neighbor_counts = list(range(2, 20))

fig, ax = plt.subplots()
ax.plot(neighbor_counts, score_list)
ax.set_title("Best Number of Neighbors")
ax.set_xticks(neighbor_counts)
ax.set_ylabel("R-Squared Score")
ax.set_xlabel("Number of Neighbors")
ax.grid()
plt.show()

In [None]:
# Final k-NN regressor with two neighbours: fit on train, predict and score on test.
KNreg = KNeighborsRegressor(n_neighbors=2).fit(x_train, y_train)
pre_KN = KNreg.predict(x_test)
r_sq = KNreg.score(x_test, y_test)
print('coefficient of determination:', r_sq)

In the above models, we see the following R squared values:

Linear Regression: 85.67%

Decision Tree: 92.92%

KNearest Neighbors: 94.48%

It seems they are all relatively accurate; however, I chose to move forward with additional testing for the Decision Tree and KNearest Neighbors because of their tendency to overfit. To ensure that they were not overfit, I ran a KFold random sample to see how they performed.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 5-fold shuffled cross-validation on the training split for both models,
# reporting the mean R-squared across folds.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
cv_options = dict(scoring='r2', cv=folds)

dec_scores = cross_val_score(decreg, x_train, y_train, **cv_options)
KN_scores = cross_val_score(KNreg, x_train, y_train, **cv_options)
dec_sc, KN_sc = dec_scores.mean(), KN_scores.mean()

print("New Decision Tree Coefficient of Determination: ", dec_sc)
print("New KNeighbors Coefficient of Determination: ", KN_sc)

We see that both provide a relatively high R-squared value, with K Nearest Neighbors having the higher value of the two. Next I wanted to see if there were some features that were less useful to the Decision Tree model (I wasn't able to find something similar for KNeighbors). When we look below we see that after taking only the top 15 features, the Decision Tree R-squared actually increases, which is likely due to the loss of some of the negatively correlated features. It was interesting that outside of the numerical values (km_driven, mileage, engine, etc.), the next most important features were mostly the makes of the cars.

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination down to 15 features, ranked by a depth-11
# decision tree. The ranking tree is seeded so the selected feature set is
# the same on every run.
rfe = RFE(
    estimator=DecisionTreeRegressor(max_depth=11, random_state=10),
    n_features_to_select=15,
)
x_rfe = rfe.fit_transform(x_train, y_train)

# Refit the existing tree on the reduced training matrix; from here on
# `decreg` expects the 15 selected features as input.
decreg.fit(x_rfe, y_train)

# rfe.support_ is a boolean mask over the input columns.
selected_features_rfe = x_train.columns[rfe.support_]
print(selected_features_rfe)

In [None]:
# Score the reduced-feature tree on the held-out split, projected onto the
# same selected features. Scoring on x_rfe / y_train measured fit to the
# training rows and was not comparable with the test-split scores above.
x_test_rfe = rfe.transform(x_test)
adj_r_sq = decreg.score(x_test_rfe, y_test)
print('Reduced Features Coefficient of Determination: ', adj_r_sq)

## Conclusion ##
After going through all of our models, it seems like the information provided for each of these cars was a strong indicator of the price. It seems like our reduced-features model would provide us with the strongest R-squared; however, if our dataset continued to expand, we may start to see that other features would become more important in our predictions. I would recommend using either KNeighbors or Decision Tree (pre feature selection) to determine the price.