In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We'll use the BMW dataset.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
import matplotlib.pyplot as plt

import numpy as np

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# Location of the BMW listings inside the Kaggle dataset.
BMW_CSV_PATH = '/kaggle/input/used-car-dataset-ford-and-mercedes/bmw.csv'

data = pd.read_csv(BMW_CSV_PATH)
# Preview the first rows to check that the file parsed as expected.
data.head()

In [None]:
# Count missing values per column.
data.isna().sum()

We don't have any null values, which is a good sign.

In [None]:
# Summary statistics of the dataset's columns, as reported by DataFrame.describe().
data.describe()

Let's check the percentage of different car models in the dataset.

In [None]:
# Share of each car model, as a percentage of all rows.
data['model'].value_counts().div(len(data)).mul(100)

In [None]:
# Mean price per model, split by transmission type.
plt.figure(figsize=(24, 5))
plt.xticks(rotation=20)
sns.barplot(data=data, x='model', y='price', hue='transmission')

In [None]:
# Pie chart of how the listings split across transmission types.
transmission_counts = dict(data['transmission'].value_counts())
slice_labels = transmission_counts.keys()
slice_sizes = transmission_counts.values()

plt.title('Transmission Distribution', size=20)
plt.pie(
    slice_sizes,
    labels=slice_labels,
    autopct='%1.2f%%',
    textprops={'size': 14},
)
plt.show()

In [None]:
# Mean price per registration year.
plt.figure(figsize=(15, 5))
sns.barplot(data=data, x='year', y='price')

In [None]:
# Price against mileage, coloured by registration year.
plt.figure(figsize=(15, 5))
sns.scatterplot(data=data, x='mileage', y='price', hue='year')

In [None]:
# Number of listings per fuel type.
sns.countplot(data=data, x='fuelType')

In [None]:
# Distribution of engine sizes, grouped into five bins.
engine_sizes = data['engineSize']
plt.hist(engine_sizes, color='brown', bins=5)
plt.show()

In [None]:
# Pairwise relationships between the columns of the dataset.
sns.pairplot(data)

In [None]:
# Correlate the numeric columns only. The frame still carries text columns at
# this point (they are one-hot encoded later with pd.get_dummies), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = data.select_dtypes(include='number').corr()

# Rank every numeric column by its correlation with price, strongest first,
# and colour the table so the sign and strength are visible at a glance.
corr_dataFrame = corr['price'].sort_values(ascending=False).to_frame()
s = corr_dataFrame.style.background_gradient(cmap = 'coolwarm')
s

# Handling Outliers

In [None]:
# Replace the registration year with the car's age relative to a fixed
# reference year. Re-running this cell after `year` has been dropped fails,
# so run it once per fresh load of `data`.
REFERENCE_YEAR = 2021
data = data.assign(car_age=REFERENCE_YEAR - data['year']).drop(columns=['year'])

# Select columns by numeric dtype rather than by "not object": a `!= 'O'`
# check also lets through non-numeric dtypes such as category or the
# dedicated pandas string dtype.
numerical_variables = data.select_dtypes(include='number').columns.tolist()
print('There are {} numerical variables'.format(len(numerical_variables)))
print('The numerical variables are: ', numerical_variables)

# Box plots of all numeric columns on one axis, to spot extreme values.
plt.figure(figsize=(12,8))
plt.title('Numerical Variables in BMW Dataset')
data[numerical_variables].boxplot(color = 'brown')
plt.show()

In [None]:
# Inspect the listings priced at 90,000 or more.
data.loc[data['price'] >= 90000]

In [None]:
# Inspect the listings with a mileage of 200,000 or more.
data.loc[data['mileage'] >= 200000]

In [None]:
# Index labels of the rows treated as outliers: very high mileage or price.
i1 = data[data.mileage >= 200000].index
i2 = data[data.price >= 90000].index

# Drop both sets in one call. Dropping them one after the other raises a
# KeyError whenever a row exceeds both thresholds, because its label is
# already gone by the time the second drop runs.
data = data.drop(i1.union(i2))

Next, we apply one-hot encoding to the categorical columns.
We use `pd.get_dummies` for this.

In [None]:
# One-hot encode the dataset with pd.get_dummies, then preview the widened frame.
data_expanded = pd.get_dummies(data)
data_expanded.head()

In [None]:
# Standardize every column of the encoded frame (the `price` target included)
# and wrap the result back into a DataFrame with the original column names.
# NOTE(review): the scaler is fitted on all rows before train_test_split, so
# test-set statistics feed into the scaling; consider fitting on the training
# split only.
std = StandardScaler()
data_expanded_std = pd.DataFrame(
    std.fit_transform(data_expanded),
    columns=data_expanded.columns,
)
print(data_expanded.shape)

In [None]:
# Preview the first five standardized rows.
data_expanded_std.head(5)

In [None]:
# Hold out a test set. `random_state` is fixed so that the split, and every
# metric computed from it, is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    data_expanded_std.drop(columns = ['price']),
    data_expanded_std[['price']],
    random_state = 42,
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
def test_models(models, x_train, x_test, y_train, y_test):

    np.random.seed(42)

    model_mse = {}
    model_mape = {}    
    model_r2 = {}

    for name, model in models.items():
        model.fit(x_train, y_train.values.ravel())
        y_preds = model.predict(x_test)
        model_mse[name] = mean_squared_error(y_test, y_preds)
        model_mape[name] = np.mean(np.abs((np.array(y_test)-np.array(y_preds))/np.array(y_test)))*100
        model_r2[name] = r2_score(y_test, y_preds)

    model_mse = pd.DataFrame(model_mse, index = ['MSE']).transpose()
    model_mse = model_mse.sort_values('MSE', ascending=False)

    model_mape = pd.DataFrame(model_mape, index = ['MAPE']).transpose()
    model_mape = model_mape.sort_values('MAPE', ascending=False)

    model_r2= pd.DataFrame(model_r2, index = ['R2']).transpose()
    model_r2 = model_r2.sort_values('R2')

    return model_mse, model_mape, model_r2

In [None]:
# Candidate regressors, all with default hyper-parameters, keyed by the name
# under which they appear in the result tables.
models = dict(
    LinearRegression=LinearRegression(),
    KNeighborsRegressor=KNeighborsRegressor(),
    DecisionTreeRegressor=DecisionTreeRegressor(),
    RandomForestRegressor=RandomForestRegressor(),
    GradientBoostingRegressor=GradientBoostingRegressor(),
    SVM=SVR(),
)

In [None]:
# Fit and score every candidate model on the train/test split.
model_mse, model_mape, model_r2 = test_models(
    models=models,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Mean squared error per model, sorted from highest to lowest.
model_mse

In [None]:
# Mean absolute percentage error per model, sorted from highest to lowest.
model_mape

In [None]:
# R2 score per model, sorted from lowest to highest.
model_r2

# Using Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Seed Python, NumPy and TensorFlow together so that weight initialisation
# and batch shuffling, and therefore the reported score, are reproducible.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor: three ReLU hidden layers (37, 24 and 8 units)
# followed by a single linear output unit.
model = Sequential()

model.add(Dense(37, activation = 'relu'))
model.add(Dense(24, activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(1))

# Train with Adam on mean squared error.
model.compile(
    optimizer = 'adam',
    loss = tf.keras.losses.MSE
)

In [None]:
# Train the network for 200 epochs and keep the Keras training history.
history = model.fit(x_train, y_train, epochs=200)

In [None]:
# Predict on the held-out features and score the network with R2.
nn_output = model.predict(x_test)
y_preds = pd.DataFrame(nn_output)
r2_nn_result = r2_score(y_test, y_preds)

In [None]:
# R2 score of the neural network on the test split.
r2_nn_result

# Conclusion
RandomForestRegressor and the neural network gave the best results on the dataset, with an R2 score of nearly 0.95.