<a href="https://colab.research.google.com/github/GZabalaG/Datasets_analysis/blob/main/Car_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cars

### Imports...

In [None]:
!pip install dython

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import os
from matplotlib import pyplot as plt
import seaborn as sns
from dython import nominal
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Folder holding one CSV per brand. Kept in a single constant so the notebook
# can be pointed at another location (e.g. when not running where this
# relative path exists) by editing one line.
DATA_DIR = '../input/used-car-dataset-ford-and-mercedes'

audi = pd.read_csv(os.path.join(DATA_DIR, 'audi.csv'))
bmw = pd.read_csv(os.path.join(DATA_DIR, 'bmw.csv'))
ford = pd.read_csv(os.path.join(DATA_DIR, 'ford.csv'))
hyundi = pd.read_csv(os.path.join(DATA_DIR, 'hyundi.csv'))
mercedes = pd.read_csv(os.path.join(DATA_DIR, 'merc.csv'))
skoda = pd.read_csv(os.path.join(DATA_DIR, 'skoda.csv'))
toyota = pd.read_csv(os.path.join(DATA_DIR, 'toyota.csv'))
vw = pd.read_csv(os.path.join(DATA_DIR, 'vw.csv'))

# Brand label -> raw DataFrame. The keys are reused later as the values of
# the 'Brand' column, so they double as display names.
cars = {
    'Audi':audi,
    'BMW':bmw,
    'Ford':ford,
    'Hyundi':hyundi,
    'Mercedes':mercedes,
    'Skoda':skoda,
    'Toyota':toyota,
    'VW':vw
}

In [None]:
# Show each brand's raw table under a header carrying the brand name.
for brand in cars:
    print('\n', brand, '\n\n')
    display(cars[brand])

## Datasets information

In [None]:
# Summary statistics (DataFrame.describe) for every brand table.
for brand, frame in cars.items():
    header = ('\n', brand, '\n\n')
    print(*header)
    display(frame.describe())

In [None]:
# Number of missing values per column, brand by brand.
for brand, frame in cars.items():
    print('\n', brand, '\n\n')
    missing_per_column = frame.isnull().sum()
    display(missing_per_column)

In [None]:
# Column dtypes of every brand table, to spot columns parsed differently.
for brand in cars.keys():
    print('\n', brand, '\n\n')
    column_types = cars[brand].dtypes
    display(column_types)

In [None]:
# Count of distinct values per column for every brand table.
for brand, frame in cars.items():
    print('\n', brand)
    distinct_counts = frame.nunique()
    display(distinct_counts)

In [None]:
def difference(list1, list2):
    """Return the items that appear in exactly one of the two lists.

    The result walks ``list1`` followed by ``list2`` and keeps every item
    that is missing from at least one of them. Order is preserved and
    repeated items are kept as many times as they occur.
    """
    one_sided = []
    for item in list1 + list2:
        # Items present on both sides are the shared ones; skip them.
        if item in list1 and item in list2:
            continue
        one_sided.append(item)
    return one_sided

In [None]:
# Check columns names
# `columns` ends up holding the names shared by every frame, `dif_cols` the
# names that at least one frame is missing.
columns = None       # running intersection; None until the first frame is seen
all_columns = set()  # running union of every column name encountered

for k, v in cars.items():
    print('\n', k)
    ref_columns = set(v.columns)
    all_columns |= ref_columns
    # Test against None rather than truthiness: an intersection that has
    # become empty must stay empty instead of being re-seeded from this frame.
    if columns is None:
        columns = ref_columns.copy()
    else:
        columns = columns & ref_columns
    display(v.columns)

if columns is None:
    # No frames at all: nothing is shared and nothing differs.
    columns = set()

# Computed from the union so the result does not depend on the order in
# which the frames are visited.
dif_cols = all_columns - columns

print('\nSame column names', columns)
print('\nDifferent column names', dif_cols)

Renaming Hyundi tax(£) column to tax

In [None]:
# rename() returns a new DataFrame, so the dict entry is rebound to it; the
# module-level `hyundi` variable still refers to the frame with the old name.
cars['Hyundi'] = cars['Hyundi'].rename(columns={'tax(£)': 'tax'})

In [None]:
# Display the Hyundi table to confirm the column is now called 'tax'.
cars['Hyundi'] 

## Datasets processing

Joining the dataframes into one, after adding a new column named `Brand` to each

In [None]:
# Tag every row with the brand it came from (the dict key), as the first
# column, so the origin survives once the frames are concatenated.
for k, v in cars.items():
  # DataFrame.insert mutates the frame in place and raises ValueError when
  # the column already exists, so skip frames tagged by a previous run.
  if 'Brand' not in v.columns:
    v.insert(loc=0, column='Brand', value=str(k))

In [None]:
# Stack all brand tables vertically and renumber the rows from 0 so the
# per-file indices do not repeat in the combined frame.
brand_frames = list(cars.values())
train = pd.concat(brand_frames).reset_index(drop=True)

In [None]:
# Display the combined frame (all brands, with the leading 'Brand' column).
train

## Dataset visualization

### Price vs Categorical (Distributions)

In [None]:
# Apply seaborn's "darkgrid" theme; it is set once here for the plots below.
sns.set_theme(style="darkgrid")

In [None]:
# Price density histograms, one facet row per transmission type.
g = sns.displot(
    data=train,
    x='price',
    row='transmission',
    stat='density',
    height=3,
    aspect=6,
)

# Reserve space at the top of the figure, then place the overall title there.
g.fig.subplots_adjust(top=0.92)
g.fig.suptitle('Price per transmission', fontsize=20)

# Labels the x axis of the currently active axes.
plt.xlabel("Price")

plt.show()

In [None]:
# Price density histograms, one facet row per fuel type.
g = sns.displot(
    data=train,
    x='price',
    row='fuelType',
    stat='density',
    height=3,
    aspect=6,
)

# Reserve space at the top of the figure, then place the overall title there.
g.fig.subplots_adjust(top=0.92)
g.fig.suptitle('Price per Fuel Type', fontsize=20)

# Labels the x axis of the currently active axes.
plt.xlabel("Price")

plt.show()



In [None]:
# Price density histograms, one facet row per brand.
g = sns.displot(
    data=train,
    x='price',
    row='Brand',
    stat="density",
    height=3,
    aspect=6,
)

# Reserve space at the top of the figure, then place the overall title there.
g.fig.subplots_adjust(top=0.92)
g.fig.suptitle('Price per Brand', fontsize=20)

# Labels the x axis of the currently active axes.
plt.xlabel("Price")

plt.show()

In [None]:
# Price density histograms, one facet row per distinct engineSize value.
g = sns.displot(
    data=train,
    x='price',
    row='engineSize',
    stat="density",
    height=3,
    aspect=6,
)

# This cell reserves a thinner top margin (0.97) than the other facet plots.
g.fig.subplots_adjust(top=0.97)
g.fig.suptitle('Price per engine', fontsize=20)

# Labels the x axis of the currently active axes.
plt.xlabel("Price")

plt.show()

### Regression plots. Price vs continuous variables.

In [None]:
# 2x2 grid: price against year, tax, mpg and mileage. Each panel overlays a
# polynomial fit (red line) whose degree is chosen per variable via `order`.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 20))

# Top-left: year, quadratic fit.
year_g = sns.regplot(
    x="year", y="price", data=train,
    order=2, line_kws=dict(color="red"), ax=ax[0, 0],
)

# Top-right: tax, straight-line fit.
tax_g = sns.regplot(
    x="tax", y="price", data=train,
    order=1, line_kws=dict(color="red"), ax=ax[0, 1],
)

# Bottom-left: mpg, degree-4 fit.
mpg_g = sns.regplot(
    x="mpg", y="price", data=train,
    order=4, line_kws=dict(color="red"), ax=ax[1, 0],
)

# Bottom-right: mileage, cubic fit.
mileage_g = sns.regplot(
    x="mileage", y="price", data=train,
    order=3, line_kws=dict(color="red"), ax=ax[1, 1],
)

# Clip every panel to a fixed window; all four share the same price range.
x_windows = (
    (ax[0, 0], (1990, 2021)),
    (ax[0, 1], (-10, 300)),
    (ax[1, 0], (-10, 300)),
    (ax[1, 1], (-100, 400000)),
)
for panel, (x_low, x_high) in x_windows:
    panel.set_xlim(x_low, x_high)
    panel.set_ylim(-1000, 200000)

plt.show()

### Correlation

In [None]:
# Run dython's associations over every column of `train` on a 20x20 figure.
# The listed columns are declared nominal; note that engineSize is included
# in that list even though it is scaled as a numeric feature further down.
# NOTE(review): presumably this draws a pairwise association heatmap — confirm
# against the dython documentation for the installed version.
nominal.associations(train, nominal_columns=['Brand', 'model', 'transmission', 'fuelType', 'engineSize'], figsize=(20,20))

## Predictions...

### Some preprocessing:
- Brand and model together
- Dummy variables
- Standardization

In [None]:
# Fuse brand and model into a single "<Brand>_<model>" key, appended as the
# last column, then drop the two source columns it replaces.
train = (
    train
    .assign(Brand_model=lambda df: df["Brand"] + '_' + df["model"])
    .drop(columns=['Brand', 'model'])
)
train

In [None]:
# Separate the target (price) from the predictors (every other column).
target_column = 'price'
y = train[target_column]
X = train.drop(columns=[target_column])
X

In [None]:
# One-hot encode the categorical predictors; get_dummies removes the listed
# columns and appends one indicator column per category value.
categorical_columns = ['Brand_model', 'transmission', 'fuelType']
X = pd.get_dummies(data=X, columns=categorical_columns)
X

In [None]:
# Standardise the first five columns of X (presumably year, mileage, tax, mpg
# and engineSize, i.e. the columns left untouched by the one-hot encoding —
# verify against the frame displayed above). The indicator columns after
# them are left as they are.
scaled_columns = list(X.columns[:5])

# NOTE(review): the scaler is fitted on every row before the train/test
# split, so test-set statistics take part in the transform. Fit on the
# training rows only if a strict hold-out is required.
sc = StandardScaler()

# Assign by column label so pandas replaces the columns with the float
# result. Writing floats into integer columns through .iloc is deprecated in
# recent pandas versions (incompatible-dtype setitem).
X[scaled_columns] = sc.fit_transform(X[scaled_columns])
X

### Splitting train set and training the model

In [None]:
# Hold out 30% of the rows for evaluation; random_state=0 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Display the training predictors produced by the split.
X_train

In [None]:
# Baseline model: a LinearRegression with default settings, fitted on the
# training split. fit() is the last expression, so the cell shows the estimator.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Predicted prices for the held-out rows, from the linear model.
y_pred = regressor.predict(X_test)

In [None]:
# First 100 test rows as two columns: predicted price | actual price.
predicted_column = y_pred.reshape(len(y_pred), 1)
actual_column = y_test.values.reshape(len(y_test), 1)
side_by_side = np.concatenate((predicted_column, actual_column), 1)
print(side_by_side[:100])

In [None]:
# Signed prediction error for every held-out row.
errors = y_pred - y_test.values

# RMSD is the square root of the *mean* squared error. Taking the root before
# averaging, i.e. sqrt(square(e)).mean(), collapses to the mean absolute
# error, so both figures are computed explicitly and labelled as what they are.
rmsd = np.sqrt(np.mean(np.square(errors)))
mae = np.mean(np.abs(errors))

print('Root mean square deviation:', rmsd)
print('Mean absolute error:', mae)

We have a problem with one prediction

In [None]:
# Magnitude of the error on each held-out row (root of the squared error).
squared_errors = np.square(y_pred - y_test.values)
rs = np.sqrt(squared_errors)

# Indices of the three largest errors, biggest first.
worst_three = rs.argsort()[::-1][:3]
print(rs[worst_three])

We can substitute this prediction with the mean of the predictions

In [None]:
# Replace the single worst prediction with the mean of the *other*
# predictions. Averaging over all of y_pred would include the outlier being
# replaced, so the substitute would itself be dragged towards it.
# NOTE(review): the row is picked using the test labels (through `rs`), so
# any metric computed after this patch is optimistic.
if y_pred.size > 1:
    worst_index = rs.argmax()
    y_pred[worst_index] = np.delete(y_pred, worst_index).mean()

In [None]:
# Signed prediction error for every held-out row, using the current y_pred.
errors = y_pred - y_test.values

# RMSD is the square root of the *mean* squared error. Taking the root before
# averaging, i.e. sqrt(square(e)).mean(), collapses to the mean absolute
# error, so both figures are computed explicitly and labelled as what they are.
rmsd = np.sqrt(np.mean(np.square(errors)))
mae = np.mean(np.abs(errors))

print('Root mean square deviation:', rmsd)
print('Mean absolute error:', mae)

Around 10k... that's too much

Let's try random forest regression

In [None]:
# Second model: a random forest of 100 trees, seeded with random_state=0 so
# repeated runs build the same forest. Fitted on the same training split.
rfr = RandomForestRegressor(n_estimators=100, random_state=0)
rfr.fit(X_train, y_train)

In [None]:
# Predicted prices for the held-out rows, from the random forest.
y_pred_rfr = rfr.predict(X_test)

In [None]:
# First 100 test rows as two columns: forest prediction | actual price.
predicted_column = y_pred_rfr.reshape(len(y_pred_rfr), 1)
actual_column = y_test.values.reshape(len(y_test), 1)
side_by_side = np.concatenate((predicted_column, actual_column), 1)
print(side_by_side[:100])

In [None]:
# Signed prediction error of the random forest for every held-out row.
errors_random_forest = y_pred_rfr - y_test.values

# RMSD is the square root of the *mean* squared error. Taking the root before
# averaging, i.e. sqrt(square(e)).mean(), collapses to the mean absolute
# error, so both figures are computed explicitly and labelled as what they are.
rmsd_random_forest = np.sqrt(np.mean(np.square(errors_random_forest)))
mae_random_forest = np.mean(np.abs(errors_random_forest))

print('Root mean square deviation:', rmsd_random_forest)
print('Mean absolute error:', mae_random_forest)

Much better! :)