# Our goal is to build a model that will allow us to predict the price of the car
## To build our model we have access to 13 files each containing information about given type of car. In our analysis we will ignore unclean cclass.csv and unclean focus.csv - that means that we will analyse 11 different types of cars
## We will start by performing EDA (Exploratory Data Analysis) and fixing any problems that we encounter in our data
## After that we will perform one-hot encoding and standardization on our dataset
## We will build a model based on random forest that will predict the price of the car. We will use RMSE and R^2 to evaluate our model
## We will end by building a class that will allow the user to input their own parameters and get the predicted price
### This is the information that we have about the columns in our dataset:
### 1. model - model of the car
### 2. year - registration year
### 3. price - price in £
### 4. transmission - type of gearbox
### 5. mileage - distance used
### 6. fuelType - engine fuel
### 7. tax - road tax
### 8. mpg - miles per gallon
### 9. engineSize - size in litres

# 1. Importing basic libraries for data analysis and ensuring that the data can be read properly

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import copy
import os
from IPython.display import display

# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for entry in names:
        full_path = os.path.join(folder, entry)
        print(full_path)

# 2. Basic info about our data

In [None]:
# Read every cleaned CSV file into its own dataframe (we ignore unclean data).
# df_dict maps the file name without its extension (e.g. 'bmw') to the dataframe.
data_dir = '/kaggle/input/used-car-dataset-ford-and-mercedes'
skipped_files = {'unclean cclass.csv', 'unclean focus.csv'}

# os.listdir returns entries in arbitrary order; sorting keeps the order of df_dict
# (and everything derived from it, e.g. the concatenated training set) reproducible
files = sorted(os.listdir(data_dir))
df_dict = {}
for file in files:
    stem, extension = os.path.splitext(file)
    # Only CSV files are data; anything else in the directory is left alone
    if extension.lower() != '.csv' or file in skipped_files:
        continue
    df_dict[stem] = pd.read_csv(os.path.join(data_dir, file))

In [None]:
# Show the first rows of each dataframe under a "<name> df" heading
for label, frame in df_dict.items():
    print('{name} df'.format(name=label))
    preview = frame.head()
    display(preview)
    print()

### We can see two main problems
### Problem no 1 - the number of columns doesn't match across the dataframes. The tax and mpg columns are not available for the cclass and focus dataframes. We will fix this problem by adding the two missing columns to these dataframes and assigning -1 to every row in them
### Problem no 2 - one column is misnamed (hyundi df). We will rename this column later

In [None]:
# Add up the NaN cells of every dataframe and report the grand total
missing_total = 0
for frame in df_dict.values():
    missing_total += frame.isna().sum().sum()
print("Number of missing values in the files: {no}".format(no=missing_total))

In [None]:
# Report how many rows each dataframe holds
print("Number of rows for every df")
for label, frame in df_dict.items():
    row_count = frame.shape[0]
    print('{name} df no rows: {no_rows}'.format(name=label, no_rows=row_count))

# 3. Preparing data for EDA (Exploratory Data Analysis)
### Before EDA we will need to do several things
### 1. Deal with all the problems mentioned earlier (missing columns and misspelled column)
### 2. Add the model column (this info will be useful for our model later)
### 3. Separate data into training and test set (it's the most important step, we only perform analysis on the training set)
### 4. Concatenate the training sets into one huge training set (that will be used to train our model)
### 5. Concatenate the test sets into one huge test set (that will be used to evaluate our model)

In [None]:
# Dealing with problem 1: cclass and focus have no tax/mpg columns, so we insert them
# at positions 6 and 7 with the placeholder value -1 in every row.
# NOTE: DataFrame.insert raises ValueError when the column already exists, so this
# cell cannot be re-run without reloading the data first.
df_dict['cclass'].insert(6, 'tax', -1)
df_dict['cclass'].insert(7, 'mpg', -1)
df_dict['focus'].insert(6, 'tax', -1)
df_dict['focus'].insert(7, 'mpg', -1)
# Dealing with problem 2: the hyundi dataframe names its tax column 'tax(£)';
# rename it in place to 'tax'
df_dict['hyundi'].rename(columns={'tax(£)' : 'tax'}, inplace=True)

In [None]:
df_dict['cclass'].head()

In [None]:
# Preparation step 2: add a 'type' column as the first column of every dataframe,
# filled with the dataframe's key in df_dict.
# NOTE: like the inserts above, this raises ValueError if the cell is run twice.
for name, df in df_dict.items():
    df.insert(0, 'type', name)

In [None]:
df_dict['cclass'].head()

In [None]:
# Dealing with problem 3, 4 and 5
from sklearn.model_selection import train_test_split

In [None]:
# Used for EDA for every car separately (per-type training sets without 'type')
df_train_dict = {}
# Collect the per-type splits and concatenate them once after the loop: calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration, and
# concatenating onto an empty DataFrame is deprecated in recent pandas versions.
train_parts = []
test_parts = []
for name, df in df_dict.items():
    # 70/30 split per car type; price stays in both parts for now
    # (we will later separate price from the rest of the dataset)
    train, test = train_test_split(df, test_size=0.3, random_state=42)
    train_parts.append(train)
    test_parts.append(test)
    # Drop the type because we don't need it for EDA for every car
    df_train_dict[name] = train.drop('type', axis=1)

# Used for model. ignore_index=True renumbers the rows 0..n-1 so that the original
# per-file indices wouldn't cause trouble later. pd.concat rejects an empty list, so
# fall back to an empty frame when no data was loaded.
df_train = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
df_test = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

In [None]:
len(df_train_dict)

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
print(df_train.shape)
print(df_test.shape)

In [None]:
df_train.describe()

In [None]:
df_train.dtypes

### Our last step will be reshuffling our training data so that the rows of different car types are mixed together instead of appearing in blocks

In [None]:
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
df_train.head()

# 4. EDA - every car separately
## In this part we will analyse every car separately. That means that for every parameter that we want to analyse we will plot 11 charts
### We will mainly try to search for answers for these questions:
### 1. What is the distribution of price?
### 2. What influences the price (and how much)?

In [None]:
# One price histogram per car type, laid out on a 6x2 grid
fig, axs = plt.subplots(nrows=6, ncols=2, figsize=(24*2, 12*6))
fig.suptitle('Histogram of the price', fontsize=40, y=1.05)

for position, (name, df) in enumerate(df_train_dict.items()):
    # Fill the grid row by row: two plots per row
    i, j = divmod(position, 2)
    panel = axs[i][j]
    panel.hist(df['price'], bins=40)
    panel.set_title('Price for: ' + str(name), fontsize=36)
    panel.set_xlabel('Price', fontsize=30)
    for axis_name in ('x', 'y'):
        panel.tick_params(axis=axis_name, labelsize=24)

# The bottom-right subplot is switched off
axs[-1,-1].set_axis_off()
plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0.4, hspace=0.4)
plt.show()

### Most of the cars cost around 10000-20000 pounds. We can see that for some types the average price is higher than for other cars. That means that including type in our datasets was a good thing because we added an important information
## Before we continue we will create two functions, one to create barplot, second to create a lineplot of our data. We will use a lot of these plots in our EDA so it's important to automate this whole process without copying a lot of code

In [None]:
def create_barplot(
    df_dict: dict,
    name_col: str,
    nrows: int = 6,
    ncols: int = 2,
    xlabelsize: int = 24,
    ylabelsize: int = 24
):
    """Draw one bar chart per dataframe with the mean price for each value of a column.

    Args:
        df_dict: Maps a plot title (car type) to a dataframe that contains
            the columns ``name_col`` and ``price``.
        name_col: Column whose distinct values become the bars.
        nrows: Number of rows in the subplot grid.
        ncols: Number of columns in the subplot grid.
        xlabelsize: Font size of the x tick labels.
        ylabelsize: Font size of the y tick labels.

    Raises:
        IndexError: If ``df_dict`` has more entries than the grid has panels.
    """
    # squeeze=False always returns a 2-D array of axes, so 1xN and Nx1 grids work too
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(24*ncols, 12*nrows), squeeze=False)
    fig.suptitle('Average price of the car for this parameter: {parameter}'.format(parameter=name_col), fontsize=40, y=1.05)

    # Row-major list of panels: the a-th dataframe goes to the a-th panel, whatever ncols is
    panels = axs.flatten()

    # Create plot for every type of car
    for a, (name, df) in enumerate(df_dict.items()):
        ax = panels[a]

        # Select 'price' before aggregating: averaging the whole frame fails on string
        # columns in pandas >= 2. The groupby index is already sorted and lines up
        # with the means, so it is used directly as the x values.
        mean_price = df.groupby(name_col)['price'].mean()
        ax.bar(list(mean_price.index), mean_price.values)

        ax.set_title(name, fontsize=36)
        ax.tick_params(axis='x', labelsize=xlabelsize)
        ax.tick_params(axis='y', labelsize=ylabelsize)
        ax.set_xlabel(name_col, fontsize=30)
        ax.set_ylabel('Price', fontsize=30)

    # Hide every panel that received no data (not just the bottom-right one)
    for unused in panels[len(df_dict):]:
        unused.set_axis_off()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0.4, hspace=0.4)
    plt.show()

In [None]:
def create_lineplot(
    df_dict: dict,
    name_col: str,
    nrows: int = 6,
    ncols: int = 2,
    xlabelsize: int = 24,
    ylabelsize: int = 24
):
    """Draw one line chart per dataframe with the mean price for each value of a column.

    Args:
        df_dict: Maps a plot title (car type) to a dataframe that contains
            the columns ``name_col`` and ``price``.
        name_col: Column whose distinct values (in sorted order) form the x axis.
        nrows: Number of rows in the subplot grid.
        ncols: Number of columns in the subplot grid.
        xlabelsize: Font size of the x tick labels.
        ylabelsize: Font size of the y tick labels.

    Raises:
        IndexError: If ``df_dict`` has more entries than the grid has panels.
    """
    # squeeze=False always returns a 2-D array of axes, so 1xN and Nx1 grids work too
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(24*ncols, 12*nrows), squeeze=False)
    fig.suptitle('Average price of the car for this parameter: {parameter}'.format(parameter=name_col), fontsize=40, y=1.05)

    # Row-major list of panels: the a-th dataframe goes to the a-th panel, whatever ncols is
    panels = axs.flatten()

    for a, (name, df) in enumerate(df_dict.items()):
        ax = panels[a]

        # Select 'price' before aggregating: averaging the whole frame fails on string
        # columns in pandas >= 2. The groupby index is already sorted and lines up
        # with the means, so it is used directly as the x values.
        mean_price = df.groupby(name_col)['price'].mean()
        ax.plot(list(mean_price.index), mean_price.values)

        ax.set_title(name, fontsize=36)
        ax.tick_params(axis='x', labelsize=xlabelsize)
        ax.tick_params(axis='y', labelsize=ylabelsize)
        ax.set_xlabel(name_col, fontsize=30)
        ax.set_ylabel('Price', fontsize=30)

    # Hide every panel that received no data (not just the bottom-right one)
    for unused in panels[len(df_dict):]:
        unused.set_axis_off()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0.4, hspace=0.4)
    plt.show()

In [None]:
create_barplot(df_train_dict, 'model', xlabelsize=8)

### We can see that in some cases the model of the car heavily influences the final price.
### What we also can notice that info about the model is useless for cclass and focus cars because there is only one model in these cars

In [None]:
create_lineplot(df_train_dict, 'year')

### We can see that younger cars have a higher price (the only exception being very old exclusive cars). There are also a few anomalies that we can notice
### Firstly Ford has a price for the car that was registered in 2060. It's probably a typo in the dataset. Luckily it doesn't look that we have many of these typos in our dataset so we don't need to worry that they will cause any problems later
### We can also notice that for merc and vauxhall we have a straight line between the year 1970 and around year 2000. It's very unlikely that there is a linear correlation between the year and price. We will check if this straight line is the result of there being no cars registered between 1971 and around 1999

In [None]:
df_dict['ford'][df_dict['ford']['year'] == 2060]

In [None]:
df_dict['merc'][df_dict['merc']['year'] < 1995]

In [None]:
df_dict['vauxhall'][df_dict['vauxhall']['year'] < 1995]

In [None]:
create_barplot(df_train_dict, 'transmission')

### The transmission on its own doesn't seem to be correlated to the price of the cars

In [None]:
create_lineplot(df_train_dict, 'mileage')

### We can see that in most cases higher mileage means lower price

In [None]:
create_barplot(df_train_dict, 'fuelType')

### Fuel type doesn't seem to be directly correlated to the final price

In [None]:
create_lineplot(df_train_dict, 'tax')

### There is no visible correlation with tax and price

In [None]:
create_lineplot(df_train_dict, 'mpg')

### The straight lines in our graph are probably the result of having a very small amount of cars with a very high mpg.
### We can see that for some type of cars, cars with high mpg cost less than cars with low mpg.

In [None]:
create_lineplot(df_train_dict, 'engineSize')

### The engine size seems to be correlated to the price of the car (we can see peaks for some engine sizes) although it's not a linear correlation.

In [None]:
# Spearman correlation heatmap of the numeric columns, one panel per car type
fig, axs = plt.subplots(nrows=6, ncols=2, figsize=(24*2, 12*6))
fig.suptitle('Heatmap for every type of car', fontsize=40, y=0.9)

# NOTE(review): this cell reads df_dict (all rows), while the other per-car EDA plots
# read df_train_dict (training rows only) -- confirm that including test rows is intended.
for a, (name, df) in enumerate(df_dict.items()):
    i = a // 2
    j = a % 2
    # Correlate numeric columns only: the string columns ('type', 'model', ...) make
    # DataFrame.corr raise in pandas >= 2. select_dtypes returns a new frame, so the
    # dataframes in df_dict are never modified and no deep copy is needed.
    numeric = df.select_dtypes(include='number')
    if name == 'cclass' or name == 'focus':
        # tax and mpg are the constant placeholder -1 for these two types,
        # so they are left out of the heatmap
        numeric = numeric.drop(['tax', 'mpg'], axis=1)

    labels = numeric.columns
    # Tick positions at the centre of each cell of the n x n mesh: 0.5, 1.5, ...
    location = [k + 0.5 for k in range(len(labels))]

    heatmap = axs[i][j].pcolormesh(numeric.corr(method='spearman'))
    fig.colorbar(heatmap, ax=axs[i,j])
    axs[i][j].set_title(name, fontsize=36)
    axs[i][j].xaxis.set_major_locator(matplotlib.ticker.FixedLocator(location))
    axs[i][j].yaxis.set_major_locator(matplotlib.ticker.FixedLocator(location))
    axs[i][j].set_xticklabels(labels, fontsize=30)
    axs[i][j].set_yticklabels(labels, fontsize=30)

axs[-1,-1].set_axis_off()
plt.show()

## Final conclusion - these are the things that we can say about our data:
## 1. Year - younger cars cost more (there are some exceptions to this rule)
## 2. Mileage - cars with low mileage cost a lot more than cars with high mileage
## 3. MPG / Engine Size - for some type of cars these are factors that heavily influence the price while for others these are not very important factors.
## 4. Tax - this factor doesn't influence the price of the car.
## 5. Model / Transmission / Fuel Type - Sometimes these factors heavily influence the final price of our car (for instance G Class Merc costs a lot more than other models) while in other cases it's not a very important factor (for instance fuel type is not an important factor in Merc cars)

# 5. EDA - training data
## We will perform basic EDA on our whole training data

In [None]:
# Price distribution over the whole training set, using 40 bins
fig, ax = plt.subplots(figsize=(24, 12))
ax.hist(df_train['price'], bins=40)
ax.set_title('Histogram of price', fontsize=36)
# Same label size on both axes
ax.tick_params(axis='both', labelsize=24)
plt.show()

In [None]:
# Average price of each car type over the whole training set
plt.figure(figsize=(24, 12))
# Select 'price' before aggregating: averaging the whole frame fails on the string
# columns in pandas >= 2. The groupby index holds the sorted type names and lines
# up with the means.
y = df_train.groupby('type')['price'].mean()
x = list(y.index)
plt.bar(x, y)
plt.title('Average price for every type of car', fontsize=36)
plt.ylabel('Price', fontsize=30)
plt.tick_params(axis='x', labelsize=24)
plt.tick_params(axis='y', labelsize=24)
plt.show()

### We can notice that some type of cars have much lower price than others. For instance Ford cars are a lot cheaper than Merc cars

# 6. Preparing data for our model - one-hot encoding and standardization
### We will use one-hot encoding on categorical variables and standardization on non-categorical variables so that our model is fed with proper data
### But first we will separate the price from the rest of the dataset because we don't need to standardize the predicted values

In [None]:
# Features: every column except the target 'price'
X_train = df_train.drop('price', axis=1)
# Target: the 'price' column on its own
y_train = df_train['price']
# Same separation for the held-out test set
X_test = df_test.drop('price', axis=1)
y_test = df_test['price']

In [None]:
X_train.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# One-hot encode and standardize the dataset
class DatasetEncoderAndStandarization:
    """Standardize the non-object columns and one-hot encode the object columns.

    The column split is decided from the dtypes of the dataframe passed to
    ``fit``. ``transform`` then applies the fitted scaler/encoder to any
    dataframe that has the same columns.
    """

    def __init__(self):
        # Fitted ColumnTransformer; None until fit() has been called
        self.preprocessor = None

    def fit(self, df):
        """Fit the scaler and the encoder on ``df``.

        Args:
            df: Dataframe of features. Columns of dtype ``object`` are treated
                as categorical, all others as numeric.

        Returns:
            This object, so that calls can be chained.
        """
        numeric_columns = df.select_dtypes(exclude='object').columns
        categorical_columns = df.select_dtypes(include='object').columns

        # The dense-output switch of OneHotEncoder is called `sparse_output` in newer
        # scikit-learn and `sparse` in older releases; an unknown keyword raises
        # TypeError, so try the new name first and fall back to the old one.
        try:
            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_columns),
                ('cat', encoder, categorical_columns)
            ]
        )

        self.preprocessor.fit(df)
        return self

    def transform(self, df):
        """Apply the fitted preprocessing to ``df``.

        Returns:
            The transformed data, or None (after printing a message) when
            ``fit`` has not been called yet.
        """
        if self.preprocessor is None:
            print("No preprocessor")
            return

        return self.preprocessor.transform(df)

In [None]:
# Fit the preprocessing on the training features only, then apply that same fitted
# transformation to both the training and the test features
deas = DatasetEncoderAndStandarization()
deas.fit(X_train)
X_train = deas.transform(X_train)
X_test = deas.transform(X_test)

In [None]:
X_train.shape

In [None]:
X_test.shape

# 7. Model - random forest
### We will use a random forest as our model. We will try to use the most optimal parameters to get the best results from our model

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Train the random forest on the preprocessed training data
regr = RandomForestRegressor(n_estimators=20, max_depth=20, max_features=36, random_state=42)
regr.fit(X_train, y_train)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error
# is deprecated and removed in newer scikit-learn releases
rmse_train = float(np.sqrt(mean_squared_error(y_train, regr.predict(X_train))))
print("RMSE for random forest (training set) ", rmse_train)

In [None]:
# Evaluate the fitted model on the held-out test set
predictions = regr.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error
# is deprecated and removed in newer scikit-learn releases
rmse_random_forest = float(np.sqrt(mean_squared_error(y_test, predictions)))
r2_random_forest = r2_score(y_test, predictions)
print("RMSE for random forest (test set) ", rmse_random_forest)
print("R^2 for random forest (test set)", r2_random_forest)

### We can see that we have a pretty good score on our test set and a good result on our training set 
### That means that we can use this model to predict the price of the new car

# 8. Input a car and get the predicted price
## We will build a simple class that will allow the user to input parameters for his own car and get the predicted price

In [None]:
# Input every parameter of the car and get the predicted price
class Predictor:
    """Interactive price predictor built on a fitted preprocessor and model.

    Args:
        df: Dataframe of known cars (features only, no price). Its columns
            define the layout of the row built from the user's input, and its
            values define the ranges used for the input warnings.
        preprocessor: Object with a ``transform(df)`` method that turns the
            one-row dataframe into model input.
        model: Object with a ``predict(X)`` method that returns one
            prediction per row.
        max_year: Latest registration year accepted without a warning.
    """

    def __init__(self, df, preprocessor, model, max_year=2021):
        self.df = df
        self.preprocessor = preprocessor
        self.model = model

        # Known categories and (padded) numeric limits, used only for warnings
        self.types = df['type'].unique()
        self.models_car = df['model'].unique()
        self.min_year = df['year'].min() - 5
        self.max_year = max_year
        self.transmissions = df['transmission'].unique()
        self.max_mileage = df['mileage'].max() + 100000
        self.fuel_types = df['fuelType'].unique()
        self.max_tax = df['tax'].max() + 20
        self.max_mpg = df['mpg'].max() + 10
        self.max_engine_size = df['engineSize'].max() + 1

    # Input all the needed information and get the info about the price of the car
    def give_input_and_get_predicted_price(self):
        """Ask for the car's parameters and print predicted vs. actual price.

        Prints an error message and returns without predicting when a numeric
        field cannot be parsed. Unusual values only produce warnings; the
        prediction is still made.
        """
        try:
            # Surrounding whitespace is stripped so that e.g. "bmw " still matches
            type_car = input("Input type of car (for instance bmw): ").strip()
            # We put extra space because in our dataset there is always an extra space before every model
            model_car = ' ' + input("Input model (for instance 1 Series): ").strip()
            year = int(input("Input year the car was registered (for instance 2001): "))
            transmission = input("Input the transmission (for instance Semi-Auto): ").strip()
            mileage = int(input("Input mileage of the car (for instance 2501): "))
            fuel_type = input("Input fuel type (for instance Petrol): ").strip()
            tax = int(input("Input tax (for instance 150). For cclass and focus input -1: "))
            mpg = float(input("Input mpg (for instance 55.5). For cclass and focus input -1: "))
            engine_size = float(input("Input engine size (for instance 1.0): "))
            price = int(input("Input the price of the car (for instance 18022)"))

        except ValueError:
            print("Incorrect type of data. Input data again")
            return

        print()

        # One-row dataframe in the same column order as the reference dataframe
        car = pd.DataFrame(
            [[type_car, model_car, year, transmission, mileage, fuel_type, tax, mpg, engine_size]],
            columns=self.df.columns
        )
        self.check_for_correct_input(car)
        features = self.preprocessor.transform(car)
        # predict returns one value per row; take the single element explicitly
        # (converting a 1-element array straight to int is deprecated in NumPy)
        predicted_price = int(self.model.predict(features)[0])

        print()

        print('Predicted price: ', predicted_price)
        print('Actual price: ', price)
        print('Difference: ' + str(predicted_price-price))

    # Will only return warnings if user specifies something that deviates from what we have in our whole dataset
    def check_for_correct_input(self, df):
        """Print a warning for every field of the first row of ``df`` that looks unusual.

        Categorical fields are compared with the values seen in the reference
        dataframe, numeric fields with the padded limits computed in
        ``__init__``. Nothing is raised or returned.
        """
        if df['type'][0] not in self.types:
            print('WARNING! Specified type does not exist in the dataset. You may get unusual results')

        if df['model'][0] not in self.models_car:
            print('WARNING! Specified model does not exist in the dataset. You may get unusual results')

        if df['year'][0] > self.max_year or df['year'][0] < self.min_year:
            print('WARNING! Specified year deviates from the dataset. You may get unusual results')

        if df['transmission'][0] not in self.transmissions:
            print('WARNING! Specified transmission does not exist in the dataset. You may get unusual results')

        if df['mileage'][0] < 0 or df['mileage'][0] > self.max_mileage:
            print('WARNING! Specified mileage deviates from the dataset. You may get unusual results')

        if df['fuelType'][0] not in self.fuel_types:
            print('WARNING! Specified fuel type does not exist in the dataset. You may get unusual results')

        # Because cclass and focus don't have tax and mpg column we will need to deal with them separately
        if df['type'][0] == 'cclass' or df['type'][0] == 'focus':
            # For these two types the only expected value is the placeholder -1
            if df['tax'][0] != -1:
                print('WARNING! Specified tax deviates from the dataset. You may get unusual results')

            if df['mpg'][0] != -1:
                print('WARNING! Specified mpg deviates from the dataset. You may get unusual results')

        else:
            if df['tax'][0] < 0 or df['tax'][0] > self.max_tax:
                print('WARNING! Specified tax deviates from the dataset. You may get unusual results')

            if df['mpg'][0] < 0 or df['mpg'][0] > self.max_mpg:
                print('WARNING! Specified mpg deviates from the dataset. You may get unusual results')

        if df['engineSize'][0] < 0 or df['engineSize'][0] > self.max_engine_size:
            print('WARNING! Specified engine size deviates from the dataset. You may get unusual results')

In [None]:
# Build the predictor from the features of every known car (training + test rows,
# price removed), so that the input warnings are based on all values seen in the data
predictor = Predictor(pd.concat([df_train, df_test]).drop('price', axis=1), deas, regr)

### Now we will test our predictor on the car at index 20 of our test dataset (i.e. the 21st row)

In [None]:
df_test.iloc[20]

In [None]:
predictor.give_input_and_get_predicted_price()

### We have managed to build a successful predictor that will be able to predict the price of every car specified by user