In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Show every column when displaying wide frames.
# The exact option key is used: 'display.max.columns' only worked because
# pandas matches option names as regular expressions ('.' matching '_').
pd.set_option('display.max_columns', None)
%matplotlib inline

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Dataset import
# (Google Colab alternative, kept for reference:
#   from google.colab import files
#   uploaded = files.upload())

# Local / Kaggle
# The data is made of several datasets, one CSV file per brand of car.
# We first gather all of them in one place before deciding how to merge them.
data_path= '/kaggle/input/used-car-dataset-ford-and-mercedes/'
datasets_dict= {}

# sorted() makes the loading order deterministic: os.listdir() returns the
# entries in arbitrary order, which would change the row order after merging.
for data in sorted(os.listdir(data_path)):
    # Only CSV files are datasets; the "unclean" variants are skipped
    if not data.endswith('.csv') or 'unclean' in data:
        continue
    # os.path.join avoids the doubled "//" produced by f'{data_path}/{data}'
    datasets_dict[data[:-len('.csv')]] = pd.read_csv(os.path.join(data_path, data))

print(datasets_dict.keys())

In [None]:
# We then check whether all the datasets have the same columns
data_set_columns= [dataset.columns for dataset in datasets_dict.values()]
# Plain loop: a list comprehension used only for its print() side effect also
# made the cell display a useless list of None values.
for columns in data_set_columns:
    print(sorted(set(columns)))

In [None]:
# Tag every dataset with a "Manufacturer" column so each row can still be
# traced back to its brand once all the datasets are merged.
# The dict translates dataset names that are not already a brand name.
manufacturers_dict={
    "focus": "ford",
    "cclass": "mercedes",
    "hyundi": "hyundai",
    "merc": "mercedes",
    "vw": "Volkswagen"
}
for manufacturer, dataset in datasets_dict.items():
    # Fall back to the dataset name itself when no translation is listed
    dataset["Manufacturer"]= manufacturers_dict.get(manufacturer, manufacturer)

# Merging all datasets
- We will use `pd.concat()` to do so
    - Some columns describe the same variables but are not named the same way
    - We will therefore have to complete one of the column with the values of the other column

In [None]:
# We merge all datasets together
full_df= pd.concat(
    list(datasets_dict.values()),
    ignore_index= True)

In [None]:
full_df.head()

In [None]:
# Datset columns
full_df.columns

In [None]:
# Infos
full_df.info()

# EDA
- **Target variable**: "price"
- **Shape**: (118150, 17)
- **Missing values**:
  - Clearly the tax(£) with **more than 90% of missing values**  
- **Values types**

In [None]:
# We will work on a copy of this data set
df= full_df.copy()
print(f"full_df: {full_df.shape}, Copy: {df.shape}")

In [None]:
df.describe().T

- We do not see any high coefficient of variation (std / mean)

In [None]:
# Coefficient of variation (std / mean) of each described column, appended to
# the transposed describe() summary as a "coef" column.
skew = df.describe().T
skew = skew.assign(coef=skew['std'] / skew['mean'])
skew

## Missing values

In [None]:
# Heatmap to visualize the empty columns
plt.figure(figsize=(15,10))
sns.heatmap(df.isnull())

In [None]:
# Percentage of missing values per column
(full_df.isnull().sum()/full_df.shape[0] * 100).sort_values(ascending= False)

In [None]:
# Columns with more than 90% missing values.
# The mask is computed on `df`, the frame whose columns are being indexed,
# instead of `full_df`, so mask and columns can never get out of sync.
# isnull().mean() is the fraction of missing values per column.
missing_cols= df.columns[df.isnull().mean() > 0.90]
missing_cols

### Filling useful columns, deleting the useless ones


In [None]:
# Fill the gaps of the "tax" column with the values of the "tax(£)" column,
# then drop "tax(£)".
# The guard keeps the cell re-runnable: once "tax(£)" has been dropped,
# running the cell a second time used to raise a KeyError.
if "tax(£)" in df.columns:
    df["tax"]= df["tax"].fillna(df["tax(£)"])
    df= df.drop(columns="tax(£)")

In [None]:
df.isnull().sum()/df.shape[0] *100

In [None]:
df.shape

### Drop NaN values

In [None]:
df.isnull().sum()/df.shape[0]

In [None]:
# We can drop the Nan values as they represent only 8% of the datframe
print(df.shape)
df= df.dropna()
print(df.shape)


## Numerical Values

In [None]:
df.dtypes

In [None]:
# List of numerical columns
df_numerical= [col for col in df.columns if df[col].dtype != 'object']
df_numerical

In [None]:
"""Distribution of numerical values"""

# One distribution plot per numerical column.
# The grid is sized from the number of columns: the previous version created a
# 3x2 figure with plt.subplots() and then drew on a different 4x2 layout via
# plt.subplot(), stacking the plots on top of a second, empty set of axes.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(df_numerical) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(n_plot_rows, n_plot_cols,
                         figsize=(10, 3.5 * n_plot_rows),
                         squeeze=False)  # always a 2-D array of axes
flat_axes = axes.ravel()
for ax, i in zip(flat_axes, df_numerical):
    sns.distplot(df[i], color='blue', ax=ax)
    ax.set_xlabel(i, fontsize=12)
    # No legend: the plots carry no labelled artist, so plt.legend() had
    # nothing to show.
# Hide the grid cells left over when the number of columns is odd
for ax in flat_axes[len(df_numerical):]:
    ax.set_visible(False)
fig.tight_layout()
plt.show()

### Target: price
- Our target 'price' is an object column, we will transform it to int
- **Distribution study**:
  - *Positive skewness factor*: verified by mean > median > mode
    -  We therefore know that we have some outliers with high weight, we keep it in mind just in case
    

In [None]:
# We will transform ou data frame to take values only where price is not null
df= df.dropna(subset=['price'])
df.shape

In [None]:
# We convert the price column from text to float by stripping the "£" sign
# and the "," separators.
price_col= df.price.astype(str)
# regex=True is required: since pandas 2.0, str.replace() treats the pattern
# as a literal string by default, so the character class would never match and
# the float conversion would fail. The raw string also avoids the invalid
# "\," escape sequence.
price_col= price_col.str.replace(r'[£,]', '', regex=True).astype(float)

In [None]:
# Target variable distribution
sns.distplot(price_col)

In [None]:
# Stastical informations abour target
print(f"Median: {price_col.median()}\nMean: {price_col.mean()}\nMode: {price_col.mode()}")

In [None]:
# Clearer view of our target
# The vector is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.boxplot(x=price_col)

In [None]:
# finally we replace the column in our dataset
df.price= price_col

In [None]:
# Check of the type once again
df.dtypes

- Automatic and semi-automatic cars are worth more money than cars with a manual transmission

In [None]:
# Price by transmission type
sns.barplot(x = df["transmission"], y = df["price"])

- Mercedes, Audi and BMW are the manufacturers whose cars are worth the most money

In [None]:
# Price by manufacturer
sns.barplot(x = df["Manufacturer"], y = df["price"])

- The most recent cars are worth the most money
- We can also see that cars old enough to be considered **collector cars (here 1970) are also worth good money**

In [None]:
# Price by year
plt.figure(figsize=(15,5),facecolor='w') 
sns.barplot(x = df["year"], y = df["price"])

### Mileage
- No NaN values
- Obvious negative correlation between price and mileage
  - will show a correlation heatmap later in the study


In [None]:
df.mileage.isnull().sum()

In [None]:
sns.pairplot(df[['mileage', 'price']])

### Engine Size



In [None]:
df.engineSize.isnull().sum()

In [None]:
sns.distplot(df.engineSize)

### Year
- No NaN values
- Value with year =2060
  - Delete this row

In [None]:
df.year.isnull().sum()

In [None]:
# The vector is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.boxplot(x=df.year)

In [None]:
# Value at 2060 outlier
df.year.sort_values()

In [None]:
# Deleting this row
df= df[df.year<=2021]

In [None]:
df.year.max()

## Categorical values


#### Models
- No NaN values in this column
- We can observe the most present cars model for each manufacturer
- The most present model in the dataset

In [None]:
# null values
df.model.isnull().sum()

In [None]:
# Number of models
len(df.model.unique())

In [None]:
# Population of each model for a given manufacturer.
# Series.hist(by=...) builds its own figure (sized by `figsize`), so the
# former plt.figure() call only produced an additional empty figure.
df["model"].hist(by= df.Manufacturer, figsize= (15,11))

In [None]:
# Most represented model in dataset
df.model.value_counts(normalize= True)*100

#### Transmission
- No NaN values
- Distribution of each kind of transmission
- Most of the transmissions are manual


In [None]:
df.transmission.isnull().sum()

In [None]:
df.transmission.value_counts(normalize=True)*100

In [None]:
# The vector is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=df.transmission)

#### Fuel type
- No NaN values
- Different kind of fuel type in the dataset
- Most present fuel type is Petrol

In [None]:
df["fuelType"].isnull().sum()

In [None]:
plt.figure(figsize=(11,7))
# The vector is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=df['fuelType'])
# Rotate the tick labels after plotting so the rotation is applied to the
# ticks created by the count plot.
plt.xticks(rotation=90)

In [None]:
df.columns

## Correlations
- Important correlations:
  - year/mileage: -0.74
  - year/price: +0.49
  - price/engineSize: +0.64
  - tax/mpg: -0.45
  - price/mileage: -0.42

In [None]:
# Correlation heatmap
# Only the numerical columns are correlated: since pandas 2.0, DataFrame.corr()
# no longer silently drops object columns and raises on them instead.
sns.heatmap(df.select_dtypes(include='number').corr(),
            annot= True,
            square= True,
            linewidth=1, linecolor='w')

In [None]:
# Pair plots on an 80% sample of the rows.
# The sample is seeded so the figure is identical on every run.
sns.pairplot(df.sample(frac= 0.8, random_state= 42))

## Features creation
- We will create:
  - "Country" feature based on the manufacturer's country
  - "Age" feature based on the age of the car, easily computable
- We will remove the year column to avoid correlation

In [None]:
# Country column: each manufacturer is mapped to a country label.
# NOTE(review): Vauxhall is labelled 'USA' here although it is a British
# marque — confirm this is the intended grouping.
map_country= dict([
    ('bmw', 'germany'),
    ('mercedes', 'germany'),
    ('audi', 'germany'),
    ('vauxhall', 'USA'),
    ('ford', 'USA'),
    ('toyota', 'japan'),
    ('hyundai', 'south_korea'),
    ('Volkswagen', 'germany'),
    ('skoda', 'czech'),
])

# Manufacturers missing from the mapping would get NaN
country_by_row= df["Manufacturer"].map(map_country)
df["country"]= country_by_row
df[["Manufacturer", "country"]].head(3)

In [None]:
# Age column
df["age"]= abs(df["year"]-2021)
df[["year", "age"]].head(3)

In [None]:
# Dropping the year column.
# A non in-place drop with errors="ignore" keeps the cell re-runnable: the
# in-place version raised a KeyError when executed a second time.
df= df.drop(columns="year", errors="ignore")
df.columns

# Data pre-processing

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Categorical columns
df_categorical = [col for col in df.columns if df[col].dtype =='object']
print(df_categorical)

# Numerical columns
df_numerical = [col for col in df.columns if df[col].dtype !='object']
print(df_numerical)

## One Hot encoder
- In order to have only numerical values, we need to encode our categorical data
- We will choose pandas.get_dummies over OHE from sklearn since it will keep column names more recognizable

In [None]:
# One Hot encoder
from sklearn.preprocessing import OneHotEncoder

# The `sparse` argument was renamed `sparse_output` in scikit-learn 1.2 and the
# old name was removed in 1.4. Keeping the default sparse output and converting
# it explicitly with toarray() gives the same dense array on every version.
ohe = OneHotEncoder(drop='first')
test = ohe.fit_transform(df[df_categorical]).toarray()
test

In [None]:
# Pandas dummies
df_expended= pd.get_dummies(df)
df_expended.shape

In [None]:
df_expended.head()

## Standardisation
- We will then standardize all the variables in the data set
- The standard score of a sample x is calculated as:

```
    z = (x - u) / s
```

  - u --> is the mean of the training samples or zero if with_mean=False
  - s --> is the standard deviation of the training samples or one if with_std=False.

In [None]:
# StandardScaler with with_std=False: every column is only mean-centred,
# it is not divided by its standard deviation.
# NOTE(review): RobustScaler is imported here but is not used in this cell.
from sklearn.preprocessing import RobustScaler
stder= StandardScaler(with_std= False)

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split and including the "price" target column — consider fitting
# it on the training features only.
df_expended_std = stder.fit_transform(df_expended)
# fit_transform returns a plain array: rebuild a DataFrame with the original
# column names (the rows get a fresh default index).
df_expended_std = pd.DataFrame(df_expended_std, columns = df_expended.columns)
print(df_expended_std.shape)
df_expended_std.head()

## Splitting sets

In [None]:
# Features
X= df_expended_std.drop("price", axis=1)

# Target
y= df_expended_std["price"]

print(f"X: {X.shape}\ny: {y.shape}")

In [None]:
# We creat our test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state=42)

In [None]:
size={
    "x_test": X_test.shape,
    "x_train": X_train.shape,
    "y_test": y_test.shape,
    "y_train": y_train.shape
}
print(size)

# **Model**

- For this regression study, we need to predict the price of a car from all the given parameters; to do so, we will start with a linear regression model.
- We have:
  - Feature engineered (kind of) by creating two variables, age and country of the manufacturer.
  - Pre processed our data: One Hot Encoder, Standardisation
- We will:
  - Select the best features with SelectKBest: Trying multiple sets of features and choose a suitable number of features.

## Selecting best features
- We have 223 features after the OHE with pd.get_dummies(), **I will therefore use the SelectKbest()** from sklearn in order to select the best features to apply regression.
  - **SelectKBest** performs a univariate feature selection scored with f_regression: each feature is scored against the target and the k best-scoring features are kept.
- I will **select from 4 to 223 features (k) with f_regression** in order to find the most relevant features in the dataset.
- **We will select 158 features**

In [None]:
column_names = df_expended_std.drop('price', axis= 1).columns

no_of_features = []
r_squared_train = []
r_squared_test = []

# The upper bound comes from the data instead of a hard-coded 224:
# SelectKBest raises a ValueError when k exceeds the number of available
# features, e.g. if the encoding produces fewer columns than expected.
n_total_features = X_train.shape[1]

# We iterate over k = 4, 6, 8, ... best features
for k in range(4, n_total_features + 1, 2):
    selector = SelectKBest(f_regression,
                           k = k)

    # Our transformed sets with the k-best features
    # (the selector is fitted on the training set only)
    X_train_transformed = selector.fit_transform(X_train, y_train)
    X_test_transformed = selector.transform(X_test)

    # We train a basic regression model on those transformed sets
    regressor = LinearRegression()
    regressor.fit(X_train_transformed, y_train)

    # R² on both sets for this value of k
    no_of_features.append(k)
    r_squared_train.append(regressor.score(X_train_transformed, y_train))
    r_squared_test.append(regressor.score(X_test_transformed, y_test))

# Both curves are labelled so they can be told apart: legend='full' alone
# shows nothing when the lines carry no label.
ax = sns.lineplot(x = no_of_features, y = r_squared_train, label = 'train')
sns.lineplot(x = no_of_features, y = r_squared_test, label = 'test', ax = ax)
ax.set(xlabel = 'number of selected features (k)', ylabel = 'R²');

In [None]:
# We can see that the curve stabilizes around ~160 variables
# We will inspect more closely

max_test_score= max(r_squared_test)
index_max= r_squared_test.index(max_test_score)
print(f"Best score is obtained for {no_of_features[index_max]} features --> score: {max_test_score}") 

- 158 features seems a good choice, as it is the first number of features to reach a score of at least 0.85

In [None]:
# We look at the test scores obtained with 50 features and more.
# Iterating over the recorded (k, score) pairs removes the hard-coded 224
# upper bound and the list.index() lookups, which raised a ValueError for any
# k that had not been evaluated in the sweep.
for n_features, score in zip(no_of_features, r_squared_test):
  if n_features >= 50:
    print("--",n_features, score)

In [None]:
sns.lineplot(x = no_of_features[20:], y = r_squared_train[20:], legend = 'full')
sns.lineplot(x = no_of_features[20:], y = r_squared_test[20:], legend = 'full')

In [None]:
# Keep the 158 best-scoring features (f_regression) and look at them
selector = SelectKBest(score_func=f_regression, k=158)

# Fit the selection on the training set, then apply it to both sets
selector.fit(X_train, y_train)
X_train_transformed = selector.transform(X_train)
X_test_transformed = selector.transform(X_test)

# Names of the retained features
support_mask = selector.get_support()
kbest_features = column_names[support_mask].tolist()

## Trying different models
- We will choose the DecisionTreeRegressor as it gives us a R^2 of 0.92

In [None]:
# Function to try our different models later
def test_regressor_model(models_list, X_train_transformed, X_test_transformed, y_train, y_test):
  """
  - models_list: list of tuple, 
    - tuple[0] = model to test, 
    - tuple[1] = model_name
  """
  for model in models_list:
    model[0].fit(X_train_transformed, y_train)

    y_pred= model[0].predict(X_test_transformed)
    score= model[0].score(X_test_transformed, y_test)
    print(f"{model[1]:-<50}{score}")


In [None]:
# Creation of the list of models we want to test
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, ElasticNet, Lasso

# The tree is seeded so its score is the same on every run; an unseeded tree
# can break ties between equally good splits differently from run to run.
models_list= [(DecisionTreeRegressor(random_state=42), "DecisionTreeRegressor"),
              (Ridge(), "Ridge regression"),
              (ElasticNet(), "Elastic Net"),
              (Lasso(), "Lasso")]

# Scoring (R² on the test set, printed for each model)
test_regressor_model(models_list, X_train_transformed, X_test_transformed, y_train, y_test)

# Model optimisation
- We will try to optimise as much as possible our selected model

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Cross-validated search over the minimum number of samples required to split
# a node of the tree.
params= {'min_samples_split': range(2, 15)}
# n_jobs=-1 uses the cores that are actually available instead of a
# hard-coded 15 workers; the tree is seeded so the search is reproducible.
grid= GridSearchCV(DecisionTreeRegressor(random_state=42), params, n_jobs=-1)

grid.fit(X_train_transformed, y_train)

In [None]:
print(grid.best_params_)

In [None]:
from sklearn.metrics import r2_score

# Predictions of the estimator refitted by the grid search, on the test set
y_pred= grid.predict(X_test_transformed)

# R² of those predictions against the true test values
r2_score(y_true=y_test, y_pred=y_pred)