# Task for Today  

***

## Used Car Engine Type Prediction  

Given *data about used cars*, let's try to predict the **engine type** of a given car.  
  
We will use a TensorFlow artificial neural network (ANN) to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
# Load the cars CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv('../input/usedcarscatalog/cars.csv')

In [None]:
data

In [None]:
data.info()

# Filling Missing Values

In [None]:
print("Total missing values:", data.isna().sum().sum())

In [None]:
print("Columns with missing values:", data.columns[data.isna().sum() > 0].values)

In [None]:
data['engine_capacity'].dtype

In [None]:
# Replace missing engine capacities with the column mean, then re-check
# how many missing values remain in the whole frame.
capacity = data['engine_capacity']
data['engine_capacity'] = capacity.fillna(capacity.mean())

print("Total missing values:", data.isna().sum().sum())

# Encoding Features

In [None]:
data

First, let's convert all the boolean columns to integer columns.

In [None]:
# Cast every boolean column to an integer (0/1) column.
# The builtin `int` is used instead of `np.int`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
for column in data.columns:
    if data.dtypes[column] == 'bool':
        data[column] = data[column].astype(int)

Then we will take a look at the number of unique values in each categorical column.

In [None]:
# Number of distinct values in each object-dtype column, keyed by column name.
{
    name: len(data[name].unique())
    for name, dtype in data.dtypes.items()
    if dtype == 'object'
}

*model_name* has too many unique values, so let's drop that feature.

In [None]:
data = data.drop('model_name', axis=1)

*transmission* has only two unique values, so we can encode it as a binary feature.

In [None]:
data['transmission'].unique()

In [None]:
# Binary encoding for the two transmission categories.
transmission_mapping = {'automatic': 0, 'mechanical': 1}

# Series.map does an explicit value -> code lookup and yields a numeric column.
# Series.replace on an object column relies on implicit downcasting, which
# pandas 2.2 deprecates. Any value missing from the mapping becomes NaN
# instead of being left in place as a string.
data['transmission'] = data['transmission'].map(transmission_mapping)

All the remaining features can be one-hot encoded.  
We will not encode *engine_type*, as that is our label column.

In [None]:
def onehot_encode(df, columns, prefixes):
    """Return a copy of *df* with the given columns one-hot encoded.

    Each column in ``columns`` is paired positionally with an entry of
    ``prefixes``. Its indicator columns (named ``<prefix>_<value>``) are
    appended to the right of the frame and the source column is removed.
    The input frame is not modified.

    Args:
        df: Source DataFrame.
        columns: Names of the columns to encode.
        prefixes: Prefix to use for each encoded column, in the same order.

    Returns:
        A new DataFrame with the encoded columns replaced by indicators.
    """
    encoded = df.copy()
    for name, tag in zip(columns, prefixes):
        indicators = pd.get_dummies(encoded[name], prefix=tag)
        # Append the indicators first, then drop the source column.
        encoded = pd.concat([encoded, indicators], axis=1).drop(name, axis=1)
    return encoded

In [None]:
# Categorical feature -> short prefix used to name its indicator columns.
onehot_spec = {
    'manufacturer_name': 'm',
    'color': 'c',
    'engine_fuel': 'e',
    'body_type': 'b',
    'state': 's',
    'drivetrain': 'd',
    'location_region': 'l',
}

# Keep the two parallel lists available under their original names.
onehot_columns = list(onehot_spec.keys())
onehot_prefixes = list(onehot_spec.values())

data = onehot_encode(data, columns=onehot_columns, prefixes=onehot_prefixes)

In [None]:
print("Remaining non-numeric columns:", (data.dtypes == 'object').sum())

# Encoding Labels

In [None]:
data['engine_type'].unique()

In [None]:
# Integer class ids for the label; the model's softmax output has one unit per id.
label_mapping = {
    'gasoline': 0,
    'diesel': 1,
    'electric': 2
}

# Series.map gives an explicit lookup with a numeric result. Series.replace on
# an object column depends on implicit downcasting, which pandas 2.2
# deprecates, and could leave the label as object dtype. Values missing from
# the mapping become NaN rather than remaining as strings.
data['engine_type'] = data['engine_type'].map(label_mapping)

In [None]:
print("Remaining non-numeric columns:", (data.dtypes == 'object').sum())

# Visualizing Correlations

In [None]:
# Pairwise correlations over the columns from 'transmission' through
# 'duration_listed' (label-based slice, both ends inclusive).
feature_slice = data.loc[:, 'transmission':'duration_listed']
corr = feature_slice.corr()

# Annotated heatmap with the colour scale anchored at -1.
plt.figure(figsize=(18, 15))
sns.heatmap(corr, annot=True, vmin=-1.0, cmap='rocket')
plt.show()

# Splitting/Scaling

In [None]:
# Separate the feature matrix from the label column; both are independent copies.
X = data.drop(columns='engine_type').copy()
y = data['engine_type'].copy()

In [None]:
# Split first so the held-out rows play no part in fitting the scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on all of X would leak test-set statistics (mean and
# variance) into training. X itself is left unscaled.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Modeling/Training

In [None]:
X.shape

In [None]:
# Size the input layer from the training matrix instead of hard-coding the
# feature count, so the model still builds if the encoded columns change.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
# Three softmax units: one per integer-encoded engine type.
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)


# The sparse loss is used because the labels are integer class ids, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Hold out 20% of the training rows for validation. Stop once val_loss has not
# improved for 3 epochs and restore the weights from the best epoch.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

# Results

In [None]:
# Score the trained model on the held-out test split; reports the loss and the
# accuracy metric configured in model.compile.
model.evaluate(X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/55PIi9F9r-4