# Quickest Electric Cars EDA

## Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor

import graphviz

In [None]:
# Read the CSV file into a DataFrame and preview it
cars = pd.read_csv(
    '../input/quickest-electric-cars-ev-database/Quickestelectriccars-EVDatabase.csv',
)
cars

## Data Preprocessing

In [None]:
# Count the missing values in every column
cars.isna().sum()

In [None]:
# Drop every row that contains at least one missing value (modifies `cars` in place)
cars.dropna(axis='index', how='any', inplace=True)
cars

In [None]:
# Drop the rows whose 'FastChargeSpeed' is the placeholder string '-'
idx = cars.index[cars['FastChargeSpeed'].eq('-')]
cars = cars.drop(index=idx)
cars

In [None]:
# Renumber the rows from 0 after the removals; the old index is discarded
cars = cars.reset_index(drop=True)
cars

In [None]:
# Create the 'Company' column from the first space-separated word of 'Name'
cars['Company'] = cars['Name'].str.split(' ').str.get(0)
cars

In [None]:
# Parse the numeric part of each text column into a new, unit-labelled column.
# The loops below create the columns in the same order as before.

# Decimal measurements: first run of digits (optionally with dots) -> float
for new_col, raw_col in [
    ('BatteryCapacity[kWh]', 'Subtitle'),
    ('Acceleration[sec]', 'Acceleration'),
]:
    cars[new_col] = cars[raw_col].str.extract(r'(\d+[.\d]*)').astype(float)

# Whole-number measurements: first run of digits -> int
for new_col, raw_col in [
    ('TopSpeed[km/h]', 'TopSpeed'),
    ('Range[km]', 'Range'),
    ('Efficiency[Wh/km]', 'Efficiency'),
    ('FastChargeSpeed[km/h]', 'FastChargeSpeed'),
]:
    cars[new_col] = cars[raw_col].str.extract(r'(\d+)').astype(int)

# Drive type: keep only the first space-separated word
cars['Drive_Class'] = cars['Drive'].str.split(' ').str[0]

# Prices: delete every non-word character, then convert to int
for new_col, raw_col in [
    ('PriceinGermany[€]', 'PriceinGermany'),
    ('PriceinUK[£]', 'PriceinUK'),
]:
    cars[new_col] = cars[raw_col].str.replace(pat=r'[^\w]', repl=r'', regex=True).astype(int)

cars

In [None]:
# Drop the raw text columns that were parsed into numeric ones, plus 'NumberofSeats'
cars.drop(
    columns=[
        'Subtitle',
        'Acceleration',
        'TopSpeed',
        'Range',
        'Efficiency',
        'FastChargeSpeed',
        'Drive',
        'NumberofSeats',
        'PriceinGermany',
        'PriceinUK',
    ],
    inplace=True,
)
cars

## Visualization

### Heatmap

#### Heatmap across the DataFrame

In [None]:
# Average every numeric feature per company so that features can be
# compared across companies.
# numeric_only=True skips the remaining text columns ('Name', 'Drive_Class');
# without it, mean() raises a TypeError on pandas >= 2.0.
cars_groupby = cars.groupby('Company').mean(numeric_only=True)
cars_groupby

In [None]:
# Standardization (z-score)
# The columns are measured in different units, so each one is rescaled
# with its own mean and standard deviation to make them comparable.
def mean_norm(df_input):
    """Return df_input with every column transformed to (x - mean) / std."""
    col_means = df_input.mean()
    col_stds = df_input.std()
    return (df_input - col_means) / col_stds

# Apply mean_norm to the per-company averages and display the result
cars_norm = mean_norm(cars_groupby)
cars_norm

In [None]:
# Heatmap of the normalized features (columns) for every company (rows)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=cars_norm, linewidths=.5, cmap="YlGnBu", ax=ax)
ax.set_title('Feature - Companies')
plt.show()

#### Heatmap Drive_Class - Companies

In [None]:
# Build a pivot table for any choice of index, columns and values
def cars_pivot(data, index, columns, values):
    """Create a mean-aggregated pivot table, store it in ``cars_pt`` and return it.

    Args:
        data: DataFrame to pivot.
        index: Column(s) whose values become the rows of the table.
        columns: Column(s) whose values become the columns of the table.
        values: Column(s) to aggregate with the mean.

    Returns:
        The pivot table; combinations with no data are filled with 0.
        The same object is also bound to the module-level name ``cars_pt``
        so that existing cells reading that global keep working.
    """
    global cars_pt
    cars_pt = pd.pivot_table(
        data=data,
        index=index,
        columns=columns,
        values=values,
        aggfunc='mean',
        fill_value=0,
    )
    return cars_pt

# Pivot of mean battery capacity: one row per company, one column per drive class.
# cars_pivot stores its result in the global `cars_pt`.
cars_pivot(cars, 'Company', 'Drive_Class', 'BatteryCapacity[kWh]')

In [None]:
# Display the pivot table that cars_pivot stored in the global `cars_pt`
cars_pt

In [None]:
# Annotated heatmap of the pivot table held in `cars_pt`
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    data=cars_pt,
    annot=True,
    annot_kws={"size":10},
    fmt="",
    linewidths=.5,
    cmap="YlGnBu",
    ax=ax,
)
ax.set_title('Drive_Class - Companies')
plt.show()

### Bar Plot

#### Battery Capacity

In [None]:
# Order the cars from largest to smallest battery capacity
cars_BatteryCapacity = cars.sort_values('BatteryCapacity[kWh]', ascending=False)
cars_BatteryCapacity

In [None]:
# Single plot: horizontal bars of battery capacity, one bar per company

sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(18, 18))
sns.barplot(data=cars_BatteryCapacity, x='BatteryCapacity[kWh]', y='Company', color='b', ax=ax)
ax.set_title('Companies - BatteryCapacity[kWh]', fontdict={'size': 20})

plt.show()

#### Range

In [None]:
# Order the cars from longest to shortest range
cars_Range = cars.sort_values('Range[km]', ascending=False)
cars_Range

In [None]:
# Single plot: horizontal bars of range, one bar per company

sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(18, 18))
sns.barplot(data=cars_Range, x='Range[km]', y='Company', color='b', ax=ax)
ax.set_title('Companies - Range[km]', fontdict={'size': 20})

plt.show()

#### Battery Capacity & Range

In [None]:
# Every feature matters, but battery capacity and range are treated here as
# the two that weigh most when choosing an electric car.
# Rank the companies on both (battery capacity first, then range), highest
# first, and turn the 'Company' index back into a regular column for plotting.
cars_norm_sorted = (
    cars_norm
    .sort_values(by=['BatteryCapacity[kWh]', 'Range[km]'], ascending=False)
    .reset_index()
)
cars_norm_sorted

In [None]:
# Overlaid bar plots of two features: BatteryCapacity[kWh] and Range[km]

sns.set_theme(style='whitegrid')

# One figure shared by both bar layers
f, ax = plt.subplots(figsize=(15, 15))

# First layer: BatteryCapacity[kWh], drawn with the "pastel" colour codes
sns.set_color_codes("pastel")
sns.barplot(
    data=cars_norm_sorted,
    x='BatteryCapacity[kWh]',
    y='Company',
    color='b',
    label='BatteryCapacity[kWh]',
    ax=ax,
)

# Second layer on the same axes: Range[km], drawn with the "muted" colour codes
sns.set_color_codes("muted")
sns.barplot(
    data=cars_norm_sorted,
    x='Range[km]',
    y='Company',
    color='b',
    label='Range[km]',
    ax=ax,
)

# Legend, axis limits, labels and title; then remove the left and bottom spines
ax.legend(ncol=2, loc='lower right', frameon=True)
ax.set(xlim=(-2, 3), xlabel='Features', ylabel='Company', title='Feature by Companies')
sns.despine(left=True, bottom=True)

plt.show()

#### Price

In [None]:
# Price comparison between Germany and UK, using the normalized per-company prices

f, ax = plt.subplots(figsize=(15, 15))

# histplot(kde=True, stat='density') replaces the deprecated sns.distplot;
# kde_kws=dict(cut=3) extends the density curve past the data as distplot did.
sns.histplot(
    cars_norm_sorted['PriceinGermany[€]'],
    kde=True,
    stat='density',
    kde_kws=dict(cut=3),
    label='PriceinGermany[€]',
    ax=ax,
)
sns.histplot(
    cars_norm_sorted['PriceinUK[£]'],
    kde=True,
    stat='density',
    kde_kws=dict(cut=3),
    label='PriceinUK[£]',
    ax=ax,
)

ax.legend(ncol=2, loc='lower right', frameon=True)
# The x axis carries the normalized price and the y axis its density
# (the previous labels, 'Company' and 'Price', did not describe the axes).
ax.set(xlim=(-3, 5), xlabel='Normalized price', ylabel='Density', title='Price Compairson between Germany and UK')
sns.despine(left=True, bottom=True)

# Render explicitly, like the other plot cells
plt.show()

### Summary
1. Generally, the performance of German manufacturers is very powerful.
2. Most manufacturers have chosen All Wheel Drive.
3. Prices in Germany and the U.K. are similar according to the normalized data.

## Prediction

### Classification

#### Dataset

In [None]:
# Display the preprocessed DataFrame again before building the training data
cars

In [None]:
# Put the numeric feature columns on a common scale before training:
# select them, then normalize them with `mean_norm`
cars_before_norm = cars.loc[:, [
    'BatteryCapacity[kWh]',
    'Acceleration[sec]',
    'TopSpeed[km/h]',
    'Range[km]',
    'Efficiency[Wh/km]',
    'FastChargeSpeed[km/h]',
]]
cars_after_norm = mean_norm(cars_before_norm)
cars_after_norm

In [None]:
# Place the identifying columns next to the normalized features and inspect the result
cars_data = pd.concat(
    objs=[cars[['Name', 'Company']], cars_after_norm],
    axis='columns',
)
cars_data

In [None]:
# ValueError: Unknown label type: 'continuous'
# The label column is cast from float to int in order to avoid the error above.
# NOTE(review): 'Range[km]' in cars_data comes from cars_after_norm, i.e. it is
# the output of mean_norm, not the range in km. Casting it to int drops the
# fractional part, so the label collapses to a few small integers — confirm
# that this loss of information is intended.
cars_data = cars_data.astype({'Range[km]': 'int'})

In [None]:
# Supervised-learning inputs: 'Range[km]' is the label, the other five measurements are the features
y = cars_data['Range[km]']
X = cars_data[[
    'BatteryCapacity[kWh]',
    'Acceleration[sec]',
    'TopSpeed[km/h]',
    'Efficiency[Wh/km]',
    'FastChargeSpeed[km/h]',
]]
X, y

In [None]:
# Check the shapes of the features and the labels (the row counts must match)
X.shape, y.shape

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Check the shapes of the training and test datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Predict

In [None]:
# DecisionTreeClassifier
# A fixed random_state makes the fitted tree, and therefore the reported
# accuracies, reproducible between runs; 42 matches the seed used for the
# train/test split and the random forest.
dt_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Train on the training split, predict the test split and report the accuracy
pred = dt_clf.fit(X_train, y_train).predict(X_test)
print("accuracy score: ", accuracy_score(y_test, pred))

#### Cross-Validation Score

In [None]:
# Accuracy of the decision tree under 3-fold cross-validation on the full data
score = cross_val_score(estimator=dt_clf, X=X, y=y, scoring="accuracy", cv=3)
print('accuracy per cv:', np.round(score, 4))
print('average of accuracies:', np.round(score.mean(), 4))

### Regression

In [None]:
# Random forest regressor: 1000 trees, fixed seed
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
)

In [None]:
# 5-fold cross-validation of the random forest.
# The target is the continuous 'Range[km]' column of `cars` rather than `y`:
# `y` holds the normalized range truncated to int for the classifier, which
# discards most of the information and gives an RMSE that is not in km.
# `cars` and `X` share the same row order, so features and target stay aligned.
neg_mse_scores = cross_val_score(rf, X, cars['Range[km]'], scoring='neg_mean_squared_error', cv=5)
# The scorer returns the MSE negated, so flip the sign before taking the root
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

In [None]:
# Report the per-fold negative MSE, the per-fold RMSE and the average RMSE
# of the 5-fold cross-validation (values rounded for display only)
print('Individual Negative MSE scores per 5 cross value scores: ', np.round(neg_mse_scores, 2))
print('Individual RMSE scores per 5 cross value scores: ', np.round(rmse_scores, 2))
print('Average RMSE scores per 5 cross value scores: {0:.3f}'.format(avg_rmse))