In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_file_paths = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Finding Outliers in the EV dataset
### Battery level and estimated vehicle range
Source: vmarrapu@gmail.com (handwritten based on datasets after analyzing real EV vehicle datasets)
References: https://machinelearningmastery.com/model-based-outlier-detection-and-removal-in-python/
plus Janani Ravi's Pluralsight courses.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Read the battery-level / estimated-range CSV from the Kaggle input directory
# and preview its first ten rows.
DATA_PATH = '../input/ev-battery-level-vs-distance-approximation/ev_battery_vs_distance.csv'
batt_range_df = pd.read_csv(DATA_PATH)
batt_range_df.head(10)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
batt_range_df.shape

In [None]:
# Column labels available in the loaded frame.
batt_range_df.columns

In [None]:
# Feature matrix used by every detector in this notebook: battery level and
# the estimated range, in that column order.
X = batt_range_df[['BATT_LEVEL', 'RANGE_EST']]

# Preview up to ten random rows. The seed keeps the displayed sample stable
# across "Restart & Run All", and the min() guard avoids a ValueError when the
# frame holds fewer than ten rows.
X.sample(n=min(10, len(X)), random_state=42)

In [None]:
# Raw view of the data before any outlier detection: one blue marker per row,
# battery level on the x-axis and estimated range on the y-axis.
fig, ax = plt.subplots(figsize=(12, 10))

battery_level = X['BATT_LEVEL']
range_estimate = X['RANGE_EST']
ax.scatter(battery_level, range_estimate, s=100, color='blue')

ax.set_xlabel("Battery Level")
ax.set_ylabel("Distance Estd")

plt.show()

In [None]:
from sklearn.covariance import EllipticEnvelope

# Fit an elliptic envelope on (BATT_LEVEL, RANGE_EST) and label every row:
# fit_predict returns +1 for inliers and -1 for outliers.
# contamination=0.1 sets the expected share of outliers to 10%.
# random_state is pinned because the robust covariance fit draws random
# subsets of the data; without a seed the flagged rows can change between runs.
ee = EllipticEnvelope(support_fraction=None, contamination=0.1, random_state=42)
y_pred = ee.fit_predict(X)

y_pred

In [None]:
# Rows labelled -1 by the elliptic envelope are its outliers; count them.
is_outlier = y_pred == -1
n_outliers = is_outlier.sum()
n_outliers

In [None]:
# Evaluate the fitted envelope on a 500 x 500 grid spanning the observed range
# of both features so its decision boundary can be drawn.
xx, yy = np.meshgrid(np.linspace(X['BATT_LEVEL'].min(), X['BATT_LEVEL'].max(), 500),
                     np.linspace(X['RANGE_EST'].min(), X['RANGE_EST'].max(), 500))

# The model was fitted on a DataFrame, so score a DataFrame carrying the same
# column names; passing a bare ndarray makes scikit-learn emit a
# "X does not have valid feature names" warning.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X.columns)

# decision_function is continuous and crosses zero exactly on the
# inlier/outlier boundary, which gives a smoother level-0 contour than
# contouring the hard -1/+1 labels returned by predict.
Z = ee.decision_function(grid).reshape(xx.shape)

plt.figure(figsize=(12, 10))

plt.title('EllipticEnvelope', size=18)

# Index 0 (red) is used for outliers (-1) and index 1 (blue) for inliers (+1).
# Later plotting cells reuse this palette.
colors = np.array(['r', 'b'])

plt.scatter(X['BATT_LEVEL'],
            X['RANGE_EST'],
            s=100,
            color=colors[(y_pred + 1) // 2])  # maps -1 -> 0 and +1 -> 1

plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')

plt.xlabel("Battery Level")
plt.ylabel("Distance Estd")

plt.show()

In [None]:
from sklearn.ensemble import IsolationForest

# Fit an isolation forest on (BATT_LEVEL, RANGE_EST) and label every row:
# fit_predict returns +1 for inliers and -1 for outliers.
# contamination=0.08 sets the expected share of outliers to 8%.
# random_state is pinned because the trees are built from random splits and
# sub-samples; without a seed the flagged rows change between runs.
isf = IsolationForest(contamination=0.08, random_state=42)
y_pred = isf.fit_predict(X)

y_pred


In [None]:
# Tally the rows the isolation forest labelled as outliers (-1).
flagged = (y_pred == -1)
n_outliers = flagged.sum()
n_outliers

In [None]:
# Evaluate the fitted isolation forest on a 500 x 500 grid spanning the
# observed range of both features so its decision boundary can be drawn.
xx, yy = np.meshgrid(np.linspace(X['BATT_LEVEL'].min(), X['BATT_LEVEL'].max(), 500),
                     np.linspace(X['RANGE_EST'].min(), X['RANGE_EST'].max(), 500))

# The model was fitted on a DataFrame, so score a DataFrame carrying the same
# column names; passing a bare ndarray makes scikit-learn emit a
# "X does not have valid feature names" warning.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X.columns)

# decision_function is continuous and negative for outliers, so its level-0
# contour is the inlier/outlier boundary (smoother than contouring -1/+1 labels).
Z = isf.decision_function(grid).reshape(xx.shape)

plt.figure(figsize=(15, 8))

plt.title('IsolationForest', size=18)

# Defined here so the cell does not depend on an earlier plotting cell:
# index 0 (red) = outlier (-1), index 1 (blue) = inlier (+1).
colors = np.array(['r', 'b'])

plt.scatter(X['BATT_LEVEL'],
            X['RANGE_EST'],
            s=100,
            color=colors[(y_pred + 1) // 2])  # maps -1 -> 0 and +1 -> 1

plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')

plt.xlabel("Battery Level")
plt.ylabel("Distance Estd")

plt.show()

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor detector configured with 15 neighbours; the
# contamination threshold is left on 'auto'. Fitting happens in the next cell.
lof_params = {'n_neighbors': 15, 'contamination': 'auto'}
lof = LocalOutlierFactor(**lof_params)

In [None]:
# Fit the LOF detector on X and label every row in one step. This overwrites
# the y_pred produced by the previous detector; -1 marks an outlier.
y_pred = lof.fit_predict(X)

y_pred

In [None]:
# Number of rows LOF labelled as outliers (-1).
lof_outlier_mask = y_pred == -1
n_outliers = lof_outlier_mask.sum()
n_outliers

In [None]:
# Scatter of the data coloured by the LOF labels currently held in y_pred.
plt.figure(figsize=(12, 10))

plt.title('Local Outlier Factor', size=18)

# Defined here so the cell does not depend on an earlier plotting cell:
# index 0 (red) = outlier (-1), index 1 (blue) = inlier (+1).
colors = np.array(['r', 'b'])

plt.scatter(X['BATT_LEVEL'],
            X['RANGE_EST'],
            s=100,
            color=colors[(y_pred + 1) // 2])  # maps -1 -> 0 and +1 -> 1

# Axis labels so the figure can be read on its own.
plt.xlabel("Battery Level")
plt.ylabel("Distance Estd")

plt.show()

In [None]:
# Per-row scores stored on the fitted LOF model (`negative_outlier_factor_`);
# preview the first ten. The following plot draws larger circles for lower scores.
X_scores = lof.negative_outlier_factor_
X_scores[0:10]

In [None]:
# Visualise the LOF scores: every row gets a red circle whose area grows as its
# score drops, so the most anomalous rows stand out.
plt.figure(figsize=(15, 8))

plt.title('Local Outlier Factor scores', size=18)

# Rescale scores to [0, 1]: 0 for the highest score, 1 for the lowest.
# Guard the denominator so identical scores give radius 0 instead of NaN
# from a 0/0 division.
score_span = X_scores.max() - X_scores.min()
if score_span > 0:
    radius = (X_scores.max() - X_scores) / score_span
else:
    radius = np.zeros_like(X_scores, dtype=float)

# Draw the rows themselves first; a radius of 0 would otherwise leave the
# best-scoring rows invisible.
plt.scatter(X['BATT_LEVEL'],
            X['RANGE_EST'],
            s=10, color='k',
            label='Data points')

plt.scatter(X['BATT_LEVEL'],
            X['RANGE_EST'],
            s=1000 * radius, edgecolors='r',
            facecolors='none',
            label='Outlier scores')

plt.xlabel("Battery Level")
plt.ylabel("Distance Estd")

# The label= arguments above are only rendered once a legend is requested.
plt.legend()

plt.show()