In [None]:
import numpy as np
import pandas as pd
import random
import math
import sklearn.datasets as ds
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler

%matplotlib inline

In [None]:
# Load the diamonds dataset; the first CSV column is used as the row index.
X = pd.read_csv('../data/diamonds.csv', index_col=0)

In [None]:
# (rows, columns) of the loaded frame.
X.shape

In [None]:
# Preview the first rows of the loaded frame.
X.head()

In [None]:
# Number of rows per value of the 'cut' column.
X['cut'].value_counts()

In [None]:
# Encode the 'cut' labels as integers without mutating the loaded column.
cut_codes = X['cut'].map({'Fair':0, 'Good':1, 'Ideal':3, 'Premium':4, 'Very Good':2})

# Keep only the two classes encoded as 0 and 1 ('Fair' and 'Good').
is_kept = cut_codes < 2
y = cut_codes[is_kept].reset_index(drop=True)

# Features: numeric columns that contain no missing values, without the label.
# `select_dtypes` is the public replacement for the private
# `DataFrame._get_numeric_data`; 'bool' is listed so the same columns qualify.
X = (
    X[is_kept]
    .reset_index(drop=True)
    .select_dtypes(include=['number', 'bool'])
    .dropna(axis=1)
    .drop('cut', axis=1, errors='ignore')
)

In [None]:
# Min-max scale the features; the fitted scaler stays available as `ms`.
ms = MinMaxScaler()
X_ms = ms.fit_transform(X.astype(float))

In [None]:
from sklearn.manifold import TSNE
# Fixed random_state so the embedding is the same on every run;
# verbose=1 prints progress while fitting.
tsne = TSNE(random_state=17, verbose=1)

# Embed the min-max scaled features; the first two columns of `res`
# are used as plot coordinates in the scatter cells below.
res = tsne.fit_transform(X_ms)

In [None]:
# t-SNE embedding coloured by the true class label `y`.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res[:, 0], res[:, 1], c=y, s=10, cmap='Set1')
ax.set_title('TSNE')

In [None]:
# Pairwise scatter plots of all features before any outlier removal.
sns.pairplot(pd.DataFrame(X))

## Статистические тесты

In [None]:
def mad_based_outlier(data, thresh=3.5):
    """Flag outliers with a modified z-score based on the median absolute deviation.

    Parameters
    ----------
    data : array-like
        Either a 1-D sequence of observations or a 2-D array of shape
        (n_observations, n_features). NumPy arrays, pandas Series and
        plain lists are accepted.
    thresh : float, default 3.5
        Observations whose modified z-score exceeds this value are flagged.

    Returns
    -------
    numpy.ndarray of bool
        One flag per observation; True marks an outlier.
    """
    # Work on a plain ndarray: `series[:, None]` is rejected by recent pandas.
    points = np.asarray(data)
    if points.ndim == 1:
        points = points[:, None]

    median = np.median(points, axis=0)
    # Euclidean distance of every observation from the per-feature median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # A zero MAD turns the ratio into inf (distance > 0) or nan (distance == 0).
    # The comparison below then flags exactly the points that differ from the
    # median, so only the floating-point warnings need to be silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        modified_z_score = 0.6745 * diff / med_abs_deviation
        return modified_z_score > thresh

def percentile_based_outlier(data, threshold=99):
    """Flag values lying outside the central `threshold` percent of `data`.

    The remaining (100 - threshold) percent is split evenly between the two
    tails; anything strictly below the lower cut or strictly above the upper
    cut is marked True.
    """
    tail = (100 - threshold) / 2.0
    lower, upper = np.percentile(data, [tail, 100 - tail])
    too_low = data < lower
    too_high = data > upper
    return too_low | too_high

def z_score_outlier(data, threshold=3):
    """Flag values whose z-score is larger than `threshold` in absolute value.

    The z-score is computed with the input's own `mean()` and `std()` methods.
    """
    standardized = (data - data.mean()) / data.std()
    return np.abs(standardized) > threshold
    
def plot(x):
    """Compare the three outlier tests on one sample.

    Creates a figure with three stacked axes, one per detector
    (percentile-, MAD- and z-score-based). Each axis shows the density and
    rug of `x`; the points the detector flags are drawn as red dots at y=0.

    Parameters
    ----------
    x : numpy.ndarray
        1-D sample; it is indexed with the boolean mask each detector returns.
    """
    fig, axes = plt.subplots(nrows=3, figsize=(10, 10))
    for ax, func in zip(axes, [percentile_based_outlier, mad_based_outlier, z_score_outlier]):
        # `sns.distplot` is deprecated; a KDE curve plus a rug plot draws the
        # same picture as distplot(hist=False, rug=True).
        sns.kdeplot(x, ax=ax, color='C0')
        sns.rugplot(x, ax=ax, color='C0')
        outliers = x[func(x)]
        # clip_on=False keeps the markers visible on the axis boundary.
        ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)

    kwargs = dict(y=0.95, x=0.05, ha='left', va='top')
    axes[0].set_title('Percentile-based Outliers', **kwargs)
    axes[1].set_title('MAD-based Outliers', **kwargs)
    axes[2].set_title('Z_score-based Outliers', **kwargs)
    fig.suptitle('Comparing Outlier Tests with n={}'.format(len(x)), size=14)

np.random.seed(42)
for num in [10, 50, 100, 1000, 7000]:
    # Generate the data: num-3 draws from a normal distribution
    # with mean 0 and standard deviation 0.5.
    inliers = np.random.normal(0, 0.5, num-3)

    # Append three outliers so that the sample has `num` points in total.
    x = np.concatenate([inliers, [-3, -10, 12]])
    plot(x)

plt.show()

In [None]:
# Standardise the features; the fitted scaler stays available as `ss`.
ss = StandardScaler()
X_transform = ss.fit_transform(X.astype(float))

In [None]:
# A row is an outlier when at least one standardised feature lies
# more than 3 units away from zero, in either direction.
outlier_indices = (np.abs(X_transform) > 3).any(axis=1)
y_pred = outlier_indices.astype(int)

In [None]:
# t-SNE embedding coloured by the outlier flag computed in the previous cell.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res[:, 0], res[:, 1], c=y_pred, s=10, cmap='Set1')
ax.set_title('TSNE')

In [None]:
# Keep only the rows that were not flagged (flag value below 1).
X_transform = X.loc[y_pred < 1].reset_index(drop=True)

X.shape, X_transform.shape

In [None]:
# Pairwise scatter plots of the rows kept by the previous cell.
sns.pairplot(pd.DataFrame(X_transform))

In [None]:
X_transform = X.copy()
# The percentile levels are the same for every column, so compute them once.
tail = (100 - 99) / 2.0
for col in X_transform:
    minval, maxval = np.percentile(X_transform[col], [tail, 100 - tail])
    # NOTE(review): the threshold is half of the 0.5-99.5 percentile range in
    # the column's own units, while mad_based_outlier compares it against a
    # unitless modified z-score - confirm this is intended.
    threshold = (maxval-minval)/2
    print(threshold)
    # Pass a plain ndarray: mad_based_outlier indexes its input with
    # `[:, None]`, which recent pandas rejects for a Series.
    # Each column is replaced by its 0/1 outlier flag.
    X_transform[col] = mad_based_outlier(X_transform[col].values, thresh=threshold).astype(int)

In [None]:
# A row is an outlier when any of its per-column flags is set.
outlier_indices = X_transform.astype(bool).any(axis=1)
y_pred = outlier_indices.astype(int)

In [None]:
# t-SNE embedding coloured by the combined per-column outlier flag.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res[:, 0], res[:, 1], c=y_pred, s=10, cmap='Set1')
ax.set_title('TSNE')

In [None]:
# Keep only the rows that were not flagged (flag value below 1).
X_transform = X.loc[y_pred < 1].reset_index(drop=True)

X.shape, X_transform.shape

In [None]:
# Pairwise scatter plots of the rows kept by the previous cell.
sns.pairplot(pd.DataFrame(X_transform))

In [None]:
X_transform = X.copy()
for col in X_transform:
    # `sns.distplot` is deprecated; draw the density histogram and the KDE
    # curve explicitly, one figure per feature.
    fig, ax = plt.subplots()
    ax.hist(X_transform[col], bins='auto', density=True, alpha=0.4, color='C0')
    sns.kdeplot(X_transform[col], ax=ax, color='C0')
    ax.set_xlabel(col)
    plt.show()

# Local Outlier Factor

![](https://upload.wikimedia.org/wikipedia/commons/5/59/LOF.svg)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# The detector is fitted on standardised features.
ss = StandardScaler()
X_transform = ss.fit_transform(X.astype(float))

clf = LocalOutlierFactor(n_neighbors=5, contamination='auto')
y_pred = clf.fit_predict(X_transform)

# Show which distinct labels the detector produced.
set(y_pred)

In [None]:
# t-SNE embedding coloured by the LocalOutlierFactor label.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res[:, 0], res[:, 1], c=y_pred, s=10, cmap='Set1')
ax.set_title('TSNE')

In [None]:
# Keep only the rows with a positive label.
X_transform = X.loc[y_pred > 0].reset_index(drop=True)

X.shape, X_transform.shape

In [None]:
# Pairwise scatter plots of the rows kept by the previous cell.
sns.pairplot(pd.DataFrame(X_transform))

## OneClassSVM

![](https://scikit-learn.org/stable/_images/sphx_glr_plot_anomaly_comparison_001.png)

In [None]:
from sklearn.svm import OneClassSVM

# The detector is fitted on standardised features.
ss = StandardScaler()
X_transform = ss.fit_transform(X.astype(float))

# Kernel choices: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'.
# NOTE(review): nu=0.97 is very close to 1 - confirm against the OneClassSVM
# documentation that this is the intended setting.
clf = OneClassSVM(kernel='linear', nu=0.97)
y_pred = clf.fit_predict(X_transform)

# Show which distinct labels the detector produced.
set(y_pred)

In [None]:
# t-SNE embedding coloured by the OneClassSVM label.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res[:, 0], res[:, 1], c=y_pred, s=10, cmap='Set1')
ax.set_title('TSNE')

In [None]:
# Keep only the rows with a negative label.
# NOTE(review): the other detector cells in this notebook keep the rows with
# a positive label instead - confirm the opposite sign here is intended.
X_transform = X.loc[y_pred < 0].reset_index(drop=True)

X.shape, X_transform.shape

In [None]:
# Pairwise scatter plots of the rows kept by the previous cell.
sns.pairplot(pd.DataFrame(X_transform))

## IsolationForest

In [None]:
from sklearn.ensemble import IsolationForest

# The former `behaviour='new'` argument is gone: the keyword was removed from
# scikit-learn and passing it raises a TypeError on current releases.
# The forest is fitted on the unscaled features.
clf = IsolationForest(contamination='auto',
                      random_state=42, bootstrap=False, max_features=0.6,
                      n_jobs=-1, n_estimators=1000)
y_pred = clf.fit_predict(X)

# Show which distinct labels the detector produced.
set(y_pred)

In [None]:
# t-SNE embedding coloured by the IsolationForest label.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res[:, 0], res[:, 1], c=y_pred, s=10, cmap='Set1')
ax.set_title('TSNE')

In [None]:
# Keep only the rows with a positive label.
X_transform = X.loc[y_pred > 0].reset_index(drop=True)

X.shape, X_transform.shape

In [None]:
# Pairwise scatter plots of the rows kept by the previous cell.
sns.pairplot(pd.DataFrame(X_transform))

## EllipticEnvelope

In [None]:
from sklearn.covariance import EllipticEnvelope

# The detector is fitted on standardised features.
ss = StandardScaler()
X_transform = ss.fit_transform(X.astype(float))

clf = EllipticEnvelope(random_state=42, contamination=0.05)
y_pred = clf.fit_predict(X_transform)

# Show which distinct labels the detector produced.
set(y_pred)

In [None]:
# t-SNE embedding coloured by the EllipticEnvelope label.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res[:, 0], res[:, 1], c=y_pred, s=10, cmap='Set1')
ax.set_title('TSNE')

In [None]:
# Keep only the rows with a positive label.
X_transform = X.loc[y_pred > 0].reset_index(drop=True)

X.shape, X_transform.shape

In [None]:
# Pairwise scatter plots of the rows kept by the previous cell.
sns.pairplot(pd.DataFrame(X_transform))

# В качестве классификатора

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the embedding, the features and the labels with the same row
# assignment. random_state is fixed so the split, and every number derived
# from it, is reproducible across kernel restarts.
res_train, res_test, X_train, X_test, y_train, y_test = train_test_split(
    res, X, y, test_size=0.3, random_state=42
)

In [None]:
# Distinct class labels present in the training split.
set(y_train)

In [None]:
# Fit the scaler on the training rows only, then apply the same
# transformation to the held-out rows.
ss = StandardScaler()
X_train = ss.fit_transform(X_train.astype(float))
X_test = ss.transform(X_test.astype(float))

In [None]:
# Model class 0 only: the envelope is fitted on the training rows labelled 0.
clf = EllipticEnvelope(random_state=42)
clf.fit(X_train[y_train == 0])

# EllipticEnvelope.predict labels inliers +1 and outliers -1, whereas the
# targets are 0/1. Translate before scoring: an inlier of the class-0 model
# is predicted as class 0, everything else as class 1.
inlier_flags = clf.predict(X_test)
y_pred = np.where(inlier_flags == 1, 0, 1)

accuracy_score(y_test, y_pred)


In [None]:
# Held-out part of the t-SNE embedding coloured by the predicted label.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(res_test[:, 0], res_test[:, 1], c=y_pred, s=10, cmap='Set1')
ax.set_title('TSNE')