# Unsupervised anomaly detection with One-Class SVM

## Setup

In [1]:
import numpy as np
import pandas as pd

import sqlite3

# Load every login attempt recorded during 2018, indexed by timestamp.
# NOTE: `with sqlite3.connect(...)` only wraps a transaction (commit/rollback);
# it does not close the connection, so we close it explicitly here.
conn = sqlite3.connect('../../ch_11/logs/logs.db')
try:
    logs_2018 = pd.read_sql(
        # Single quotes are SQL string literals (double quotes denote
        # identifiers), and the half-open range keeps a row stamped exactly
        # 2019-01-01 out of the 2018 data, which BETWEEN would include.
        """
        SELECT * 
        FROM logs 
        WHERE datetime >= '2018-01-01' AND datetime < '2019-01-01';
        """, 
        conn, parse_dates=['datetime'], index_col='datetime'
    )
finally:
    conn.close()
logs_2018.head()

Unnamed: 0_level_0,source_ip,username,success,failure_reason
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:06:19.353126,223.178.55.3,djones,1,
2018-01-01 00:09:07.147971,223.178.55.3,djones,1,
2018-01-01 01:08:08.610041,6.252.142.27,asmith,1,
2018-01-01 02:37:50.329298,124.178.25.98,akim,1,
2018-01-01 02:45:20.382080,98.43.141.103,akim,1,


In [2]:
def get_X(log, day):
    """
    Get data we can use for the X

    Keeps only the failed login attempts in the requested slice, resamples
    them into 1-minute bins counting the distinct usernames with failures,
    and one-hot encodes the day of week and hour of each bin.
    
    Parameters:
        - log: The logs dataframe; it must be indexed by datetime and have
               `username` and `success` columns (`success` is treated as 0/1)
        - day: A day or single value we can use as a datetime index slice
    
    Returns: 
        A pandas DataFrame with a `usernames_with_failures` column plus
        `day_of_week_*` and `hour_*` dummy columns for the values present
    """
    failed_attempts = (
        # Use `.loc` for partial-string row selection: `log[day]` was
        # deprecated in pandas 1.2 and is a column lookup (KeyError) in 2.0+.
        log.loc[day]
        .assign(failures=lambda x: 1 - x.success)
        .query('failures > 0')
    )

    per_minute = (
        failed_attempts
        .resample('1min')
        .agg({'username': 'nunique', 'failures': 'sum'})
        .dropna()
        .rename(columns={'username': 'usernames_with_failures'})
        .assign(
            day_of_week=lambda x: x.index.dayofweek,
            hour=lambda x: x.index.hour
        )
        # the failure count was only needed for filtering/aggregation
        .drop(columns=['failures'])
    )

    return pd.get_dummies(per_minute, columns=['day_of_week', 'hour'])

In [3]:
# Build the feature matrix from the January 2018 slice of the logs
X = get_X(log=logs_2018, day='2018-01')
X.columns

Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')

## One-class SVM

In [4]:
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit a one-class SVM with its default settings.
# `random_state` is intentionally not passed: OneClassSVM deprecated it in
# scikit-learn 0.20 (the solver is not random) and later releases reject it
# with a TypeError.
one_class_svm_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('svm', OneClassSVM())
]).fit(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [5]:
# predict() labels each row -1 (outlier) or 1 (inlier); tally both groups
preds = one_class_svm_pipeline.predict(X)
pd.Series(
    np.where(preds != -1, 'inlier', 'outlier')
).value_counts()

  Xt = transform.transform(Xt)


outlier    22823
inlier     18794
dtype: int64