# Finding Outliers with k-means

## Setup

In [1]:
import numpy as np
import pandas as pd

import sqlite3

# sqlite3's connection context manager only commits or rolls back the
# transaction; it does not close the connection. Close it explicitly so the
# database handle is not left open for the lifetime of the kernel.
conn = sqlite3.connect('../../ch_11/logs/logs.db')
try:
    # Single quotes mark SQL string literals; double quotes denote identifiers
    # and are only treated as strings by SQLite as a legacy fallback.
    logs_2018 = pd.read_sql(
        """
        SELECT * 
        FROM logs 
        WHERE datetime BETWEEN '2018-01-01' AND '2019-01-01';
        """, 
        conn, parse_dates=['datetime'], index_col='datetime'
    )
finally:
    conn.close()
logs_2018.head()

Unnamed: 0_level_0,source_ip,username,success,failure_reason
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:05:32.988414,223.178.55.3,djones,1,
2018-01-01 00:08:00.343636,223.178.55.3,djones,0,error_wrong_password
2018-01-01 00:08:01.343636,223.178.55.3,djones,1,
2018-01-01 01:06:59.640823,208.101.11.88,wbrown,1,
2018-01-01 02:40:47.769630,11.76.99.35,tkim,1,


The `get_X()` function from the chapter:

In [2]:
def get_X(log, day):
    """
    Build the per-minute feature matrix of failed login activity.

    Only failed attempts (rows where `1 - success` is positive) are kept.
    They are resampled into 1-minute bins counting the distinct usernames
    per bin, and the day of week and hour of each bin are one-hot encoded.

    Parameters:
        - log: The logs dataframe, indexed by datetime, with `username`
               and `success` columns
        - day: A day or single value we can use as a datetime index slice

    Returns:
        A `pandas.DataFrame` object indexed by minute with a
        `usernames_with_failures` column plus `day_of_week_*` and `hour_*`
        dummy columns. Dummy columns are only created for the values
        present in the selected slice.
    """
    # keep only the failed attempts within the requested slice
    failed_logins = log.loc[day].assign(
        failures=lambda attempts: 1 - attempts.success
    ).query('failures > 0')

    # one row per minute: distinct usernames and total failures
    per_minute = failed_logins.resample('1min').agg(
        {'username': 'nunique', 'failures': 'sum'}
    ).dropna()

    features = per_minute.rename(
        columns={'username': 'usernames_with_failures'}
    )
    # the failure total was only needed for the aggregation step
    features = features.assign(
        day_of_week=features.index.dayofweek,
        hour=features.index.hour
    ).drop(columns=['failures'])

    return pd.get_dummies(features, columns=['day_of_week', 'hour'])

Get the 2018 data:

In [3]:
# Build the feature matrix from every failed-login minute in 2018
X = get_X(log=logs_2018, day='2018')
X.columns

Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')

## k-means
Since we want a "normal" activity cluster and an "anomaly" cluster, we need to make 2 clusters.

In [4]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then split the observations into two clusters.
# n_init is pinned to 10 (the historical default) because scikit-learn 1.4
# changed the default to 'auto'; leaving it implicit would make the clustering
# depend on the installed scikit-learn version.
kmeans_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('kmeans', KMeans(n_clusters=2, n_init=10, random_state=0))
]).fit(X)

The cluster label doesn't mean anything to us, but we can examine the size of each cluster. We don't expect the clusters to be of equal size because anomalous activity doesn't happen as often as normal activity (we presume).

In [5]:
# Assign every row of X to one of the two fitted clusters
preds = kmeans_pipeline.predict(X)
# Count how many observations landed in each cluster
pd.Series(preds).value_counts()

0    431902
1     93660
dtype: int64

### Evaluating the clustering
#### Step 1: Get the true labels

In [6]:
# sqlite3's connection context manager only commits or rolls back the
# transaction; it does not close the connection. Close it explicitly so the
# database handle is not left open for the lifetime of the kernel.
conn = sqlite3.connect('../../ch_11/logs/logs.db')
try:
    # Single quotes mark SQL string literals; double quotes denote identifiers
    # and are only treated as strings by SQLite as a legacy fallback.
    hackers_2018 = pd.read_sql(
        "SELECT * FROM attacks WHERE start BETWEEN '2018-01-01' AND '2019-01-01';", 
        conn, parse_dates=['start', 'end']
    ).assign(
        duration=lambda x: x.end - x.start, 
        # widen each attack to whole minutes so it lines up with the
        # 1-minute resolution of the feature matrix
        start_floor=lambda x: x.start.dt.floor('min'),
        end_ceil=lambda x: x.end.dt.ceil('min')
    )
finally:
    conn.close()

The `get_y()` function from the chapter:

In [7]:
def get_y(datetimes, hackers, resolution='1min'):
    """
    Get data we can use for the y (whether or not a hacker attempted a log in during that time).
    
    Parameters:
        - datetimes: The datetimes to check for hackers
        - hackers: The dataframe indicating when the attacks started and stopped.
                   Must have `start_floor` and `end_ceil` columns; may be empty.
        - resolution: The granularity of the datetime. Default is 1 minute.
        
    Returns:
        `pandas.Series` of Booleans, one per entry of `datetimes`, that is
        `True` where the datetime falls on one of the `resolution`-spaced
        points between an attack's `start_floor` and `end_ceil` (inclusive).
    """
    # Expand every attack into the timestamps it covers. Iterating the two
    # columns directly (rather than a row-wise `apply`) also works when
    # `hackers` has no rows.
    attack_windows = [
        pd.date_range(start, end, freq=resolution)
        for start, end in zip(hackers.start_floor, hackers.end_ceil)
    ]
    if not attack_windows:
        # no attacks recorded, so no datetime can match
        return datetimes.isin([])

    # Combine all windows in a single step instead of growing a Series with
    # repeated `pd.concat` calls, which re-copies the data on every iteration.
    attack_times = attack_windows[0].append(attack_windows[1:])
    return datetimes.isin(attack_times)

Get the true labels:

In [8]:
# Label each minute in X by whether it falls inside a recorded attack window
is_hacker = get_y(datetimes=X.reset_index()['datetime'], hackers=hackers_2018)

#### Step 2: Calculate the Fowlkes-Mallows score
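The Fowlkes-Mallows score is the geometric mean of the pairwise precision and recall: it measures how often pairs of observations that share a cluster in the true labels also share a cluster in the predicted labels. It ranges from 0 to 1, where 1 means the two labelings agree perfectly.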

In [9]:
from sklearn.metrics import fowlkes_mallows_score

# Compare the true attack labels against the cluster assignments
fowlkes_mallows_score(labels_true=is_hacker, labels_pred=preds)

0.8403978810256895