# 4 Predictive Analytic tasks: Anomaly
Perform anomaly detection on the number of requests per department per day in the first 6 months (creation dates before 2020-07-01).


In [None]:
import sqldf
import pandas as pd
import geopandas as gpd
import sklearn as sk
import helper.functions as hf
from datetime import datetime
import holidays

In [None]:
# make logger
# Record when this notebook run began; datetime.now() without a tz argument
# returns a naive local-time timestamp.
start = datetime.now()
# Create the logger through the project helper, passing this notebook's name.
# NOTE(review): what make_logger configures (handlers, output location) is
# defined in helper.functions and not visible here - confirm there.
logger = hf.make_logger('4-predictive_analytic_tasks_anomaly')

In [None]:
# Read the raw service-request extract into `df`.
# The two identifier columns are forced to str, and the two timestamp columns
# are handed to the CSV parser as date columns.
date_cols = ['creation_timestamp', 'completion_timestamp']
col_types = dict(notification_number=str, reference_number=str)
df = pd.read_csv(
    'data/raw/sr_hex.csv',
    dtype=col_types,
    parse_dates=date_cols,
)
# Calendar day of creation (time of day dropped), converted back to a
# datetime column so it can be grouped and compared as a date.
df['date'] = df['creation_timestamp'].dt.date
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Daily count of distinct requests per department for creation dates before
# 2020-07-01 (the first six months); these are the rows that get scored.
# The date filter is a WHERE clause so rows are dropped before grouping, and
# ORDER BY is the last clause: SQL does not accept HAVING after ORDER BY.
query = '''
select department,
       date,
       count(distinct notification_number) req
from df
where date < '2020-07-01'
group by department, date
order by department, date
'''

# `sqldf` is imported as a module, which is not callable; the query has to be
# executed through the module's run() function.
df_test = sqldf.run(query)

In [None]:
# Daily count of distinct requests per department for creation dates on or
# after 2020-07-01; these rows are used to fit the per-department models.
# The date filter is a WHERE clause so rows are dropped before grouping, and
# ORDER BY is the last clause: SQL does not accept HAVING after ORDER BY.
query = '''
select department,
       date,
       count(distinct notification_number) req
from df
where date >= '2020-07-01'
group by department, date
order by department, date
'''

# `sqldf` is imported as a module, which is not callable; the query has to be
# executed through the module's run() function.
df_train = sqldf.run(query)

In [None]:
from sklearn.ensemble import IsolationForest as IF

# Fit one IsolationForest per department on the training counts (`req`) and
# score that department's rows in df_test. Each department contributes one
# result frame to the list `res`.
res = []

# Group by the plain column name rather than a one-element list, so `name` is
# the department value itself. With a list key, recent pandas versions yield
# 1-tuples, which would never match the equality filter on df_test below.
g = df_train.groupby('department')
for name, data in g:
    x_test = df_test[df_test['department'] == name]
    if x_test.empty:
        # Nothing to score for this department; predict() rejects empty input.
        continue

    x_train = data[['req']]
    clf = IF(max_samples=100)
    clf.fit(x_train)

    # Take the dates as a 1-D array: a one-column DataFrame is not a valid
    # column value, and an array keeps all columns positionally aligned.
    x_dates = x_test['date'].to_numpy()
    x_test = x_test[['req']]
    pred = pd.DataFrame({'department': name,
                         'date': x_dates,
                         # predict(): +1 for inliers, -1 for outliers
                         'pred': clf.predict(x_test),
                         # decision_function(): negative values are outliers
                         'decision_function': clf.decision_function(x_test),
                         # score_samples(): the lower, the more abnormal
                         'score_samples': clf.score_samples(x_test)})
    res.append(pred)

In [None]:
# Stack the per-department result frames on top of each other. They all share
# the same columns, so they must be concatenated along the row axis; axis=1
# would place them side by side with repeated column names.
# ignore_index gives the combined frame one clean 0..n-1 index.
res = pd.concat(res, axis=0, ignore_index=True)