# 4 Predictive Analytic tasks: Anomaly
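Perform anomaly detection on the number of service requests per department per day for the first six months of the data (dates before 2020-07-01), using one model per department fitted on the daily counts from 2020-07-01 onwards.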


In [1]:
import sqldf
import pandas as pd
import helper.functions as hf
from datetime import datetime

In [2]:
# Create the logger used by this notebook, named after the notebook itself.
logger_name = '4-predictive_analytic_tasks_anomaly'
logger = hf.make_logger(logger_name)

In [3]:
# Load the raw service-request extract.
# The two identifier columns are read as strings and the two timestamp columns are parsed as datetimes.
date_cols = ['creation_timestamp', 'completion_timestamp']
col_types = dict.fromkeys(['notification_number', 'reference_number'], str)
df = pd.read_csv('data/raw/sr_hex.csv', dtype=col_types, parse_dates=date_cols)

# Day-level key for the per-day counts: the calendar date of creation_timestamp,
# converted back to a datetime column.
creation_day = df['creation_timestamp'].dt.date
df['date'] = pd.to_datetime(creation_day, format='%Y-%m-%d')

In [4]:
# Daily request counts per department for the scoring window (dates before 2020-07-01).
# The row-level filters live in WHERE so rows are discarded before grouping;
# HAVING is meant for conditions on aggregated values.
query = '''
select department,
       date,
       count(distinct notification_number) req
from df
where date < '2020-07-01' and department is not null
group by department, date
order by department, date
'''

df_test = sqldf.run(query)

In [5]:
# Daily request counts per department for the training window (dates from 2020-07-01 onwards).
# The row-level filters live in WHERE so rows are discarded before grouping;
# HAVING is meant for conditions on aggregated values.
query = '''
select department,
       date,
       count(distinct notification_number) req
from df
where date >= '2020-07-01' and department is not null
group by department, date
order by department, date
'''

df_train = sqldf.run(query)

In [6]:
from sklearn.ensemble import IsolationForest as IF

# Tunable settings for the per-department models.
CONTAMINATION = 0.05   # assumed share of anomalous days; a parameter we could play with
MIN_TRAIN_DAYS = 5     # a department needs more than this many training days to get a model
RANDOM_STATE = 42      # fixed seed so the flagged days are identical on every run

res = []

# Split the scoring rows by department once instead of re-filtering df_test on every iteration.
test_by_department = {dept: rows for dept, rows in df_test.groupby('department')}

# Group by the bare column name rather than a one-element list: with a list, pandas >= 2.0
# yields 1-tuple keys, which would break the department lookup and the 'department' column below.
for name, data in df_train.groupby('department'):
    if data.shape[0] <= MIN_TRAIN_DAYS:
        continue

    x_test = test_by_department.get(name)
    if x_test is None:
        # Nothing to score for this department in the scoring window.
        continue

    # One univariate model per department, fitted on the daily request counts only.
    clf = IF(n_estimators=100, max_samples='auto', contamination=CONTAMINATION,
             max_features=1.0, bootstrap=False, n_jobs=-1, random_state=RANDOM_STATE)
    clf.fit(data[['req']].values)

    # Build the feature matrix once and reuse it for all three scoring calls.
    x_req = x_test[['req']].values
    pred = pd.DataFrame({'department': name,
                         'date': x_test['date'],
                         'pred': clf.predict(x_req),  # -1 = anomaly, 1 = normal
                         'decision_function': clf.decision_function(x_req),
                         'score_samples': clf.score_samples(x_req),
                         'req': x_test['req']})
    # Total requests for the department over the scoring window, repeated on every row.
    pred['request_cnt'] = x_test['req'].sum()
    res.append(pred)


In [7]:
# Stack the per-department prediction frames into a single frame.
# The guard keeps this cell re-runnable: once `res` is already a DataFrame,
# calling pd.concat(res) a second time would raise instead of being a no-op.
if isinstance(res, list):
    res = pd.concat(res)

# Let's see which departments had the highest share of anomalous days


In [8]:
# Per department: number of anomalous days (pred = -1), number of scored days,
# and the anomalous share as a percentage.
# Done in a single aggregate query so `sum_df` only ever holds the final summary.
# The 1.0/0.0 literals keep the sum a float, so the percentage is not integer-divided.
query = '''
select department,
       sum(case when pred = -1 then 1.0 else 0.0 end) anom_cnt,
       count(*) cnt,
       sum(case when pred = -1 then 1.0 else 0.0 end)/count(*)*100 anom_per
from res
group by department
'''

sum_df = sqldf.run(query)
sum_df.sort_values('anom_per')

Unnamed: 0,department,anom_cnt,cnt,anom_per
7,Property Management,3.0,92,3.26087
13,Technical Services,9.0,166,5.421687
11,Social Development & Early Childhood Development,9.0,139,6.47482
4,Electricity Generation and Distribution,13.0,182,7.142857
15,Valuations,9.0,103,8.737864
14,Transport Planning & Network Management,17.0,173,9.82659
3,Distribution Services,19.0,182,10.43956
6,Operational Coordination,15.0,142,10.56338
1,Commercial Services,21.0,182,11.538462
12,Solid Waste Management,22.0,182,12.087912


In [9]:
# We are looking for days with unusually high numbers of service requests:
# keep the days flagged as anomalies (pred = -1) that also have more than 2 requests.
query = '''
select *
from res
where pred = -1 and req > 2
'''

anom_df = sqldf.run(query)

In [10]:
# Pull the request-level rows from df for every (department, date) pair present in anom_df.
query = '''
select df.*
from df
inner join anom_df on anom_df.department = df.department and anom_df.date = df.date
'''

anom_line = sqldf.run(query)

In [11]:
# Break the requests on the anomalous days down by department and cause code,
# to see what drove those days.
query = '''
select department,
       cause_code,
       count(distinct notification_number) cnt
from anom_line
group by department, cause_code
'''

# Requests without a cause code are labelled 'unknown'.
sum_table = sqldf.run(query)
sum_table = sum_table.assign(cause_code=sum_table['cause_code'].fillna('unknown'))

In [12]:
import plotly.express as px

# Bar chart of request counts on anomalous days: one bar per department, coloured by cause code.
fig = px.bar(
    sum_table,
    x="department",
    y="cnt",
    color="cause_code",
    title="Causes for service: anomalous",
    width=800,
    height=800,
)
fig.show()

# Interpretations
It would appear that the majority of service requests on the anomalous days have an unknown (unassigned) cause code.
It is hard to make recommendations when the reasons for the requests are unclear.

`cause_code_group` could give some insight into what type of service was requested.

In [13]:
# Repeat the breakdown at the coarser cause_code_group level.
query = '''
select department,
       cause_code_group,
       count(distinct notification_number) cnt
from anom_line
group by department, cause_code_group
'''
cause_group = sqldf.run(query)

# Requests without a cause code group are labelled 'unknown'.
cause_group = cause_group.assign(
    cause_code_group=cause_group['cause_code_group'].fillna('unknown')
)

# Bar chart of request counts on anomalous days: one bar per department, coloured by cause code group.
fig = px.bar(
    cause_group,
    x="department",
    y="cnt",
    color="cause_code_group",
    title="Cause group for service: anomalous",
    width=800,
    height=800,
)
fig.show()

Most of these requests also fall into the unknown group.