# 4 Predictive Analytic Tasks: Anomaly Detection
Perform anomaly detection on the number of service requests per department per day in the first 6 months.


In [None]:
import sqldf
import pandas as pd
import geopandas as gpd
import sklearn as sk
import helper.functions as hf
from datetime import datetime
import holidays

In [None]:
# make logger
# Capture the wall-clock time at which the notebook started. `start` is not
# read again in the cells visible here (presumably used later to report the
# elapsed run time -- confirm).
start = datetime.now()
# `hf` is the project-local helper.functions module; the string passed in
# matches this notebook's title.
logger = hf.make_logger('4-predictive_analytic_tasks_anomaly')

In [None]:
# load data
# Read the two identifier columns as text so pandas does not coerce them to
# numbers, and parse both timestamp columns while loading.
col_types = dict(notification_number=str, reference_number=str)
date_cols = ['creation_timestamp', 'completion_timestamp']
df = pd.read_csv(
    'data/raw/sr_hex.csv',
    dtype=col_types,
    parse_dates=date_cols,
)
# Calendar day on which each request was created (time of day dropped);
# this is the per-day grouping key used by the queries further down.
df['date'] = pd.to_datetime(
    df['creation_timestamp'].dt.date,
    format='%Y-%m-%d',
)

In [None]:
# Scoring ("test") set: number of distinct requests per department per day for
# dates before 2020-07-01 -- the period this notebook checks for anomalies.
# The row-level filters are in WHERE rather than HAVING so rows are discarded
# before grouping instead of after every group has been aggregated. Both
# predicates only touch grouping columns, so the result set is unchanged.
query = '''
select department,
       date,
       count(distinct notification_number) req
from df
where date < '2020-07-01' and department is not null
group by department, date
order by department, date
'''

df_test = sqldf.run(query)

In [None]:
# Training set: number of distinct requests per department per day for dates
# on or after 2020-07-01. The models fitted on this set are used to score the
# earlier period.
# The row-level filters are in WHERE rather than HAVING so rows are discarded
# before grouping instead of after every group has been aggregated. Both
# predicates only touch grouping columns, so the result set is unchanged.
query = '''
select department,
       date,
       count(distinct notification_number) req
from df
where date >= '2020-07-01' and department is not null
group by department, date
order by department, date
'''

df_train = sqldf.run(query)

In [None]:
from sklearn.ensemble import IsolationForest as IF

# Fit one Isolation Forest per department on that department's daily request
# counts in df_train, then score the same department's days in df_test.
# `res` collects one prediction frame per department that has rows in both sets.
res = []

# Group by the column label itself rather than a one-element list: with a list
# key, pandas >= 2.0 yields each group name as a 1-tuple ('<department>',), so
# the `df_test['department'] == name` lookup below matched no rows and `res`
# stayed empty.
g = df_train.groupby('department')
for name, data in g:
    # Departments with 5 or fewer training days are skipped.
    if data.shape[0] <= 5:
        continue

    x_train = data[['req']]
    clf = IF(
        n_estimators=100,
        max_samples='auto',
        contamination=0.12,  # proportion of training days treated as outliers
        max_features=1.0,
        bootstrap=False,
        n_jobs=-1,
        random_state=42,  # fixed seed so re-running flags the same days
    )
    clf.fit(x_train)

    x_test = df_test[df_test['department'] == name]
    if x_test.shape[0] == 0:
        # This department has no days in the scoring set.
        continue

    x_dates = x_test[['date']]
    x_test = x_test[['req']]
    pred = pd.DataFrame({
        'department': name,
        'date': x_dates['date'],
        'pred': clf.predict(x_test),  # -1 = outlier, 1 = inlier
        'decision_function': clf.decision_function(x_test),
        'score_samples': clf.score_samples(x_test),
        'req': x_test['req'],
    })
    # Department total over all scored days, repeated on every row.
    pred['request_cnt'] = x_test['req'].sum()
    res.append(pred)


In [None]:
# Stack the per-department prediction frames into a single table.
# NOTE: this rebinds `res` from a list to a DataFrame, so running this cell a
# second time without first re-running the loop that builds the list will fail.
res = pd.concat(res)

# Let's see who had the most anomalies by department


In [None]:
# Step 1: turn every scored day into a 1.0/0.0 anomaly flag (pred = -1 is
# counted as an anomaly). The flag is a float so that the percentage computed
# in step 2 is not an integer division.
query = '''
select department,
       case when pred = -1 then 1.0 else 0.0 end anom_cnt
from res
'''

sum_df = sqldf.run(query)

# Step 2: per department, the number of flagged days, the number of scored
# days and the percentage flagged. The `sum_df` named in the FROM clause is
# the frame produced by step 1; the assignment below then replaces it with
# the summary. No ORDER BY is applied to the result.
query = '''
select department,
       sum(anom_cnt) anom_cnt,
       count(*) cnt,
       sum(anom_cnt)/count(*)*100 anom_per
from sum_df
group by department
'''

sum_df = sqldf.run(query)
sum_df

In [None]:
# We are looking for days which had really high service requests.
# Keep only the scored days flagged as anomalies (pred = -1) whose request
# count is greater than 2.
# NOTE(review): the `req > 2` cut-off presumably exists to drop days flagged
# for unusually LOW counts -- confirm the threshold is intended.
query = '''
select *
from res
where pred = -1 and req > 2
'''

anom_df = sqldf.run(query)

In [None]:
# Pull every column of the raw request rows in `df` whose (department, date)
# pair matches a flagged day in `anom_df`.
# NOTE(review): the join compares `date` values from two different frames;
# it assumes both are stored in the same textual form inside sqldf -- confirm
# the join returns rows.
query = '''
select df.*
from df
inner join anom_df on anom_df.department = df.department and anom_df.date = df.date
'''

anom_line = sqldf.run(query)

In [None]:
# Let's see why these days were picked up as anomalies
# Count distinct requests per (department, cause_code) among the request rows
# that fall on flagged days.
query = '''
select department,
       cause_code,
       count(distinct notification_number) cnt
from anom_line
group by department, cause_code
'''

sum_table = sqldf.run(query)
# Requests without a cause code form their own group; label it explicitly so
# it is not left as a missing value in the outputs built from this table.
sum_table['cause_code'] = sum_table['cause_code'].fillna('unknown')

In [None]:
import plotly.express as px

# Bar chart of request counts on anomalous days: one x position per
# department, bar segments coloured by cause code.
fig = px.bar(
    data_frame=sum_table,
    x="department",
    y="cnt",
    color="cause_code",
    title="Causes for service: anomalous",
)
fig.show()

In [None]:
# Total request count per cause code across all departments, largest first,
# with a fresh 0..n-1 index.
causes = sum_table.groupby('cause_code', as_index=False).agg({'cnt': 'sum'})
causes = causes.sort_values('cnt', ascending=False)
causes = causes.reset_index(drop=True)
#causes.plot.bar(x='cause_code',y='cnt',figsize=(15,5))