In [ ]:
# This cell is NOT editable. Overwrite variables at your own discretion.
# Any changes other than the script code will NOT BE SAVED!
# All cells are assumed to be script code cells, unless explicitly tagged as 'o9_ignore'

In [ ]:
# Query text for the "factTable" input: rows are the combination of
# Supplier Location x Activity2 x Location x OrderlineID x Item x Version Name x Day,
# and the columns are the twelve "AL PO ... TG" purchase-order measures listed below.
_factTable = "Select ([Supplier].[Supplier Location] * [Activity2].[Activity2] * [Location].[Location] * [Documents].[OrderlineID] * [Item].[Item] * [Version].[Version Name] * [Time].[Day] ) on row,  ({Measure.[AL PO % of Total Goods Receipt TG], Measure.[AL PO % of Total Purchase Value TG], Measure.[AL PO % of Total Unique Goods Purchased TG], Measure.[AL PO Commit Creation Date TG], Measure.[AL PO Commit Delivery Date TG], Measure.[AL PO Commit Delivery Qty TG], Measure.[AL PO Goods Receipt Date TG], Measure.[AL PO Goods Receipt Purchase Value TG], Measure.[AL PO Goods Receipt Quantity TG], Measure.[AL PO Header Creation Date TG], Measure.[AL PO Net Price Per Unit TG], Measure.[AL PO Unique Goods Purchased TG]}) on column;"


# Initialize the O9DataLake with the input parameters and dataframes
# Data can be accessed with O9DataLake.get(<Input Name>)
# Overwritten values will not be reflected in the O9DataLake after initialization

from o9_common_utils.O9DataLake import O9DataLake, ResourceType, DataSource
# Register the query above under the input name "factTable" (DataSource.LS,
# ResourceType.IBPL). The returned object is bound to `factTable`; the script
# cells below index it like a pandas DataFrame -- presumably the query result,
# verify against the O9DataLake API.
factTable = O9DataLake.register("factTable",DataSource.LS, ResourceType.IBPL, _factTable)

In [ ]:
import pandas as pd
import logging
import random
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor
from o9helpers import user_storage_path
from o9storage import cloud_storage_utils, storage_utils
import pickle
import os
import shutil


# Fetch the logger named 'o9_logger' and announce which script is starting.
logger = logging.getLogger(name="o9_logger")
logger.info("===== ITEM CLUSTERING SCRIPT ======")


def is_outlier(s):
    """Flag values lying more than two standard deviations from the mean.

    Both the mean and the (population, ``ddof=0``) standard deviation are
    computed with missing values skipped, so a single NaN in ``s`` no longer
    turns the standard deviation into NaN and collapses the accepted band
    onto the mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric values of one group.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``s``; ``True`` marks an outlier.
        Missing values are always marked ``True`` because they never fall
        inside the accepted band.
    """
    mean_ = s.mean()
    # ddof=0 matches the population std of np.std, but skips NaN entries.
    std_ = s.std(ddof=0)
    if pd.isna(std_):
        # No usable spread (e.g. no valid values): only the mean is accepted.
        lower_limit = mean_
        upper_limit = mean_
    else:
        spread = std_ * 2
        lower_limit = mean_ - spread
        upper_limit = mean_ + spread
    # between() is inclusive on both ends; invert it to get the outlier mask.
    return ~s.between(lower_limit, upper_limit)


# ---------------------------------------------------------------------------
# Item clustering
#   1. derive the PO lead time (goods receipt date - header creation date),
#   2. drop per-item lead-time outliers,
#   3. cluster items with KMeans on (mean lead time, mean net price per unit),
#   4. expose the result as `ItemCluster` (version, item, cluster id).
# ---------------------------------------------------------------------------

# The version of the first row is stamped on every output row.
# NOTE(review): assumes the input holds a single version -- confirm.
versionName = factTable['Version.[Version Name]'].iloc[0]

logger.info('===== VERSION NAME ======')
logger.info(versionName)

# .copy() detaches the selection from the registered input so the column
# assignments below do not raise SettingWithCopyWarning.
factTable = factTable[["Supplier.[Supplier Location]","Activity2.[Activity2]",
                       "Location.[Location]","Item.[Item]","Time.[Day]",
                      "AL PO Header Creation Date TG","AL PO Goods Receipt Date TG","AL PO Net Price Per Unit TG"]].copy()


logger.info(factTable.head())
logger.info(factTable.dtypes)

factTable['AL PO Goods Receipt Date TG'] = pd.to_datetime(factTable['AL PO Goods Receipt Date TG'])
factTable['AL PO Header Creation Date TG'] = pd.to_datetime(factTable['AL PO Header Creation Date TG'])
# Lead time in whole days between PO header creation and goods receipt.
factTable['LeadTime'] = (factTable['AL PO Goods Receipt Date TG'] - factTable['AL PO Header Creation Date TG']).dt.days

# The supplier-location column is carried forward under the supplier name.
factTable['Supplier.[Supplier]'] = factTable['Supplier.[Supplier Location]']
factTable = factTable.drop(['Supplier.[Supplier Location]'],axis=1)
factTable['year'] = factTable['AL PO Header Creation Date TG'].dt.year

factTable = factTable.drop(['AL PO Header Creation Date TG','AL PO Goods Receipt Date TG'],axis=1)
factTable['Time.[Day]'] = pd.to_datetime(factTable['Time.[Day]'])
# Abbreviated month name; the vectorised accessor also tolerates missing dates.
factTable['Month'] = factTable['Time.[Day]'].dt.strftime("%b")

factTable = factTable.drop(["Time.[Day]"],axis=1)

####### Removing Outliers ##############

# transform() always returns a result aligned with factTable's own index.
# groupby(...).apply(...) does not guarantee that: on recent pandas versions it
# prepends the group key to the index, which makes the mask unusable as a
# boolean indexer.
_outlierMask = (
    factTable.groupby('Item.[Item]')['LeadTime']
    .transform(is_outlier)
    .astype(bool)
)
factTable = factTable[~_outlierMask]

##### Clustering  ######

_featureColumns = ["LeadTime", "AL PO Net Price Per Unit TG"]
_maxClusters = 60       # upper bound on the number of clusters
_clusterSeed = 0        # fixed seed so repeated runs give the same clusters

newForCluster = factTable.groupby("Item.[Item]").agg({"LeadTime":"mean","AL PO Net Price Per Unit TG":"mean"}).reset_index()

# KMeans rejects NaN input; items without a usable feature cannot be clustered.
_missingFeatures = newForCluster[_featureColumns].isna().any(axis=1)
if _missingFeatures.any():
    logger.warning("Skipping %d item(s) with missing clustering features",
                   int(_missingFeatures.sum()))
    newForCluster = newForCluster[~_missingFeatures].reset_index(drop=True)

# KMeans requires n_clusters <= number of samples, so cap it at the item count.
# NOTE(review): the two features are clustered unscaled, so the one with the
# larger numeric range dominates the distance -- confirm this is intended.
_nClusters = min(_maxClusters, len(newForCluster))
if _nClusters > 0:
    kmeanModel = KMeans(n_clusters=_nClusters, random_state=_clusterSeed)
    newForCluster['Cluster'] = kmeanModel.fit_predict(newForCluster[_featureColumns])
else:
    # Nothing left to cluster: keep the output schema with zero rows.
    kmeanModel = None
    newForCluster['Cluster'] = np.empty(0, dtype=int)

# .copy() so adding the version column does not write into a slice.
ItemCluster = newForCluster[["Item.[Item]","Cluster"]].copy()

logger.info("*** Clusters *****")
logger.info(ItemCluster)

ItemCluster["Version.[Version Name]"] = versionName

ItemCluster = ItemCluster[["Version.[Version Name]","Item.[Item]","Cluster"]]


logger.info("============ Clusters Calculated ============")
logger.info(ItemCluster["Cluster"].unique())
logger.info(ItemCluster.head(12))


logger.info("=========== Clustering Completed ==========")