In [None]:
import os
import numpy as np

from ds_behavioral import DataBuilder
from ds_behavioral.sample.sample_data import CallCentreSamples, ProfileSample


In [None]:
class Synthetic(object):
    """Builders for synthetic customer and call-centre agent datasets.

    Each public builder creates its own ``DataBuilder``, assembles a DataFrame
    column by column using ``builder.tools`` and saves the result as a CSV file
    under ``$DTU_ORIGIN_PATH/0_raw``. Nothing is returned to the caller.
    """

    @staticmethod
    def _resolve_sample_size(sample_size, default: int=500, upper: int=10000) -> int:
        """Return ``sample_size`` when it is an int in ``1..upper``, else ``default``.

        :param sample_size: the requested number of rows (may be None or invalid)
        :param default: the value used when the request is missing or out of range
        :param upper: the largest accepted sample size (inclusive)
        :return: the validated number of rows to generate
        """
        if isinstance(sample_size, int) and 1 <= sample_size <= upper:
            return sample_size
        return default

    @staticmethod
    def _add_noise_columns(df, tools, sample_size: int, count: int=40) -> None:
        """Add ``count`` columns named ``noise_0 .. noise_{count-1}`` to ``df`` in place.

        :param df: the DataFrame to extend
        :param tools: the builder tools used to generate the values
        :param sample_size: the number of rows to generate per column
        :param count: the number of noise columns to add
        """
        for i in range(count):
            # a fresh quantity is drawn for every column so each noise column differs
            quantity = tools.get_number(0.005, 0.03, weight_pattern=[5, 2, 1, 0.5])[0]
            col = "noise_{}".format(i)
            df[col] = tools.get_number(0, 1, weight_pattern=[20, 1], quantity=quantity, size=sample_size)

    @staticmethod
    def Customer(noise: bool=False, extra: bool=False, sample_size: int=None):
        """Build the synthetic customer dataset and save it as 'synthetic_customer.csv'.

        :param noise: if True, append 40 additional 'noise_<n>' columns
        :param extra: if True, append the 'last_login' and 'status' columns
        :param sample_size: the number of rows, an int between 1 and 10000. Any other
                value (including None) falls back to 500
        :return: None, the dataset is written to disk
        :raises KeyError: if the 'DTU_ORIGIN_PATH' environment variable is not set
        """
        builder = DataBuilder('synthetic_data_customer')
        tools = builder.tools

        # main build
        sample_size = Synthetic._resolve_sample_size(sample_size)
        df = tools.get_profiles(size=sample_size, mf_weighting=[5, 3])
        df['id'] = tools.unique_identifiers(from_value=1000000, to_value=9999999, prefix='CU_', size=sample_size)
        # hand-set leading weights followed by a flipped exponential curve
        value_distribution = [0.01, 0.8, 1, 3, 9, 8, 3, 2, 1] + list(np.flip(np.exp(np.arange(-5, 0.0, 0.2)).round(2)))
        df['balance'] = tools.get_number(0.0, 1000, precision=2, weight_pattern=value_distribution, size=sample_size)
        age_pattern = [3, 5, 6, 10, 6, 5, 7, 15, 5, 2, 1, 0.5, 0.2, 0.1]
        df['age'] = tools.get_number(20.0, 90.0, weight_pattern=age_pattern, quantity=0.85, size=sample_size)
        df['start'] = tools.get_datetime(start='01/01/2018', until='31/12/2018', date_format='%m-%d-%y',
                                         size=sample_size)
        prof_pattern = [10, 8, 5, 4, 3, 2] + [1] * 9
        profession = ProfileSample.professions(size=15)
        df['profession'] = tools.get_category(selection=profession, weight_pattern=prof_pattern, quantity=0.90,
                                              size=sample_size)
        df['online'] = tools.get_category(selection=[1, 0], weight_pattern=[1, 4], size=sample_size)

        # Selective Noise: fixed-seed columns with a single value, a dominant value or no values
        # NOTE(review): 'quantity' presumably controls the fraction of populated rows - confirm
        df['single num'] = tools.get_number(1, 1, quantity=0.8, size=sample_size, seed=31)
        df['weight_num'] = tools.get_number(1, 2, weight_pattern=[90, 1], size=sample_size, seed=31)
        df['null'] = tools.get_number(1, 100, quantity=0, size=sample_size, seed=31)
        df['single cat'] = tools.get_category(['A'], quantity=0.6, size=sample_size, seed=31)
        df['weight_cat'] = tools.get_category(['A', 'B', 'C'], weight_pattern=[80, 1, 1], size=sample_size, seed=31)

        # Optional extra fields
        if extra:
            df['last_login'] = tools.get_datetime(start='01/01/2019', until='01/05/2019',
                                                  date_pattern=[1, 2, 3, 5, 9, 20], date_format='%m-%d-%y %H:%M', size=sample_size)
            df['status'] = tools.get_category(selection=['Active', 'Pending', 'Closed', 'Suspended'], weight_pattern=[20, 10, 5, 1],
                                              size=sample_size)
        # Optional extra noise
        if noise:
            Synthetic._add_noise_columns(df, tools, sample_size)

        # save
        filename = os.path.join(os.environ['DTU_ORIGIN_PATH'], '0_raw', 'synthetic_customer.csv')
        builder.save_to_disk(df, filename=filename)
        return

    @staticmethod
    def Agent(sample_size: int=None):
        """Build the synthetic call-centre agent dataset and save it as 'synthetic_agent.csv'.

        Customer ids are drawn from the 'id' column of the canonical loaded through
        ``Transition('synthetic_customer')``, so that source must be available first.
        40 'noise_<n>' columns are always appended.

        :param sample_size: the number of rows, an int between 1 and 10000. Any other
                value (including None) falls back to 500, matching ``Customer``
        :return: None, the dataset is written to disk
        :raises KeyError: if the 'DTU_ORIGIN_PATH' environment variable is not set
        """
        # imported locally because pandas is not imported alongside the other dependencies
        import pandas as pd

        builder = DataBuilder('synthetic_data_agent')
        tools = builder.tools

        # main build
        # without this a missing sample_size was passed to every generator as size=None
        sample_size = Synthetic._resolve_sample_size(sample_size)
        df = pd.DataFrame()
        df['call_id'] = tools.unique_identifiers(from_value=1000000, to_value=9999999, size=sample_size)
        # NOTE(review): 'Transition' is not imported in the visible part of this file and must
        # already be in scope (presumably from the ds_discovery package) - confirm
        customer = Transition('synthetic_customer').load_source_canonical()
        df['customer_id'] = tools.get_reference(df=customer, header='id', weight_pattern=[20,10,7,6,5,4,3,2,1,0.5], at_most=10, size=sample_size)
        weekday_pattern = [5,2,4,2,1,0,0]
        hour_pattern = [0,0,0,0,0,0,1,4,6,9,6,3,1,3,4,5,5,8,7,5,3,2,1,0]
        df['call_date'] = tools.get_datetime(start='2018/10/01', until='2018/10/31', ordered=True, date_format='%y-%m-%d %H:%M:%S', year_first=True,
                                           weekday_pattern=weekday_pattern, hour_pattern=hour_pattern, size=sample_size)
        contact_pattern = [10,3,2,1,1,1] + [0.1]*7
        contact = CallCentreSamples.contact_type(shuffle=False)
        df['contact'] = tools.get_category(selection=contact, weight_pattern=contact_pattern, size=sample_size)
        complaint_pattern = [10,9,7,6,5,4,3,2] + [1]*10 + [0.5]*11
        complaint = CallCentreSamples.complaint(shuffle=False)
        df['complaint'] = tools.get_category(selection=complaint, weight_pattern=complaint_pattern, size=sample_size)
        agent_pattern = [10,5,1,0.2]
        agents = ProfileSample.female_names(size=40)
        df['agent'] = tools.get_category(selection=agents, weight_pattern=agent_pattern, size=sample_size)
        minute_pattern = [10,5,3,1,0.5,0.2]
        df['duration'] = tools.get_datetime(start='01/01/2018 00:01', until='01/01/2018 00:30', ordered=True, date_format='%M:%S', 
                                               minute_pattern=minute_pattern, size=sample_size)
        # NOTE(review): the nested list is presumably a weighting that varies across the
        # sample - confirm it is supported by get_category
        df['escalated'] = tools.get_category(selection=[1,0], weight_pattern=[1,[1000,10,1000]], size=sample_size)
        df['referred'] = tools.get_category(selection=[1,0], weight_pattern=[1,20], size=sample_size)

        # extra noise (always added for this dataset)
        Synthetic._add_noise_columns(df, tools, sample_size)

        # save
        filename = os.path.join(os.environ['DTU_ORIGIN_PATH'], '0_raw', 'synthetic_agent.csv')
        builder.save_to_disk(df, filename=filename)
        return

    