# This notebook will generate the synthetic data required as an input to our Facebook/Prophet model
- Some noise will be added to the data to reflect real data!
- This notebook's output will be saved as CSV files in the raw data folder (`../data/raw`)

## Two tables will be generated:
- The first table will attach every product to a cluster of products (5 clusters will be used; the example below labels them A, B, C, D and E, while the generated data encodes them as the integers 0 to 4):

| Product_code  |  Associated cluster |
|---|---|
| CLA01  |  A  |
| CLA02  |   E  |
| CLB01  |   A  |

The first three characters of the product code identify the client: CLA = Client A, CLB = Client B, etc.

- The second table is the actual Sales history table:

| Product_code  | Date  |  Quantity |
|---|---|---|
|  CLB01 | 25/07/2019  | 1,000  |
|  CLB01 | 19/07/2019  |  1,500 |
|   CLA02 | 23/07/2019  |  10,000 |

### Product table generation

#### 1. Import the famous pandas and numpy libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed the random generator so every run regenerates exactly the same synthetic data

from numpy.random import RandomState
random_state = RandomState(seed=9999)

In [3]:
#Input parameters

# Number of clients; one 'CL'+letter prefix is generated per client
number_of_clients=5
# Bounds passed to randint as (low, high) when drawing the product count of each client;
# randint excludes the upper bound, so at most max_number_of_products_per_client-1 products are drawn
min_number_of_products_per_client=1
max_number_of_products_per_client=6
# Cluster ids are drawn from 0 to number_of_clusters-1
number_of_clusters=5

In [4]:
# One product count per client, drawn in [min, max) -- the upper bound is exclusive
number_of_products_list = random_state.randint(
    low=min_number_of_products_per_client,
    high=max_number_of_products_per_client,
    size=number_of_clients,
)

In [5]:
number_of_products_list

array([2, 5, 2, 4, 1])

In [6]:
def letter_range(start, stop="{", step=1):
    """Yield a range of uppercase letters.

    ``start`` and ``stop`` may be given in either case; both are upper-cased
    before the range is built, and ``stop`` is exclusive. The default ``stop``
    of "{" (the character right after "z") means "up to and including Z".

    Args:
        start: Single character at which the range begins (inclusive).
        stop: Single character at which the range ends (exclusive).
        step: Increment between consecutive code points.

    Yields:
        One character per step, from ``start.upper()`` up to (but excluding)
        the upper-cased ``stop``.
    """
    def _upper_ord(char):
        # "{" directly follows "z" just as "[" directly follows "Z".
        # str.upper() leaves "{" untouched, which would let the default range
        # run past "Z" into punctuation and the lowercase letters.
        if char == "{":
            return ord("[")
        return ord(char.upper())

    for ord_ in range(_upper_ord(start), _upper_ord(stop), step):
        yield chr(ord_)

In [7]:
# Client codes: 'CL' followed by one letter per client, starting from the letter for chr(97)
clients_list = [
    'CL' + letter
    for letter in letter_range(chr(97), chr(97 + number_of_clients))
]

In [8]:
clients_list

['CLA', 'CLB', 'CLC', 'CLD', 'CLE']

In [9]:
# Build (client, product_code) pairs; each client's products are numbered from 0
product_codes = [
    (client, '{}{}'.format(client, product_number))
    for client, number_of_products in zip(clients_list, number_of_products_list)
    for product_number in range(number_of_products)
]

In [10]:
product_codes

[('CLA', 'CLA0'),
 ('CLA', 'CLA1'),
 ('CLB', 'CLB0'),
 ('CLB', 'CLB1'),
 ('CLB', 'CLB2'),
 ('CLB', 'CLB3'),
 ('CLB', 'CLB4'),
 ('CLC', 'CLC0'),
 ('CLC', 'CLC1'),
 ('CLD', 'CLD0'),
 ('CLD', 'CLD1'),
 ('CLD', 'CLD2'),
 ('CLD', 'CLD3'),
 ('CLE', 'CLE0')]

In [11]:
# Two-level index built from the (client, product_code) pairs: level 'first' is the client, 'second' the product code
index = pd.MultiIndex.from_tuples(product_codes, names=['first', 'second'])

In [12]:
# Assign every product a random cluster id in [0, number_of_clusters)
product_table = pd.Series(
    data=random_state.randint(low=0, high=number_of_clusters, size=len(product_codes)),
    index=index,
)

In [13]:
# Name the series 'Cluster'. Rebind rather than mutate in place: with
# inplace=True the call returns None in current pandas, so the cell would
# display nothing. The bare expression below shows the renamed series.
product_table = product_table.rename('Cluster')
product_table

first  second
CLA    CLA0      0
       CLA1      2
CLB    CLB0      4
       CLB1      0
       CLB2      1
       CLB3      3
       CLB4      1
CLC    CLC0      0
       CLC1      4
CLD    CLD0      3
       CLD1      4
       CLD2      4
       CLD3      0
CLE    CLE0      4
Name: Cluster, dtype: int64

In [14]:
product_table

first  second
CLA    CLA0      0
       CLA1      2
CLB    CLB0      4
       CLB1      0
       CLB2      1
       CLB3      3
       CLB4      1
CLC    CLC0      0
       CLC1      4
CLD    CLD0      3
       CLD1      4
       CLD2      4
       CLD3      0
CLE    CLE0      4
Name: Cluster, dtype: int64

#### Now building the Sales table

In [15]:
# Unique product codes, read from the second level ('second') of the product table index
products_list=product_table.index.levels[1].values

In [16]:
# Draw, for each product, the number of days between two consecutive sales;
# these periods feed the date generator below.
# randint is documented for integer bounds and excludes the upper one, so the
# float result of 365.25//2 is cast to int (182) and periods lie in [1, 181].
products_frequencies = random_state.randint(1, int(365.25 // 2), len(products_list))

In [17]:
products_frequencies

array([ 69, 174, 143,  40,  44, 138, 148, 159,  42,   4,  90,  12, 162,
       138])

In [18]:
# Bounds of the simulated sales history, written in ISO 8601 (YYYY-MM-DD) so
# that parsing does not depend on a day-first / month-first guess.
start_date = '2018-01-01'
end_date = '2019-07-26'

In [19]:
# One regular DatetimeIndex per product, stepping by that product's period in days
products_dates_list = [
    pd.date_range(start=start_date, end=end_date, freq='{}D'.format(period_days))
    for period_days in products_frequencies
]

In [20]:
products_dates_list[:2]

[DatetimeIndex(['2018-01-01', '2018-03-11', '2018-05-19', '2018-07-27',
                '2018-10-04', '2018-12-12', '2019-02-19', '2019-04-29',
                '2019-07-07'],
               dtype='datetime64[ns]', freq='69D'),
 DatetimeIndex(['2018-01-01', '2018-06-24', '2018-12-15', '2019-06-07'], dtype='datetime64[ns]', freq='174D')]

In [21]:
# Baseline quantity per sale for each product, drawn in [1000, 10000)
quantities_list = random_state.randint(low=1000, high=10000, size=len(products_list))

In [22]:
quantities_list

array([5064, 4795, 1173, 5766, 3261, 5953, 3704, 6917, 6315, 5604, 4296,
       4448, 1649, 8754])

In [23]:
# One row per (product, sale date); every sale of a product carries that
# product's baseline quantity. The timestamps are stored directly instead of
# being formatted with ctime() and parsed back, so 'date' is a datetime
# column from the start.
exact_sales_df = pd.DataFrame(
    [(prod, date, quant)
     for prod, quant, dates_list in zip(products_list, quantities_list, products_dates_list)
     for date in dates_list],
    columns=['product_code', 'date', 'quantity'],
)

In [24]:
# Make sure the 'date' column holds datetime values (needed for the date arithmetic below)
exact_sales_df['date']=pd.to_datetime(exact_sales_df['date'])

In [25]:
exact_sales_df.head()

Unnamed: 0,product_code,date,quantity
0,CLA0,2018-01-01,5064
1,CLA0,2018-03-11,5064
2,CLA0,2018-05-19,5064
3,CLA0,2018-07-27,5064
4,CLA0,2018-10-04,5064


In [26]:
# Noise amplitudes: sale dates are shifted by up to days_noise days and
# quantities are perturbed by up to quantities_noise (as a fraction of the quantity)
days_noise = 10
quantities_noise = 0.1

In [27]:
# Work on a copy so that exact_sales_df keeps the noise-free data
sales_df=exact_sales_df.copy()

In [28]:
# Random shift, in whole days, applied to every sale date. The integer offsets
# are converted to timedeltas in one vectorised call instead of parsing one
# 'nD' string per row.
# NOTE(review): randint excludes the upper bound, so offsets lie in
# [-days_noise, days_noise - 1] -- confirm whether a symmetric range was intended.
sales_df['random_date_delta'] = pd.to_timedelta(
    random_state.randint(-days_noise, days_noise, sales_df.shape[0]),
    unit='D',
)

In [29]:
sales_df.head()

Unnamed: 0,product_code,date,quantity,random_date_delta
0,CLA0,2018-01-01,5064,-4 days
1,CLA0,2018-03-11,5064,0 days
2,CLA0,2018-05-19,5064,-1 days
3,CLA0,2018-07-27,5064,-2 days
4,CLA0,2018-10-04,5064,0 days


In [30]:
# Noisy variant (active): shift each exact date by its random day offset
sales_df['noisy_date']=sales_df['date']+sales_df['random_date_delta']
# Noise-free variant: use the line below instead of the one above to keep the exact dates
#sales_df['noisy_date']=sales_df['date']

In [31]:
sales_df.head()

Unnamed: 0,product_code,date,quantity,random_date_delta,noisy_date
0,CLA0,2018-01-01,5064,-4 days,2017-12-28
1,CLA0,2018-03-11,5064,0 days,2018-03-11
2,CLA0,2018-05-19,5064,-1 days,2018-05-18
3,CLA0,2018-07-27,5064,-2 days,2018-07-25
4,CLA0,2018-10-04,5064,0 days,2018-10-04


In [32]:
# Quantity noise: each baseline quantity is multiplied by a factor drawn
# uniformly in [-quantities_noise, quantities_noise)
sales_df['random_qty_delta'] = sales_df['quantity'].mul(
    random_state.uniform(low=-quantities_noise, high=quantities_noise, size=len(sales_df))
)

In [33]:
sales_df.head()

Unnamed: 0,product_code,date,quantity,random_date_delta,noisy_date,random_qty_delta
0,CLA0,2018-01-01,5064,-4 days,2017-12-28,-25.380841
1,CLA0,2018-03-11,5064,0 days,2018-03-11,-405.804819
2,CLA0,2018-05-19,5064,-1 days,2018-05-18,-387.566538
3,CLA0,2018-07-27,5064,-2 days,2018-07-25,3.550234
4,CLA0,2018-10-04,5064,0 days,2018-10-04,-461.371602


In [34]:
# Noisy variant (active): add the random quantity delta to the baseline quantity
sales_df['noisy_quantity']=sales_df['quantity']+sales_df['random_qty_delta']
# Noise-free variant: use the line below instead of the one above to keep the exact quantities
#sales_df['noisy_quantity']=sales_df['quantity']

In [35]:
sales_df.head()

Unnamed: 0,product_code,date,quantity,random_date_delta,noisy_date,random_qty_delta,noisy_quantity
0,CLA0,2018-01-01,5064,-4 days,2017-12-28,-25.380841,5038.619159
1,CLA0,2018-03-11,5064,0 days,2018-03-11,-405.804819,4658.195181
2,CLA0,2018-05-19,5064,-1 days,2018-05-18,-387.566538,4676.433462
3,CLA0,2018-07-27,5064,-2 days,2018-07-25,3.550234,5067.550234
4,CLA0,2018-10-04,5064,0 days,2018-10-04,-461.371602,4602.628398


In [36]:
# Keep only the three columns that make up the exported sales table
noisy_sales_df = sales_df.loc[:, ['product_code', 'noisy_date', 'noisy_quantity']]

In [37]:
noisy_sales_df.head()

Unnamed: 0,product_code,noisy_date,noisy_quantity
0,CLA0,2017-12-28,5038.619159
1,CLA0,2018-03-11,4658.195181
2,CLA0,2018-05-18,4676.433462
3,CLA0,2018-07-25,5067.550234
4,CLA0,2018-10-04,4602.628398


In [38]:
noisy_sales_df.shape

(279, 3)

In [39]:
# Write the noisy sales history to the raw data folder, without the row index
noisy_sales_df.to_csv(path_or_buf='../data/raw/sales_table.csv', index=False)

In [40]:
# Drop the client level of the index so the table is keyed by product code only,
# and turn the series into a one-column DataFrame
products_clusters = product_table.droplevel(0).to_frame()

In [41]:
# Name the index 'product_code' so it can be matched against the sales table column
products_clusters = products_clusters.rename_axis('product_code', axis=0)

In [42]:
products_clusters

Unnamed: 0_level_0,Cluster
product_code,Unnamed: 1_level_1
CLA0,0
CLA1,2
CLB0,4
CLB1,0
CLB2,1
CLB3,3
CLB4,1
CLC0,0
CLC1,4
CLD0,3


In [43]:
# Attach to every sale the cluster of its product: the 'product_code' column of
# the sales table is matched against the index of products_clusters
sales_cluster_df = noisy_sales_df.join(
    other=products_clusters,
    on='product_code',
    how='outer',
)

In [44]:
sales_cluster_df.shape

(279, 4)

In [45]:
sales_cluster_df.head()

Unnamed: 0,product_code,noisy_date,noisy_quantity,Cluster
0,CLA0,2017-12-28,5038.619159,0
1,CLA0,2018-03-11,4658.195181,0
2,CLA0,2018-05-18,4676.433462,0
3,CLA0,2018-07-25,5067.550234,0
4,CLA0,2018-10-04,4602.628398,0


In [46]:
sales_cluster_df.Cluster.value_counts()

3    148
4     78
0     32
1     17
2      4
Name: Cluster, dtype: int64

In [47]:
# Write the sales history enriched with clusters to the raw data folder, without the row index
sales_cluster_df.to_csv(path_or_buf='../data/raw/sales_cluster_table.csv', index=False)