In [1]:
%matplotlib inline  

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Use the retailer loss percentage data taken from USDA:   
  https://www.ers.usda.gov/data-products/food-availability-per-capita-data-system/loss-adjusted-food-availability-documentation/  
https://www.ers.usda.gov/webdocs/publications/47570/8043_tb1927.pdf?v=41056

In [2]:
%%writefile retailer_loss.tsv
Commodity	Food supply (billion lbs)	Losses - Retail (billion lbs)	Losses - Retail (percent)	Losses - Consumer (billion lbs)	Losses - Consumer (percent)	Losses - Total (billion lbs)	Losses - Total (percent)
Grain products	60.4	7.2	12	11.3	19	18.5	31
Fruit	64.3	6.0	9	12.5	19	18.4	29
Fresh fruit	37.6	4.4	12	9.5	25	13.9	37
Processed fruit	26.7	1.6	6	2.9	11	4.5	17
Vegetables	83.9	7.0	8	18.2	22	25.2	30
Fresh vegetables	53.5	5.2	10	12.8	24	18.0	34
Processed vegetables	30.4	1.8	6	5.3	18	7.1	24
Dairy products	83.0	9.3	11	16.2	20	25.4	31
Fluid milk	53.8	6.5	12	10.5	20	17.0	32
Other dairy products	29.1	2.8	10	5.7	19	8.5	29
Meat, poultry, and fish	58.4	2.7	5	12.7	22	15.3	26
Meat	31.6	1.4	4	7.2	23	8.6	27
Poultry	22.0	0.9	4	3.9	18	4.8	22
Fish and seafood	4.8	0.4	8	1.5	31	1.9	39
Eggs	9.8	0.7	7	2.1	21	2.8	28
Tree nuts and peanuts	3.5	0.2	6	0.3	9	0.5	15
Added sugar and sweeteners	40.8	4.5	11	12.3	30	16.7	41
Added fats and oils	26.0	5.4	21	4.5	17	9.9	38
Total	430.0	43.0	10	89.9	21	132.9	31

Writing retailer_loss.tsv


In [27]:
raw_df = pd.read_csv('./retailer_loss.tsv', sep='\t')

In [28]:
df = raw_df.copy()
# drop last column, the total column
df.drop(df.tail(1).index,inplace=True)
df

Unnamed: 0,Commodity,Food supply (billion lbs),Losses - Retail (billion lbs),Losses - Retail (percent),Losses - Consumer (billion lbs),Losses - Consumer (percent),Losses - Total (billion lbs),Losses - Total (percent)
0,Grain products,60.4,7.2,12,11.3,19,18.5,31
1,Fruit,64.3,6.0,9,12.5,19,18.4,29
2,Fresh fruit,37.6,4.4,12,9.5,25,13.9,37
3,Processed fruit,26.7,1.6,6,2.9,11,4.5,17
4,Vegetables,83.9,7.0,8,18.2,22,25.2,30
5,Fresh vegetables,53.5,5.2,10,12.8,24,18.0,34
6,Processed vegetables,30.4,1.8,6,5.3,18,7.1,24
7,Dairy products,83.0,9.3,11,16.2,20,25.4,31
8,Fluid milk,53.8,6.5,12,10.5,20,17.0,32
9,Other dairy products,29.1,2.8,10,5.7,19,8.5,29


In [29]:
df.columns.values

array(['Commodity', 'Food supply (billion lbs)',
       'Losses - Retail (billion lbs)', 'Losses - Retail (percent)',
       'Losses - Consumer (billion lbs)', 'Losses - Consumer (percent)',
       'Losses - Total (billion lbs)', 'Losses - Total (percent)'], dtype=object)

In [34]:
df_retailer = df[df.columns.values[:4]]
df_retailer.head()

Unnamed: 0,Commodity,Food supply (billion lbs),Losses - Retail (billion lbs),Losses - Retail (percent)
0,Grain products,60.4,7.2,12
1,Fruit,64.3,6.0,9
2,Fresh fruit,37.6,4.4,12
3,Processed fruit,26.7,1.6,6
4,Vegetables,83.9,7.0,8


In [35]:
total_supply = df_retailer['Food supply (billion lbs)'].sum()
print(total_supply)
df_retailer['supply_normalized_percentage'] = df_retailer['Food supply (billion lbs)'].apply(lambda x: float(x) / total_supply)
df_retailer

719.6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,Commodity,Food supply (billion lbs),Losses - Retail (billion lbs),Losses - Retail (percent),supply_normalized_percentage
0,Grain products,60.4,7.2,12,0.083936
1,Fruit,64.3,6.0,9,0.089355
2,Fresh fruit,37.6,4.4,12,0.052251
3,Processed fruit,26.7,1.6,6,0.037104
4,Vegetables,83.9,7.0,8,0.116593
5,Fresh vegetables,53.5,5.2,10,0.074347
6,Processed vegetables,30.4,1.8,6,0.042246
7,Dairy products,83.0,9.3,11,0.115342
8,Fluid milk,53.8,6.5,12,0.074764
9,Other dairy products,29.1,2.8,10,0.040439


In [38]:
def get_normalized_loss(r):
    return r['supply_normalized_percentage'] * r['Losses - Retail (percent)'] / 100
df_retailer['retail_normalized_loss'] = df_retailer.apply(get_normalized_loss, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [39]:
df_retailer[['Commodity', 'supply_normalized_percentage', 'retail_normalized_loss']].head(5)

Unnamed: 0,Commodity,supply_normalized_percentage,retail_normalized_loss
0,Grain products,0.083936,0.010072
1,Fruit,0.089355,0.008042
2,Fresh fruit,0.052251,0.00627
3,Processed fruit,0.037104,0.002226
4,Vegetables,0.116593,0.009327


In [83]:
df_retailer_normalized = df_retailer[['Commodity', 'supply_normalized_percentage', 'retail_normalized_loss']]

In [113]:
commodity_idx = np.random.choice(len(df_retailer_normalized),p = df_retailer_normalized['supply_normalized_percentage'].values)
df_retailer_normalized.ix[commodity_idx]

Commodity                       Dairy products
supply_normalized_percentage          0.115342
retail_normalized_loss               0.0126876
Name: 7, dtype: object

In [116]:
df_retailer_normalized.to_dict(orient='records')

[{'Commodity': 'Grain products',
  'retail_normalized_loss': 0.010072262367982213,
  'supply_normalized_percentage': 0.08393551973318511},
 {'Commodity': 'Fruit',
  'retail_normalized_loss': 0.008041967759866592,
  'supply_normalized_percentage': 0.08935519733185103},
 {'Commodity': 'Fresh fruit',
  'retail_normalized_loss': 0.006270150083379656,
  'supply_normalized_percentage': 0.05225125069483047},
 {'Commodity': 'Processed fruit',
  'retail_normalized_loss': 0.0022262367982212343,
  'supply_normalized_percentage': 0.037103946637020574},
 {'Commodity': 'Vegetables',
  'retail_normalized_loss': 0.009327404113396333,
  'supply_normalized_percentage': 0.11659255141745416},
 {'Commodity': 'Fresh vegetables',
  'retail_normalized_loss': 0.007434685936631463,
  'supply_normalized_percentage': 0.07434685936631463},
 {'Commodity': 'Processed vegetables',
  'retail_normalized_loss': 0.0025347415230683712,
  'supply_normalized_percentage': 0.042245692051139525},
 {'Commodity': 'Dairy products

## Now we can finally start simulate some data, for the datasets I am simulating a Gaussian Distribution over the normalized loss

In [149]:
num_retailer_stores = 2
num_days = 30
sigma_supply = 0.005
sigma_loss = 0.0001

In [166]:
# personalities? more wasteful, less wasteful, medium wasteful
def getRetailerPersonality(id):
    wasteful_match = {
        0: 0.0001,
        1: 0.0,
        2: -0.0001
    }
    t = id % len(wasteful_match)
    return wasteful_match[t]
getRetailerPersonality(2)

-0.0001

In [167]:
def generateData():
    data = []
    def generateSupplyGauss(s):
        return np.random.normal(s, sigma_supply, 1)[0]
    def generateLossGauss(s):
        return np.random.normal(s, sigma_loss, 1)[0]
    
    for d in range(num_days):
        for i in range(num_retailer_stores):
            retailer_supply = df_retailer_normalized.copy()
            retailer_supply['retailer_id'] = i
            retailer_supply['day'] = d
            
            # Simulated Supply
            retailer_supply['generated_supply'] = retailer_supply['supply_normalized_percentage'].apply(generateSupplyGauss)
            # Simulated Loss
            retailer_supply['generated_loss'] = retailer_supply['retail_normalized_loss'].apply(generateLossGauss)
            retailer_supply['generated_loss'] = retailer_supply['generated_loss'] + getRetailerPersonality(i)
            data += retailer_supply.to_dict(orient='records')
    return pd.DataFrame(data)[['retailer_id','day','Commodity','generated_loss', 'generated_supply']]
data = generateData()
data.sample(5)

Unnamed: 0,retailer_id,day,Commodity,generated_loss,generated_supply
93,1,2,Processed fruit,0.002232,0.039932
82,0,2,"Meat, poultry, and fish",0.004053,0.082309
327,0,9,Processed fruit,0.002432,0.045891
1026,1,28,Grain products,0.009995,0.082032
625,0,17,Fish and seafood,0.000556,0.007389


In [168]:
data[data['Commodity'] == 'Processed fruit']['generated_loss'] * 100

3       0.219322
21      0.232736
39      0.228215
57      0.235674
75      0.249599
93      0.223249
111     0.239051
129     0.209431
147     0.241511
165     0.207488
183     0.228024
201     0.218354
219     0.234289
237     0.224820
255     0.213334
273     0.200007
291     0.225418
309     0.222718
327     0.243208
345     0.214972
363     0.237026
381     0.228345
399     0.220064
417     0.214077
435     0.225071
453     0.209387
471     0.231394
489     0.212625
507     0.244257
525     0.230853
543     0.226159
561     0.240308
579     0.242863
597     0.222718
615     0.236557
633     0.226685
651     0.214581
669     0.229023
687     0.225206
705     0.215857
723     0.233284
741     0.228777
759     0.240364
777     0.223778
795     0.242110
813     0.234733
831     0.219457
849     0.218676
867     0.238694
885     0.223831
903     0.235043
921     0.221125
939     0.224255
957     0.214961
975     0.241156
993     0.211765
1011    0.223370
1029    0.225583
1047    0.2340