<a href="https://colab.research.google.com/github/smerner/cs146/blob/master/LocationBasedAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import matplotlib.pylab as plt
%matplotlib inline
import numpy as np
import pystan
from scipy.stats import cauchy, gamma, norm, expon, uniform
import pandas as pd

# Loading data: upload the CSV through google.colab's files.upload() and keep
# its return value in `uploaded`.
# NOTE(review): the recorded output of this cell reads
# "Saving lba_data.csv to lba_data (1).csv", while the next cell opens
# "lba_data.csv" — confirm that the file read there is the intended copy.
from google.colab import files
uploaded = files.upload() 

Saving lba_data.csv to lba_data (1).csv


In [0]:
# Read the survey CSV into a pandas DataFrame.
# The context manager closes the handle even when parsing raises, and the
# explicit encoding keeps non-ASCII text (e.g. "Schöneberg", "€") intact on
# platforms whose default text encoding is not UTF-8.
with open("lba_data.csv", "r", encoding="utf-8") as file_handler:
    grocery_data = pd.read_csv(file_handler, sep=",")

# Integer code assigned to every store chain.
Store = {
    'ALDI': 1,
    'EDEKA': 2,
    'Lidl': 3,
    'REWE': 4,
    'Sainsbury': 5,
    'Tesco': 6,
    'Waitrose & Partners': 7,
}

# Replace each store name with its integer code. A name that is missing from
# `Store` raises KeyError rather than being silently dropped.
grocery_data["Store"] = [Store[name] for name in grocery_data["Store"]]

# Check that the recoding succeeded.
print(grocery_data)


      Store          Product  ...  Product price (€)    Neighborhood
0         3             Eggs  ...               1.89       Kreuzberg
1         1             Eggs  ...               1.09     Lichtenberg
2         2             Eggs  ...               1.19     Alt-Treptow
3         3             Eggs  ...               1.19     Alt-Treptow
4         1             Eggs  ...               1.19  Friedrichshain
...     ...              ...  ...                ...             ...
2372      2  Chicken Breasts  ...              13.25     Alt-Treptow
2373      2  Chicken Breasts  ...              30.00      Schöneberg
2374      2         Tomatoes  ...               1.99  Friedrichshain
2375      2         Tomatoes  ...               1.99       Kreuzberg
2376      5          Bananas  ...              75.00          London

[2377 rows x 6 columns]


In [0]:
# Integer code assigned to every neighborhood (location).
Neighborhood = {
    'Alt-Treptow': 1,
    'Friedrichshain': 2,
    'Kreuzberg': 3,
    'Lichtenberg': 4,
    'London': 5,
    'Mitte': 6,
    'Neukölln': 7,
    'Prenzlauer Berg': 8,
    'Schöneberg': 9,
    'Tempelhof': 10,
}

# Walk the Neighborhood column and swap each name for its code; a name that is
# not a key of `Neighborhood` raises KeyError.
grocery_data["Neighborhood"] = list(
    map(Neighborhood.__getitem__, grocery_data["Neighborhood"])
)
print(grocery_data)


      Store          Product  ...  Product price (€)  Neighborhood
0         3             Eggs  ...               1.89             3
1         1             Eggs  ...               1.09             4
2         2             Eggs  ...               1.19             1
3         3             Eggs  ...               1.19             1
4         1             Eggs  ...               1.19             2
...     ...              ...  ...                ...           ...
2372      2  Chicken Breasts  ...              13.25             1
2373      2  Chicken Breasts  ...              30.00             9
2374      2         Tomatoes  ...               1.99             2
2375      2         Tomatoes  ...               1.99             3
2376      5          Bananas  ...              75.00             5

[2377 rows x 6 columns]


In [0]:
# Integer code assigned to every product. Some keys end with a space; they are
# written exactly as the lookup below expects them.
Product = {
    'Apples': 1,
    'Bananas': 2,
    'Butter': 3,
    'Chicken Breasts': 4,
    'Eggs': 5,
    'Milk, full cream ': 6,
    'Potatoes': 7,
    'Rice, Basmati ': 8,
    'Tomatoes': 9,
    'White Flour': 10,
}

# Walk the Product column and swap each name for its code; a name that is not
# a key of `Product` raises KeyError.
grocery_data["Product"] = list(
    map(Product.__getitem__, grocery_data["Product"])
)
print(grocery_data)

      Store  Product  ...  Product price (€)  Neighborhood
0         3        5  ...               1.89             3
1         1        5  ...               1.09             4
2         2        5  ...               1.19             1
3         3        5  ...               1.19             1
4         1        5  ...               1.19             2
...     ...      ...  ...                ...           ...
2372      2        4  ...              13.25             1
2373      2        4  ...              30.00             9
2374      2        9  ...               1.99             2
2375      2        9  ...               1.99             3
2376      5        2  ...              75.00             5

[2377 rows x 6 columns]


In [0]:
# Label the rows 1..n so the row labels follow Stan's 1-based counting.
# The labels are assigned outright instead of being incremented, so running
# this cell again leaves them unchanged rather than shifting them by one more.
grocery_data.index = pd.RangeIndex(start=1, stop=len(grocery_data) + 1)
# Check that the relabelling succeeded.
grocery_data.head(3)

Unnamed: 0,Store,Product,Price per unit,Product quantity (kg),Product price (€),Neighborhood
1,3,5,0.105,18.0,1.89,3
2,1,5,0.109,10.0,1.09,4
3,2,5,0.119,10.0,1.19,1


In [0]:
# Stan program for the price model: every observed price is normally
# distributed around base_price[product] * store_m[store] * loc_m[location]
# with a shared standard deviation sigma.
stan_code = '''
data {
    // Data block: all known quantities.
    int<lower=1> num;   // number of observations
    int<lower=1> P;     // number of products
    int<lower=1> S;     // number of stores
    int<lower=1> L;     // number of neighborhoods (locations)

    real<lower=0> price[num];   // observed prices

    // Category codes index the parameter arrays below, so each one is bounded
    // above by its category count: mis-coded data is rejected when the data
    // is read instead of failing as an out-of-range index during sampling.
    int<lower=1, upper=P> product[num];    // product code of each observation
    int<lower=1, upper=S> store[num];      // store code of each observation
    int<lower=1, upper=L> location[num];   // neighborhood code of each observation

    // Hyperparameters of the cauchy and gamma priors.
    real loc;                 // cauchy prior location (base prices)
    real<lower=0> scale;      // cauchy prior scale (base prices)
    real<lower=0> alpha;      // gamma prior shape (store and location multipliers)
    real<lower=0> beta;       // gamma prior rate (store and location multipliers)
    real<lower=0> s_alpha;    // gamma prior shape (sigma)
    real<lower=0> s_beta;     // gamma prior rate (sigma)
}

// Parameters block: the quantities whose posteriors the model computes.
parameters {
    real<lower=0> base_price[P];   // base price of each product
    real<lower=0> store_m[S];      // price multiplier of each store
    real<lower=0> loc_m[L];        // price multiplier of each neighborhood
    real<lower=0> sigma;           // standard deviation of the normal likelihood
}

model {
    // Priors.
    base_price ~ cauchy(loc, scale);
    store_m ~ gamma(alpha, beta);
    loc_m ~ gamma(alpha, beta);
    sigma ~ gamma(s_alpha, s_beta);

    // Normal likelihood of every observed price.
    for (i in 1:num) {
        price[i] ~ normal(base_price[product[i]] * store_m[store[i]] * loc_m[location[i]], sigma);
    }
}
'''
# Compile the Stan program; PyStan logs that it is compiling the generated C++.
stan_model = pystan.StanModel(model_code=stan_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_eb236067cf5b6c8b90de0d5f9a09f9b3 NOW.


In [0]:
# Fixed (arbitrary) seed for Stan's random number generator, so that repeated
# runs of this cell draw the same posterior samples.
SAMPLING_SEED = 42

# Data dictionary handed to Stan; the keys match the names declared in the
# Stan program's data block.
data = {
    'num': len(grocery_data),

    # The category codes are 1-based integers that index the parameter arrays,
    # so the number of slots required is the largest code present. Counting
    # distinct values instead would undercount whenever a code is absent from
    # the data and send the remaining codes out of range.
    'P': int(grocery_data['Product'].max()),
    'S': int(grocery_data['Store'].max()),
    'L': int(grocery_data['Neighborhood'].max()),

    # Observations and their category codes.
    'price': np.asarray(grocery_data['Price per unit ']),
    'product': np.asarray(grocery_data['Product']),
    'store': np.asarray(grocery_data['Store']),
    'location': np.asarray(grocery_data['Neighborhood']),

    # Prior hyperparameters (set w/help from Hana).
    'loc': 7,
    'scale': 7,
    'alpha': 3,
    'beta': 0.5,
    's_alpha': 4,
    's_beta': 0.25,
}

# Draw posterior samples with the sampler's default settings, then pull the
# draws out for later analysis.
results = stan_model.sampling(data=data, seed=SAMPLING_SEED)
samples = results.extract()


In [0]:
# Print the fit summary: per-parameter posterior mean, sd, quantiles, n_eff and Rhat.
print(results)

Inference for Stan model: anon_model_eb236067cf5b6c8b90de0d5f9a09f9b3.
4 chains, each with iter=2000; warmup=1000; thin=1; 
post-warmup draws per chain=1000, total post-warmup draws=4000.

                 mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
base_price[1]    0.61    0.02   0.49   0.17    0.3   0.46   0.74   2.02    599   1.01
base_price[2]    0.46    0.02   0.38   0.12   0.23   0.35   0.56   1.55    610   1.01
base_price[3]    1.71    0.06   1.37   0.47   0.84    1.3   2.08   5.61    601   1.01
base_price[4]    2.14    0.07   1.71   0.58   1.05   1.63   2.59   7.02    605   1.01
base_price[5]    0.07  2.3e-3   0.07 6.2e-3   0.03   0.05   0.09   0.26    874    1.0
base_price[6]    0.25  8.2e-3   0.21   0.06   0.12   0.19   0.31   0.83    642   1.01
base_price[7]     0.3  9.7e-3   0.24   0.08   0.14   0.23   0.36   0.98    614   1.01
base_price[8]    0.79    0.03   0.63   0.21   0.39   0.61   0.96    2.6    607   1.01
base_price[9]    0.99    0.03   0.79 