In [127]:
import pandas as pd
import numpy as np

In [128]:
from google.cloud import bigquery
# BigQuery client used for the queries below; no project or credentials are
# passed explicitly here.
client = bigquery.Client()

### Market Basket Analysis

In [7]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [8]:
# Build one basket per (trans_id, cust_id) pair: the list of product
# categories that occur in that transaction.
# NOTE(review): `df` is only assigned further down in this notebook (from
# `mkt.pricing_data2`), and the preview shown there has no trans_id / cust_id /
# prod_category columns -- this cell presumably relied on a different `df`
# loaded in a cell that is no longer present; confirm the intended source.
transactions = (
    df.groupby(["trans_id", "cust_id"])["prod_category"]
    .apply(list)
    .values
    .tolist()
)

In [None]:
# Wrap the baskets in a single-column frame; the "transaction" column is built
# from the per-transaction lists in `transactions`.
transactions_df = pd.DataFrame({"transaction": transactions})

In [None]:
# Mine frequent itemsets using the Apriori algorithm.
# apriori() expects a one-hot encoded frame (one boolean column per item, one
# row per transaction), not the item lists padded with a filler value, so the
# baskets are encoded first:
#   1. explode to one row per (transaction, item), keeping the transaction index
#   2. dummy-encode the item
#   3. collapse back to one row per transaction (max == "item present at all")
basket_items = transactions_df["transaction"].explode()
basket_onehot = (
    pd.get_dummies(basket_items)
    .groupby(level=0)
    .max()
    .astype(bool)
)
# use_colnames=True makes the itemsets carry the category names (the one-hot
# column labels) instead of column positions.
frequent_itemsets = apriori(basket_onehot,
                            min_support=0.1,
                            use_colnames=True)

In [None]:
# Mine association rules from the frequent itemsets, filtering on the "lift"
# metric with a minimum threshold of 3.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=3)

In [None]:
# For every product category, gather the consequent items of all rules whose
# antecedent contains that category (repeats are kept, in rule order).
complements = {}
for category in df["prod_category"].unique():
    in_antecedent = rules["antecedents"].apply(lambda itemset: category in itemset)
    related_items = []
    for consequent in rules.loc[in_antecedent, "consequents"].values.tolist():
        for item in consequent:
            if item != category:
                related_items.append(item)
    complements[category] = related_items

# Tabulate the result: one row per category alongside its list of complements
complements_df = pd.DataFrame(
    {
        "Category": list(complements.keys()),
        "Complements": list(complements.values()),
    }
)


### Linear Regression for each product

In [129]:
import numpy as np

In [130]:
# Load every column and row of the pricing table into a DataFrame named `df`.
sql = """
SELECT * FROM `mkt.pricing_data2`
"""
df = client.query(sql).to_dataframe()

In [131]:
# Preview the first rows to sanity-check the columns that were loaded
df.head()

Unnamed: 0,prod_id,week_num,log_demand,price_div_demand,sub_price,com_prices,log_sv,promo_perc
0,20019693005,0,4.406719,0.279767,24.643086,19.16337,10.647404,0.742787
1,20019693005,1,4.941642,0.202609,24.234679,18.9212,10.647404,0.695596
2,20019693005,2,5.129899,0.164373,25.336094,18.552415,10.647404,0.700787
3,20019693005,3,5.247024,0.138155,25.348554,16.399644,10.647404,0.714741
4,20019693005,4,5.123964,0.153861,22.585577,18.688869,10.647404,0.7175


In [132]:
# Check missing values: count of NA entries in each column
df.isna().sum()

prod_id             0
week_num            0
log_demand          0
price_div_demand    0
sub_price           0
com_prices          0
log_sv              0
promo_perc          0
dtype: int64

In [133]:
# Check inf values: count of +/-inf entries in each column.
# A single vectorized np.isinf over the float view of the frame replaces the
# element-wise DataFrame.applymap call, which is deprecated in pandas >= 2.1
# and runs a Python lambda per cell. Like the original, this requires every
# column to be numeric.
pd.Series(
    np.isinf(df.to_numpy(dtype=float, na_value=np.nan)).sum(axis=0),
    index=df.columns,
)

prod_id             0
week_num            0
log_demand          0
price_div_demand    0
sub_price           0
com_prices          0
log_sv              0
promo_perc          0
dtype: int64

In [64]:
#df.fillna(df.mean(), inplace=True)

In [134]:
from statsmodels.formula.api import ols
# Distinct product IDs; one regression is fitted per ID below
prod_ids = df['prod_id'].unique()

In [135]:
# Fit one OLS model per product on that product's rows only, and keep the
# fitted results keyed by product ID.
demand_formula = "log_demand ~ price_div_demand + sub_price + com_prices + log_sv + promo_perc"
models = {
    pid: ols(demand_formula, data=df[df['prod_id'] == pid]).fit()
    for pid in prod_ids
}

In [136]:
from scipy.optimize import root_scalar
def elasticity(p, a, b, c, d, e, f, sub, com, sv, promo):
    """Return ``b * p / (1 + exp(-z))`` for the linear predictor ``z``.

    ``z = a + b*p + c*sub + d*com + e*sv + f*promo``.

    Parameters
    ----------
    p : float or array-like
        Value of the price regressor at which to evaluate.
    a, b, c, d, e, f : float
        Intercept and the coefficients applied to ``p``, ``sub``, ``com``,
        ``sv`` and ``promo`` respectively.
    sub, com, sv, promo : float
        Values of the remaining regressors.

    Returns
    -------
    float or ndarray
        ``b * p / (1 + exp(-z))``, with the exponent capped at 700 so that
        ``exp`` never overflows float64.
    """
    exponent = -(a + b * p + c * sub + d * com + e * sv + f * promo)
    # Clip the exponent to avoid overflow; the same guard elasticity_diff
    # already relied on, now shared by both functions.
    exponent_clipped = np.clip(exponent, -np.inf, 700)
    return b * p / (1 + np.exp(exponent_clipped))


def elasticity_diff(p, a, b, c, d, e, f, sub, com, sv, promo):
    """Return ``elasticity(...) + 1``.

    The root of this function in ``p`` is the point where ``elasticity``
    equals -1. Takes the same arguments as ``elasticity``.
    """
    return elasticity(p, a, b, c, d, e, f, sub, com, sv, promo) + 1

In [141]:
# For each product, solve elasticity_diff(p) = 0 for p, holding the other
# regressors at their per-product means.
optimal_p_values = {}
# Products whose secant search did not converge. The last iterate is still
# stored for them (as before), but it is not a root and should not be trusted.
unconverged_prod_ids = []
for prod_id, model in models.items():
    product_data = df[df['prod_id'] == prod_id]

    # Get the average values for the independent variables
    avg_sub = product_data['sub_price'].mean()
    avg_com = product_data['com_prices'].mean()
    avg_sv = product_data['log_sv'].mean()
    avg_promo = product_data['promo_perc'].mean()

    # Get the regression coefficients
    coefs = model.params
    a, b, c, d, e, f = (
        coefs['Intercept'],
        coefs['price_div_demand'],
        coefs['sub_price'],
        coefs['com_prices'],
        coefs['log_sv'],
        coefs['promo_perc'],
    )

    # Find the optimal p using the root_scalar function (secant method,
    # started from two small positive guesses)
    res = root_scalar(
        elasticity_diff,
        args=(a, b, c, d, e, f, avg_sub, avg_com, avg_sv, avg_promo),
        method='secant',
        x0=0.0001,
        x1=0.01,
    )
    # The result object carries a `converged` flag; record failures instead of
    # silently treating the final iterate as a solution.
    if not res.converged:
        unconverged_prod_ids.append(prod_id)
    optimal_p_values[prod_id] = res.root

if unconverged_prod_ids:
    print(
        f"WARNING: root search did not converge for "
        f"{len(unconverged_prod_ids)} product(s): {unconverged_prod_ids}"
    )



In [142]:
# Inspect the solved p for every product ID
optimal_p_values

{20019693005: 0.15029054859653473,
 20299821: 0.6493439729699857,
 20975929: 0.2494203580246918,
 20118904005: 0.21762461701800026,
 20972512: 0.0722902897350783,
 20996495: 0.09963589789201686,
 21004369: 0.05221664374596423,
 20264273: 0.013158916965172053,
 20829110008: 0.147873921046034,
 20903492: 0.09686453770455726,
 20681175: 0.08990663624634426,
 20287689: 0.20236107363856864,
 20025432002: 0.0060933949380951565,
 20314880: 0.03600749933906036,
 20788914: 0.09017895326092154,
 20025699: 0.061888535819664976,
 20318969: 0.026602206432643938,
 20088387001: 0.028175073956822207,
 20562728: 0.023903703838726493,
 20415316: 0.0810970855172847,
 20119926001: 0.008849739244194849,
 20431483002: 0.025240917037051377,
 21018776: 0.004293991358940699,
 20023746002: 0.02384981381489843,
 20744442: 0.007132053695116405,
 20008233001: 0.008742461751228912,
 20997492: 0.006976520571839177,
 20651933: 0.08694370479102204,
 20322468003: 0.0874781572585968,
 21093316: 0.027901932373818106,
 20

In [143]:
# Turn each product's solved p into a price and a model-predicted demand;
# results are accumulated in the same order as prod_ids.
optimal_prices, optimal_demands = [], []
coef_names = ['Intercept', 'price_div_demand', 'sub_price', 'com_prices', 'log_sv', 'promo_perc']

for prod_id in prod_ids:
    fitted_params = models[prod_id].params
    a, b, c, d, e, f = (fitted_params[name] for name in coef_names)

    rows = df[df['prod_id'] == prod_id]
    # exp of the mean log_demand for this product
    mean_demand = np.exp(rows['log_demand'].mean())
    mean_sub = rows['sub_price'].mean()
    mean_com = rows['com_prices'].mean()
    mean_sv = rows['log_sv'].mean()
    mean_promo = rows['promo_perc'].mean()

    p_star = optimal_p_values[prod_id]

    # Price: solved p scaled by the product's mean demand level
    optimal_prices.append(p_star * mean_demand)

    # Demand: exponentiate the fitted linear predictor evaluated at p_star,
    # with the other regressors at their means
    linear_pred = a + b*p_star + c*mean_sub + d*mean_com + e*mean_sv + f*mean_promo
    optimal_demands.append(np.exp(linear_pred))

In [144]:
# Assemble the per-product results; the two lists were filled in prod_ids order, so rows line up
optimal_df = pd.DataFrame({'prod_id': prod_ids, 'optimal_price': optimal_prices, 'optimal_demand': optimal_demands})

In [145]:
# Revenue per product = optimal price x optimal demand (element-wise)
optimal_df['optimal_revenue'] = optimal_df['optimal_price'] * optimal_df['optimal_demand']

In [146]:
# Display the final table
optimal_df

Unnamed: 0,prod_id,optimal_price,optimal_demand,optimal_revenue
0,20019693005,29.409290,1.690094e+02,4.970446e+03
1,20299821,57.781717,5.871190e+01,3.392475e+03
2,20975929,55.500586,1.655696e+02,9.189212e+03
3,20118904005,35.369724,1.107973e+02,3.918871e+03
4,20972512,19.210539,1.869301e+02,3.591028e+03
...,...,...,...,...
189,20163119001,380.111728,4.302313e-18,1.635360e-15
190,20812144001,425.030882,2.880539e-60,1.224318e-57
191,20038964,141.092531,7.629674e-44,1.076490e-41
192,20099819001,147.277540,2.724296e+02,4.012276e+04


In [147]:
# Write the results table to the hard-coded Cloud Storage path below.
# NOTE(review): no index=False is passed, so the row index is presumably
# written as an extra column -- confirm that is intended.
optimal_df.to_csv("gs://rochjia/optimal.csv")