In [None]:
#!/bin/python3

import math
import os
import random
import re
import sys

# Standard data science library imports. You may try others.
import pandas as pd
import numpy as np
import sklearn
import scipy

#
# Complete the 'main' function below with your code.
#
# You can access the data with the following commands :
# customers_df = pd.read_csv("customer_data.csv")
# orders_df = pd.read_csv("orders_data.csv")

from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math

def _yearly_customer_stats(customers_df, orders_df):
    """Build one row per (year, customer_id) with current and previous-year stats.

    Orders are restricted to customers present in ``customers_df`` (inner
    join), aggregated per calendar year, and then joined against the same
    aggregates shifted forward by one year. Because that second join is also
    an inner join, a customer-year is kept only when the same customer has
    orders in the preceding year as well.

    Args:
        customers_df: Customer table; must contain ``customer_id``.
        orders_df: Order table containing ``customer_id``, ``year``,
            ``revenue``, ``product`` and ``transaction_date``.

    Returns:
        DataFrame with columns ``year``, ``customer_id``, ``revenue``,
        ``nb_products``, ``frequency``, ``prev_year_revenue``,
        ``prev_year_nb_products`` and ``prev_year_frequency``.
    """
    joined = pd.merge(customers_df, orders_df, on='customer_id', how='inner')
    yearly = (
        joined.groupby(['year', 'customer_id'])
        .agg({
            'revenue': 'sum',
            'product': 'nunique',
            'transaction_date': 'nunique',
        })
        .reset_index()
    )

    # Shifting the year forward by one makes year Y's aggregates line up with
    # year Y + 1 in the join below. `assign`/`rename` return new frames, so
    # nothing is written into a slice of `yearly` (no SettingWithCopyWarning).
    previous = yearly.assign(year=yearly['year'] + 1).rename(columns={
        'revenue': 'prev_year_revenue',
        'product': 'prev_year_nb_products',
        'transaction_date': 'prev_year_frequency',
    })
    current = yearly.rename(columns={
        'product': 'nb_products',
        'transaction_date': 'frequency',
    })

    return pd.merge(current, previous, on=['year', 'customer_id'], how='inner')


def _model_frame(stats_df):
    """One-hot encode ``acquisition_channel`` and keep the modelling columns.

    The encoding is done on the full table, before any train/test split, so
    every split shares the same dummy columns and the same dropped baseline
    category regardless of which channels happen to occur in it.

    Args:
        stats_df: Table containing ``year``, ``revenue``, the ``prev_year_*``
            features, ``age`` and ``acquisition_channel``.

    Returns:
        DataFrame with ``year`` (kept only for splitting), the target
        ``revenue`` and the feature columns.

    Raises:
        KeyError: If one of the expected dummy columns is not produced from
            the data.
    """
    encoded = pd.get_dummies(
        stats_df, columns=['acquisition_channel'], drop_first=True
    )
    model_columns = [
        'year', 'revenue', 'prev_year_revenue',
        'prev_year_nb_products', 'prev_year_frequency', 'age',
        'acquisition_channel_radio', 'acquisition_channel_tv',
        'acquisition_channel_web',
    ]
    return encoded[model_columns]


def main():
    """Fit a linear model of yearly customer revenue and return its test RMSE.

    Reads ``customer_data.csv`` and ``orders_data.csv`` from the working
    directory, builds per-customer yearly aggregates with previous-year
    features, trains a ``LinearRegression`` on every year except 2018 and
    evaluates it on 2018.

    Returns:
        float: Root mean squared error of the predicted 2018 revenue.

    Raises:
        ValueError: If the training or the 2018 test split has no rows.
        KeyError: If an expected input or dummy column is missing.
    """
    test_year = 2018

    customers_df = pd.read_csv("customer_data.csv")
    orders_df = pd.read_csv("orders_data.csv")

    # compute additional columns
    orders_df['revenue'] = orders_df['price'] * orders_df['quantity']
    orders_df['year'] = pd.to_datetime(orders_df['transaction_date']).dt.year

    # customer level features
    # Age is measured against the current calendar year (not the order year).
    # The year is read once so every row uses the same reference; each birth
    # date is still parsed on its own, so mixed date formats keep working.
    current_year = datetime.now().year
    customers_df['age'] = customers_df['birth_date'].apply(
        lambda birth_date: current_year - pd.to_datetime(birth_date).year
    )

    # join info, compute aggregated table and add shifted features
    stats_df = _yearly_customer_stats(customers_df, orders_df)

    # join customer profile features
    stats_df = pd.merge(
        stats_df,
        customers_df[['customer_id', 'age', 'acquisition_channel']],
        on='customer_id',
        how='inner',
    )

    # Question 3:
    # 1. acquisition_channel added. The rationale is that this column may tell
    # us the customer's preferences, which could themselves be driven by many
    # demographic, personality or motivation factors that might be very
    # useful to capture.
    #
    # 2. frequency and prev_year_frequency features are added; they capture a
    # customer's shopping behaviour.

    # Modelling

    # Assumption: the intention is to forecast into the future. Therefore,
    # current-year features are not used. If the intention were also to
    # understand within-year contributors, the current-year information
    # would have been included as well.

    model_df = _model_frame(stats_df)

    ## train test split
    # NOTE(review): training uses every year other than the test year, so
    # any year later than 2018 present in the data would also be trained on.
    is_test = model_df['year'] == test_year
    train_df = model_df.loc[~is_test].drop(columns=['year'])
    test_df = model_df.loc[is_test].drop(columns=['year'])
    if train_df.empty or test_df.empty:
        raise ValueError(
            "Both the training split and the %d test split must contain rows"
            % test_year
        )

    # train model
    reg = LinearRegression().fit(
        train_df.drop(columns=['revenue']), train_df['revenue']
    )

    # compute test set performance
    y_hat = reg.predict(test_df.drop(columns=['revenue']))
    y = test_df['revenue']
    rmse = math.sqrt(mean_squared_error(y, y_hat))

    return rmse
    
    
    
    
    
    
    



If revenue is used as a proxy for demand, note that revenue per transaction is calculated as price * quantity, so we need the price in addition to the quantity of products sold. If price and quantity are the only information available (besides customer IDs and transaction IDs), we have to fall back on a forecasting model. If more features are available on customers as well as on orders, we can instead move to a hybrid (forecast + regression) model or a pure regression model.

    # Entry-point call: runs the whole pipeline. The RMSE that main() returns
    # is neither stored nor printed here.
    # NOTE(review): this call is indented although no enclosing statement is
    # visible — presumably an `if __name__ == '__main__':` guard was lost in
    # the notebook export; confirm.
    main()




p(buyer) = 0.01
p(non-buyer) = 0.99
p(buy | buyer) = 0.99
p(buy | non-buyer) = 0.04

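By Bayes' rule:

$$p(\text{buyer} \mid \text{buy}) = \frac{p(\text{buy} \mid \text{buyer}) \, p(\text{buyer})}{p(\text{buy})}$$

where, by the law of total probability,

$$p(\text{buy}) = p(\text{buy} \mid \text{buyer}) \, p(\text{buyer}) + p(\text{buy} \mid \text{non-buyer}) \, p(\text{non-buyer})$$

Substituting the values:

$$p(\text{buyer} \mid \text{buy}) = \frac{0.99 \times 0.01}{0.99 \times 0.01 + 0.04 \times 0.99} = \frac{0.0099}{0.0495} = 0.20$$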