In [19]:
!python --version

Python 3.8.8


In [4]:
import pandas as pd
import glob
import numpy as np
import datetime

In [12]:
def interactions_input():

    # reading in all available in directory
    all_dir = glob.glob("interactions*.csv")
    list_df = []
    for file in all_dir:
        df = pd.read_csv(file)
        list_df.append(df)

    # combine all
    interactions = pd.concat(list_df, axis=0)

    # convert to datetime
    interactions["interaction_time"] = pd.to_datetime(interactions["interaction_time"])
    interactions.sort_values(by=["user_id", "interaction_time"], inplace=True)

    print(interactions.shape)

    return interactions


def items_input():

    # read in
    items = pd.read_csv("items_catalog.csv").sort_values(by="brand")

    # deal with the item to brand errors
    items["brand"] = items["brand"].apply(lambda x: np.nan if x == "(not set)" else x)
    items1 = items[
        items.brand.notnull()
    ]  # existing brand there is still brand with multiple items but very few
    # making assumption that multiple brands have same item, in this don't know for the item bought which brand it is
    items2 = items[
        (items.brand.isnull()) & (~(items.item_id).isin(list(items1.item_id)))
    ]  # brand not found also items won't be repeated if it exists in brand already
    items = pd.concat([items1, items2], axis=0)  # combine
    items["brand"] = items["brand"].fillna(
        "(not set)"
    )  # setting it back to not set because we can consider it still as a type of brand that we are not aware of

    print(items.shape)

    return items


def joined_tb(interactions, items):

    # joined on left since if item has no customer then we don't really care for now
    df = interactions.merge(items, on="item_id", how="left")

    return df


def first_last_visit(df):
    """
    outputs table of first and last visits for user level assuming any kind of interaction
    """

    # finding minimum and max will find the first and last date
    visits = df.groupby(["user_id"]).agg({"interaction_time": [min, max]}).reset_index()
    visits.columns = ["user_id", "first_visit_date", "last_visit_date"]

    return visits


def avg_btwn_visits(df):

    # shift to get new column that indicates next interaction datetime, already sorted
    df["next_interaction"] = df.groupby(["user_id"])["interaction_time"].shift(-1)

    # find interval between current and next
    df["next_visit_interval"] = df["next_interaction"] - df["interaction_time"]

    # convert interval to hours
    df["next_visit_interval"] = df["next_visit_interval"].apply(
        lambda x: x.seconds / 3600
    )

    # aggregate intervals based on user id, there are NaN meaning no next record yet
    t_btwn = df.groupby("user_id")["next_visit_interval"].mean().reset_index()

    t_btwn.columns = ["user_id", "avg_gap_between_visits"]

    return t_btwn


def avg_m_spending(df, visits):

    # only interested in purchase
    purchases = df[df["interaction_type"] == "Purchase"]

    # aggregate to total purchased
    purchases = purchases.groupby(["user_id"])["price"].sum().reset_index()

    # it's important to know how many months have passed since first activity to calculate average spending
    # there will be months of 0 so they should be considered
    # assume latest observed date is Feb, normally it should be latest transaction month and year
    visits["months_since_first"] = visits["first_visit_date"].apply(
        lambda x: (2020 - x.year) * 12 + 2 - x.month + 1
    )

    # merge to visits
    purchases = purchases.merge(visits, on="user_id", how="left")

    # take the sum/n
    purchases["avg_monthly_spending"] = (
        purchases["price"] / purchases["months_since_first"]
    )

    return purchases[["user_id", "avg_monthly_spending"]]


def fav_brand(df):
    """
    - two kinds of metric, most frequent purchase, and most frequent view
    - if most frequent is not available then most frequent view
    """

    # split into two tables to groupby counting the purchases and views per user per brand
    fav_purchase = (
        df[df["interaction_type"] == "Purchase"]
        .groupby(["user_id", "brand"])["interaction_type"]
        .count()
        .reset_index()
        .sort_values(by=["user_id", "interaction_type"])
    )
    fav_view = (
        df[df["interaction_type"] == "ProductView"]
        .groupby(["user_id", "brand"])["interaction_type"]
        .count()
        .reset_index()
        .sort_values(by=["user_id", "interaction_type"])
    )

    # aggregate by choosing the highest count number brand
    fav_purchase = fav_purchase.groupby("user_id")["brand"].last().reset_index()
    fav_view = fav_view.groupby("user_id")["brand"].last().reset_index()

    # combine
    fav_brand = fav_view.merge(fav_purchase, on="user_id", how="outer")

    # coalesce on the two columns
    fav_brand["favorite_brand"] = fav_brand["brand_y"].combine_first(
        fav_brand["brand_x"]
    )

    return fav_brand[["user_id", "favorite_brand"]]

def main():
    
    interactions = interactions_input()
    items = items_input()
    df = joined_tb(interactions, items)
    visits = first_last_visit(df)
    abv = avg_btwn_visits(df)
    ams = avg_m_spending(df, visits)
    fb = fav_brand(df)
    
    results = visits.merge(abv, on = 'user_id', how = 'outer')
    results = results.merge(ams, on = 'user_id', how = 'outer')
    results = results.merge(fb, on = 'user_id', how = 'outer')
    
    return results

if __name__ == '__main__':
    
    result = main()
    result
    

(20692840, 5)
(54595, 2)


In [6]:
interactions = interactions_input()
items = items_input()
df = joined_tb(interactions, items)

(20692840, 5)
(54595, 2)


In [11]:
visits = first_last_visit(df)
abv = avg_btwn_visits(df)
ams = avg_m_spending(df, visits)
fb = fav_brand(df)


In [14]:
ams

Unnamed: 0,user_id,avg_monthly_spending
0,000148b5-6ec2-47a5-83ba,64.260000
1,0002c237-9155-4d19-a0b4,44.513333
2,0003fc81-ef96-4d4a-a567,3.380000
3,00056859-dca9-4a63-af7c,33.280000
4,00059bbf-0b42-43a4-ad77,7.300000
...,...,...
110513,fffca46c-4121-48de-bc62,2.528000
110514,fffda61e-f290-4f8e-b4f5,2.380000
110515,fffe9def-a623-41fd-bda4,32.300000
110516,fffebbf0-b111-47a5-b45f,7.522000


In [15]:
result.count()

user_id                   1639358
first_visit_date          1639358
last_visit_date           1639358
months_since_first        1639358
avg_gap_between_visits     869999
avg_monthly_spending       110518
favorite_brand            1603066
dtype: int64

In [18]:
result.describe()

Unnamed: 0,months_since_first,avg_gap_between_visits,avg_monthly_spending
count,1639358.0,869999.0,110518.0
mean,3.114745,2.457979,18.773999
std,1.442436,4.647417,29.9961
min,1.0,0.0,-10.52
25%,2.0,0.015397,5.105
50%,3.0,0.192778,10.31
75%,4.0,2.513889,21.099167
max,5.0,23.999444,1850.4


# there are many nulls for avg_monthly_spending but the count is right, there are not many purchases

In [16]:
result[result['avg_monthly_spending'].notnull()]

Unnamed: 0,user_id,first_visit_date,last_visit_date,months_since_first,avg_gap_between_visits,avg_monthly_spending,favorite_brand
28,000148b5-6ec2-47a5-83ba,2020-02-11 00:02:27,2020-02-29 22:00:56,1,0.665901,64.260000,Lymyso
63,0002c237-9155-4d19-a0b4,2019-12-01 11:42:37,2019-12-05 16:47:20,3,0.417942,44.513333,Zyzam
98,0003fc81-ef96-4d4a-a567,2019-12-02 11:32:54,2019-12-02 13:00:21,3,0.208214,3.380000,Yuzusaxay
127,00056859-dca9-4a63-af7c,2020-01-20 14:55:51,2020-02-28 14:06:06,2,0.246498,33.280000,Mykek
133,00059bbf-0b42-43a4-ad77,2020-01-10 20:37:13,2020-01-11 04:38:34,2,0.891389,7.300000,(not set)
...,...,...,...,...,...,...,...
1639275,fffca46c-4121-48de-bc62,2019-10-02 12:27:48,2019-10-03 16:21:15,5,0.961753,2.528000,Huk.uja
1639298,fffda61e-f290-4f8e-b4f5,2020-01-09 00:05:14,2020-01-09 00:15:52,2,0.029537,2.380000,M osor
1639324,fffe9def-a623-41fd-bda4,2020-01-03 20:43:08,2020-01-03 20:52:59,2,0.027361,32.300000,V.abob
1639326,fffebbf0-b111-47a5-b45f,2019-10-01 13:41:30,2019-12-07 16:45:12,5,1.702056,7.522000,Sokej


In [20]:
result[result['avg_gap_between_visits'].notnull()]

Unnamed: 0,user_id,first_visit_date,last_visit_date,months_since_first,avg_gap_between_visits,avg_monthly_spending,favorite_brand
0,00000a88-2beb-4f3a-bf8a,2019-12-16 09:15:42,2020-02-26 12:59:50,3,5.155648,,Mykek
1,00000cfa-9a75-4ee4-8ff0,2020-02-07 15:53:22,2020-02-29 04:19:54,1,6.221111,,Beb ip aq
2,00001957-83e2-4e36-aacf,2020-01-31 07:07:19,2020-02-11 15:06:17,2,2.332616,,(not set)
3,00001af7-43f5-4c04-be0e,2020-02-13 18:08:22,2020-02-20 16:16:13,1,0.962210,,(not set)
6,00004682-fc9a-4c33-a0e1,2019-10-02 09:26:10,2019-11-07 11:52:00,5,0.486111,,Peru
...,...,...,...,...,...,...,...
1639342,ffff3c2b-ae88-4467-8c54,2019-12-21 03:46:03,2019-12-21 03:47:03,3,0.016667,,(not set)
1639345,ffff4d6e-d176-47d9-922f,2019-11-29 16:30:42,2019-11-29 16:32:31,4,0.030278,,Tezuwiv
1639346,ffff6bd0-f885-48f1-b613,2019-10-16 20:28:48,2020-02-02 14:46:06,5,1.922197,10.392,Tydukipa
1639352,ffffac26-6ed5-4538-ad1d,2019-10-11 20:40:18,2019-10-11 20:43:28,5,0.007540,,(not set)
