Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, log_loss
from tqdm import tqdm
from datetime import datetime
from collections import Counter
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import re
%matplotlib inline

Setting paths

In [None]:
# Directories holding the input data and the generated submission.
INPUT_PATH  = "/Users/saby/Projects/playground/so1/input"
OUTPUT_PATH = "/Users/saby/Projects/playground/so1/output"

# File names inside those directories.
TRAIN_FILE_NAME, OUTPUT_FILE_NAME = "train.csv", "submit_1.csv"

# Full paths: directory and file name joined with a forward slash.
TRAIN_FILE  = "/".join((INPUT_PATH, TRAIN_FILE_NAME))
OUTPUT_FILE = "/".join((OUTPUT_PATH, OUTPUT_FILE_NAME))

Setting constants

In [None]:
# Sizes of the user and item id spaces. The code below uses them as array
# dimensions and indexes those arrays directly with the user / item ids.
N_USER, N_ITEM = 2000, 40

### Data frame handling functions

In [None]:
def df_read(FILE):
    """Read a CSV file into a dataframe with descriptive column names.

    The columns 'i', 'j' and 't' are renamed to 'user', 'item' and 'week';
    every other column keeps its name.
    """
    readable_names = {'i': 'user', 'j': 'item', 't': 'week'}
    return pd.read_csv(FILE).rename(columns=readable_names)

def df_filter(df, user=None, item=None, week=None, price=None, advertised=None,
                      price_up_thresh=None, price_low_thresh=None, week_up_thresh=None):
    """Return the rows of `df` that satisfy every criterion that is given.

    Criteria left as None are ignored; with no criteria all rows are returned.

    Equality criteria:
        user, item, week, price, advertised -- keep rows whose column of the
        same name equals the given value.
    Threshold criteria (inclusive):
        price_up_thresh  -- keep rows with price <= price_up_thresh
        price_low_thresh -- keep rows with price >= price_low_thresh
        week_up_thresh   -- keep rows with week  <= week_up_thresh

    The result keeps the original row order and index labels.
    """
    # One flat boolean per row, narrowed in place by each active criterion.
    mask = np.ones(df.shape[0], dtype=bool)

    equalities = (('user', user), ('item', item), ('week', week),
                  ('price', price), ('advertised', advertised))
    for column, wanted in equalities:
        if wanted is not None:
            mask &= (df[column] == wanted).values

    if price_up_thresh is not None:
        mask &= (df['price'] <= price_up_thresh).values
    if price_low_thresh is not None:
        mask &= (df['price'] >= price_low_thresh).values
    if week_up_thresh is not None:
        mask &= (df['week'] <= week_up_thresh).values

    return df[mask]



In [None]:
# Load the training data, then show its (rows, columns) shape and first rows.
df = df_read(TRAIN_FILE)
print(df.shape)
df.head()

In [None]:
# Example: rows for user 3 in week 1 whose price is at least 1.8.
df_filter(df, user=3, week=1, price_low_thresh=1.8)

### Evaluation functions

In [None]:
def get_y_true(df, week):
    """Build the ground-truth matrix for one week, for use in evaluation.

    Returns a float array of shape (N_USER, N_ITEM) holding 1 at
    [user, item] for every row of `df` recorded in `week`, and 0 elsewhere.
    """
    # Use the requested week (this was previously hard-coded to week 1).
    df_week = df_filter(df, week=week)
    y_true = np.zeros((N_USER, N_ITEM), 'float')
    users = df_week['user'].values.astype(int)
    items = df_week['item'].values.astype(int)
    # Plain assignment, so repeated (user, item) rows still yield 1.
    y_true[users, items] = 1
    return y_true

def evaluation_score(df, week, y_predict, metric='auc'):
    """Score a prediction for `week` against that week's ground truth.

    `y_predict` and the ground truth from get_y_true are flattened and passed
    to the scikit-learn scorer selected by `metric`, which must be one of
    'auc', 'precision', 'recall', 'f1', 'log loss' or 'accuracy'.
    Any other value fails an assertion.
    """
    scorers = {
        'auc': roc_auc_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
        'log loss': log_loss,
        'accuracy': accuracy_score,
    }
    truth = get_y_true(df, week).flatten()
    predicted = y_predict.flatten()
    if metric not in scorers:
        assert False
    return scorers[metric](truth, predicted)

def baseline_zeros(df, week, metric):
    """Score the trivial prediction where every (user, item) entry is 0."""
    all_zero = np.full((N_USER, N_ITEM), 0.0)
    return evaluation_score(df, week=week, y_predict=all_zero, metric=metric)

def baseline_ones(df, week, metric):
    """Score the trivial prediction where every (user, item) entry is 1."""
    all_one = np.full((N_USER, N_ITEM), 1.0)
    return evaluation_score(df, week=week, y_predict=all_one, metric=metric)



In [None]:
# Sanity check: precision of the all-ones baseline for week 0.
baseline_ones(df, week=0, metric='precision')

### Feature engineering

Feature engineering helper functions

In [None]:
def df_init(n):
    """Return a dataframe with `n` rows (index 0..n-1) and no columns."""
    row_labels = range(n)
    empty_frame = pd.DataFrame(index=row_labels)
    return empty_frame

def df_add_column(df, label, arr):
    """Write `arr` into column `label` of `df` and return `df`.

    The values are matched to the rows of `df` by position. If the column
    already exists it is overwritten. `df` itself is modified.
    """
    values = np.array(arr)
    column = pd.Series(values, index=df.index)
    df[label] = column
    return df

def df_del_column(df, label):
    """Return a copy of `df` without the column `label`.

    The dataframe passed in is left unchanged.
    """
    return df.drop(columns=[label])

In [None]:
# Empty feature tables: one row per user, per item and per (user, item) pair.
df_user, df_item, df_user_item = (
    df_init(n_rows) for n_rows in (N_USER, N_ITEM, N_USER * N_ITEM)
)

In [None]:
# week : the current (reference) week; features use purchases up to and
#        including it. The following week is the one we want to predict,
#        so pass week+1 to evaluation_score.

def _add_feature(frame, label, values):
    """Store `values` as column `label` of `frame` and log that it is done."""
    frame = df_add_column(frame, label, values)
    print(str(label) + ' done!')
    return frame


def _split_by(frame, column, n_keys):
    """Return [rows of `frame` where `column` == k] for k in range(n_keys).

    Same rows, order and index as df_filter(frame, <column>=k), but the frame
    is scanned once instead of once per key.
    """
    groups = {key: rows for key, rows in frame.groupby(column)}
    no_rows = frame.iloc[0:0]
    return [groups.get(key, no_rows) for key in range(n_keys)]


def _mean_or_nan(values):
    """Mean of `values`, or NaN (without a RuntimeWarning) when it is empty."""
    if len(values) == 0:
        return np.nan
    return float(np.mean(values))


def _mean_gap(ordered):
    """Mean difference between consecutive entries; NaN with fewer than two."""
    if len(ordered) < 2:
        return np.nan
    return float(np.diff(ordered).mean())


def _sum_per_row(frames, column):
    """For each frame: sum of `column` divided by its row count (0 if empty)."""
    ratios = np.zeros(len(frames), 'float')
    for k, rows in enumerate(frames):
        if rows.shape[0] > 0:
            ratios[k] = rows[column].sum() / float(rows.shape[0])
    return ratios


# for week in range(46,1,-5):   # features useful for test
for week in range(45,1,-5):   # features useful for train
    df_curr_week  = df_filter(df, week=week)
    df_past_weeks = df_filter(df, week_up_thresh=week)   # includes the current week

    # Slice the purchase history once per user / item instead of re-filtering
    # the whole frame inside every feature loop.
    user_curr  = _split_by(df_curr_week,  'user', N_USER)
    user_past  = _split_by(df_past_weeks, 'user', N_USER)
    item_curr  = _split_by(df_curr_week,  'item', N_ITEM)
    item_past  = _split_by(df_past_weeks, 'item', N_ITEM)
    item_plain = [df_filter(rows, advertised=0) for rows in item_past]
    item_promo = [df_filter(rows, advertised=1) for rows in item_past]
    # Sorted distinct purchase weeks of every user.
    user_weeks = [np.unique(rows['week'].values) for rows in user_past]

    ############
    print('Generating user features: will take at least ' + str(week) + ' mins...')

    df_user = _add_feature(df_user, 'USR_ID_user', np.arange(N_USER))

    # Number of distinct users buying this week (same value for every user).
    n_purchasing_user = len(set(df_curr_week['user']))
    df_user = _add_feature(df_user, 'USR_CURR_n_purchasing_user',
                           np.array([n_purchasing_user for _ in range(N_USER)]))

    df_user = _add_feature(df_user, 'USR_CURR_n_item',
                           np.array([rows.shape[0] for rows in user_curr]))

    arr = np.nan_to_num(np.array([rows['price'].sum() for rows in user_curr]))
    df_user = _add_feature(df_user, 'USR_CURR_total_money_spent', arr)

    df_user = _add_feature(df_user, 'USR_CURR_ad_worked_ratio',
                           _sum_per_row(user_curr, 'advertised'))

    df_user = _add_feature(df_user, 'USR_CURR_avg_money_per_item',
                           _sum_per_row(user_curr, 'price'))

    # Mean gap between consecutive purchase weeks; NaN with fewer than two weeks.
    arr = np.array([_mean_gap(weeks) for weeks in user_weeks], 'float')
    df_user = _add_feature(df_user, 'USR_CURR_avg_reorder_length', arr)

    # Weeks since the user's most recent purchase week before the current one.
    # Users without such a week keep 0 (this used to raise IndexError).
    arr = np.zeros((N_USER), 'float')
    for user, weeks in enumerate(user_weeks):
        if len(weeks) > 0 and weeks[-1] != week:
            arr[user] = week - weeks[-1]
        elif len(weeks) > 1:
            arr[user] = week - weeks[-2]
    df_user = _add_feature(df_user, 'USR_CURR_reorder_length', arr)

    # Average number of distinct buyers over the weeks in which the user bought.
    buyers_in_week = {wk: users.nunique()
                      for wk, users in df_past_weeks.groupby('week')['user']}
    arr = np.array([_mean_or_nan([buyers_in_week[wk] for wk in weeks])
                    for weeks in user_weeks], 'float')
    df_user = _add_feature(df_user, 'USR_PAST_n_purchasing_user', arr)

    df_user = _add_feature(df_user, 'USR_PAST_n_item',
                           np.array([rows.shape[0] for rows in user_past]))

    df_user = _add_feature(df_user, 'USR_PAST_n_time',
                           np.array([len(weeks) for weeks in user_weeks]))

    arr = np.zeros((N_USER), 'float')
    for user in range(N_USER):
        n_time = len(user_weeks[user])
        if n_time > 0:
            arr[user] = float(user_past[user].shape[0]) / n_time
    df_user = _add_feature(df_user, 'USR_PAST_n_item_per_n_time', arr)

    # How often each user bought each item so far.
    item_counts = [Counter(rows['item'].values) for rows in user_past]

    # Share of past purchases that were repeats of an item bought earlier.
    # Users without purchases keep 0 (this used to raise ZeroDivisionError).
    arr = np.zeros((N_USER), 'float')
    for user, counts in enumerate(item_counts):
        n_rows = user_past[user].shape[0]
        if n_rows > 0:
            n_reordered = sum(count - 1 for count in counts.values() if count > 1)
            arr[user] = float(n_reordered) / n_rows
    df_user = _add_feature(df_user, 'USR_PAST_old_purchase_ratio', arr)

    # Share of this week's purchases whose item the user has bought more than
    # once so far. The denominator is the user's own current-week row count
    # (it was user 0's before); users buying nothing this week keep 0.
    arr = np.zeros((N_USER), 'float')
    for user, counts in enumerate(item_counts):
        bought_now = user_curr[user]['item'].values
        if len(bought_now) > 0:
            repeated = set(it for it, count in counts.items() if count > 1)
            n_reordered = sum(1 for it in bought_now if it in repeated)
            arr[user] = float(n_reordered) / len(bought_now)
    df_user = _add_feature(df_user, 'USR_CURR_old_purchase_ratio', arr)

    df_user = _add_feature(df_user, 'USR_PAST_ad_worked_ratio',
                           _sum_per_row(user_past, 'advertised'))

    past_spent = np.array([rows['price'].sum() for rows in user_past])
    df_user = _add_feature(df_user, 'USR_PAST_total_money_spent', past_spent)

    df_user = _add_feature(df_user, 'USR_PAST_avg_money_per_item',
                           _sum_per_row(user_past, 'price'))

    # Money spent per purchase week of the same user (the week count was
    # previously taken from a stale loop variable, i.e. from another user).
    arr = np.zeros((N_USER), 'float')
    for user in range(N_USER):
        n_time = float(len(user_weeks[user]))
        if n_time > 0:
            arr[user] = past_spent[user] / n_time
    df_user = _add_feature(df_user, 'USR_PAST_avg_money_per_n_time', arr)

    # Money spent per elapsed week; week + 1 is positive for every week looped over.
    arr = np.zeros((N_USER), 'float')
    for user in range(N_USER):
        arr[user] = past_spent[user] / float(week+1)
    df_user = _add_feature(df_user, 'USR_PAST_avg_money_per_week', arr)

    name = 'df_user_' + str(week) + '.csv'
    df_user.to_csv(name, index=False)

    ##############
    print('Generating item features: will take at least ' + str(week) + ' mins...')

    df_item = _add_feature(df_item, 'ITM_ID_item', np.arange(N_ITEM))

    # Latest price at which the item was sold while not advertised.
    # NOTE(review): still raises IndexError for an item that exists but was
    # only ever sold while advertised -- decide on a fallback.
    arr = np.zeros((N_ITEM), 'float')
    for item in range(N_ITEM):
        if not item_past[item].empty:                              # item exists
            price = item_plain[item]['price'].iloc[-1]
        else:                                                      # item doesn't exist
            assert False
        arr[item] = price
    df_item = _add_feature(df_item, 'ITM_ID_price', arr)

    # Latest advertised price, falling back to the latest unadvertised price.
    arr = np.zeros((N_ITEM), 'float')
    for item in range(N_ITEM):
        if not item_promo[item].empty:            # item exists and advertised before
            discount_price = item_promo[item]['price'].iloc[-1]
        elif not item_past[item].empty:           # item not advertised but exists
            discount_price = item_plain[item]['price'].iloc[-1]
        else:                                     # item not advertised and doesnt exist
            assert False
        arr[item] = discount_price
    df_item = _add_feature(df_item, 'ITM_ID_disc_price', arr)

    # 1 when the item has an advertised purchase in the current week.
    promoted_now = set(df_filter(df_curr_week, advertised=1)['item'])
    arr = np.array([int(item in promoted_now) for item in range(N_ITEM)])
    df_item = _add_feature(df_item, 'ITM_ID_advertised', arr)

    # need not use set as only instance of user-item purchase is reported
    arr = np.array([rows.shape[0] for rows in item_curr])
    df_item = _add_feature(df_item, 'ITM_CURR_n_user_buying', arr)

    # Number of distinct past weeks in which the item was sold advertised.
    arr = np.array([len(set(rows['week'])) for rows in item_promo], 'float')
    df_item = _add_feature(df_item, 'ITM_PAST_ad_count', arr)

    # Mean weekly sales in advertised weeks relative to the sum of the means
    # for advertised and unadvertised weeks. 0 when never advertised, NaN when
    # the item was never sold unadvertised.
    arr = np.zeros((N_ITEM), 'float')
    for item in range(N_ITEM):
        if item_promo[item].empty:
            continue
        sales_per_week = Counter(item_past[item]['week'].values)
        when_advertised = _mean_or_nan(
            [sales_per_week[wk] for wk in set(item_promo[item]['week'])])
        when_not_advertised = _mean_or_nan(
            [sales_per_week[wk] for wk in set(item_plain[item]['week'])])
        arr[item] = when_advertised / float(when_advertised + when_not_advertised)
    df_item = _add_feature(df_item, 'ITM_PAST_ad_worked_ratio', arr)

    # Mean number of purchases per week over the weeks in which the item sold.
    arr = np.zeros((N_ITEM), 'float')
    for item in range(N_ITEM):
        sales_per_week = Counter(item_past[item]['week'].values)
        arr[item] = _mean_or_nan(list(sales_per_week.values()))
    df_item = _add_feature(df_item, 'ITM_PAST_n_user_buying', arr)

    # How often each user bought each item so far.
    buyer_counts = [Counter(rows['user'].values) for rows in item_past]

    # Mean number of repeat purchases among users who bought the item more
    # than once; NaN when nobody did.
    arr = np.zeros((N_ITEM), 'float')
    for item, counts in enumerate(buyer_counts):
        arr[item] = _mean_or_nan([count - 1 for count in counts.values() if count > 1])
    df_item = _add_feature(df_item, 'ITM_PAST_old_reorder_freq', arr)          # Verify for correctness

    # Share of the item's purchases that were repeat purchases by the same user.
    arr = np.zeros((N_ITEM), 'float')
    for item, counts in enumerate(buyer_counts):
        n_rows = item_past[item].shape[0]
        if n_rows > 0:
            n_reordered = sum(count - 1 for count in counts.values() if count > 1)
            arr[item] = float(n_reordered) / n_rows
    df_item = _add_feature(df_item, 'ITM_PAST_old_purchase_ratio', arr)

    # Per-user mean gap between purchases of the item, summed over users with
    # at least two purchases and divided by the number of distinct buyers.
    # Weeks are sorted first, as in the user-level reorder features.
    arr = np.zeros((N_ITEM), 'float')
    for item in range(N_ITEM):
        weeks_by_user = item_past[item].groupby('user')['week']
        n_buyers = 0
        gap_total = 0.0
        for _, weeks in weeks_by_user:
            n_buyers += 1
            if len(weeks) > 1:
                gap_total += _mean_gap(np.sort(weeks.values))
        if n_buyers > 0:
            arr[item] = gap_total / n_buyers
    df_item = _add_feature(df_item, 'ITM_PAST_avg_reorder_length', arr)

    #Generating item-item co-occurence matrix
    # Diagonal elements: when it did not co occur
    # Non-diagonal elements: when it co occurred
    co_occur = np.zeros((N_ITEM,N_ITEM), 'float')
    for rows in user_past:
        for _, basket in rows.groupby('week')['item']:
            items = basket.values
            if len(items) == 1:
                ##### should this be turned off or not? check later
                # .iloc[0] instead of int(Series), which newer pandas deprecates
                only_item = int(basket.iloc[0])
                co_occur[only_item, only_item] += 1
                continue
            # Count every unordered pair of distinct items once, symmetrically.
            for a in items:
                for b in items:
                    if a <= b:
                        continue
                    co_occur[a,b] += 1
                    co_occur[b,a] += 1

    for item in range(N_ITEM):
        label = 'ITM_PAST_co_occur_' + str(item)
        arr = co_occur[item]
        df_item = df_add_column(df_item, label, arr)
    print(str(label) + ' done!')

    name = 'df_item_' + str(week) + '.csv'
    df_item.to_csv(name, index=False)

    ###############
    print('Generating user-item features: will take at least ' + str(week) + ' mins...')

    # Row layout of df_user_item: row user*N_ITEM + item describes (user, item).
    df_user_item = _add_feature(df_user_item, 'USR_ITM_CURR_user',
                                np.repeat(np.arange(N_USER), N_ITEM))

    df_user_item = _add_feature(df_user_item, 'USR_ITM_CURR_item',
                                np.tile(np.arange(N_ITEM), N_USER))

    curr_slots = (df_curr_week['user'].values.astype(int) * N_ITEM
                  + df_curr_week['item'].values.astype(int))
    past_slots = (df_past_weeks['user'].values.astype(int) * N_ITEM
                  + df_past_weeks['item'].values.astype(int))

    arr = np.zeros((N_USER * N_ITEM), 'int')
    arr[curr_slots] = 1
    df_user_item = _add_feature(df_user_item, 'USR_ITM_CURR_buy', arr)

    arr = np.zeros((N_USER * N_ITEM), 'int')
    np.add.at(arr, past_slots, 1)      # unbuffered add: repeated slots accumulate
    df_user_item = _add_feature(df_user_item, 'USR_ITM_PAST_past_order_count', arr)

    # One pass over the (user, item) histories fills both reorder features.
    last_gap = np.zeros((N_USER * N_ITEM), 'int')
    avg_gap = np.zeros((N_USER * N_ITEM), 'float')
    bought_now = set(curr_slots.tolist())
    for (user, item), weeks in df_past_weeks.groupby(['user', 'item'])['week']:
        slot = int(user) * N_ITEM + int(item)
        ordered = np.sort(weeks.values)
        # Weeks since the pair's latest purchase before the current week.
        if ordered[-1] != week:
            last_gap[slot] = week - ordered[-1]
        elif len(ordered) > 1:
            last_gap[slot] = week - ordered[-2]
        # NOTE(review): as before, the average gap is only filled for pairs
        # bought in the current week -- confirm that this is intended.
        if slot in bought_now and len(ordered) > 1:
            avg_gap[slot] = _mean_gap(ordered)
    df_user_item = _add_feature(df_user_item, 'USR_ITM_PAST_last_reorder_length', last_gap)
    df_user_item = _add_feature(df_user_item, 'USR_ITM_PAST_avg_reorder_length', avg_gap)

    print('Generating time features: will take at least 1 min...')
    df_user_item = _add_feature(df_user_item, 'NEXT_week_id',
                                np.full(N_USER * N_ITEM, week+1, dtype='int'))

    # 1 for every item that has an advertised purchase in the following week.
    next_promoted = set(df_filter(df, week=week+1, advertised=1)['item'])
    item_flags = np.array([int(item in next_promoted) for item in range(N_ITEM)], 'int')
    df_user_item = _add_feature(df_user_item, 'NEXT_advertised',
                                np.tile(item_flags, N_USER))

    name = 'df_user_item_' + str(week) + '.csv'
    df_user_item.to_csv(name, index=False)

    gc.collect()