Imports

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, log_loss
from tqdm import tqdm
from datetime import datetime
from collections import Counter
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import re
%matplotlib inline

Setting paths

In [2]:
# Base directories (absolute, machine-specific locations).
INPUT_PATH = "/Users/saby/Projects/playground/so1/input"
OUTPUT_PATH = "/Users/saby/Projects/playground/so1/output"
FEATURE_PATH = "/Users/saby/Projects/playground/so1/features"

# Bare file names; they are joined onto the directories further down.
TRAIN_FILE_NAME = "train.csv"
OUTPUT_FILE_NAME = "submit_1.csv"
FEATURE_FULL_FILE_NAME = "df_full.csv"

# Prefixes of the per-week feature files; get_feature_df appends
# "<week>.csv" to each of them.
FEATURE_USER_PREFIX = "/".join([FEATURE_PATH, "df_user_"])
FEATURE_ITEM_PREFIX = "/".join([FEATURE_PATH, "df_item_"])
FEATURE_USIT_PREFIX = "/".join([FEATURE_PATH, "df_user_item_"])

# Fully qualified file locations.
TRAIN_FILE = "/".join([INPUT_PATH, TRAIN_FILE_NAME])
OUTPUT_FILE = "/".join([OUTPUT_PATH, OUTPUT_FILE_NAME])
FEATURE_FULL_FILE = "/".join([FEATURE_PATH, FEATURE_FULL_FILE_NAME])

Setting constants

In [3]:
# Dimensions of the dense (user, item) label / prediction matrices built by
# get_y_true and the baseline_* helpers; user and item values index into them.
N_USER = 2000
N_ITEM = 40

In [4]:
# When True, a hold-out validation set is split off before training and the
# submission cells refuse to run; set to False to train on all rows.
VALIDATION=True
# Passed as test_size to train_test_split when VALIDATION is True.
SPLIT_SIZE=0.2

### Data frame handling functions

In [5]:
def df_read(FILE):
    """Load a CSV into a DataFrame and expand the short column names.

    Columns named 'i', 'j' and 't' are renamed to 'user', 'item' and 'week';
    any other column is left as it is.

    Parameters
    ----------
    FILE : str or file-like
        Anything accepted by pandas.read_csv.

    Returns
    -------
    pandas.DataFrame
    """
    column_aliases = {'i': 'user', 'j': 'item', 't': 'week'}
    return pd.read_csv(FILE).rename(columns=column_aliases)

def df_filter(df, user=None, item=None, week=None, price=None, advertised=None,
                      price_up_thresh=None, price_low_thresh=None, week_up_thresh=None):
    """Return the rows of df that satisfy every filter that was supplied.

    Filters left as None are ignored; the remaining ones are AND-ed together.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. Only the columns named by the active filters
        ('user', 'item', 'week', 'price', 'advertised') are accessed.
    user, item, week, price, advertised : scalar, optional
        Keep rows whose column of the same name equals the given value.
    price_up_thresh : scalar, optional
        Keep rows with price <= price_up_thresh.
    price_low_thresh : scalar, optional
        Keep rows with price >= price_low_thresh.
    week_up_thresh : scalar, optional
        Keep rows with week <= week_up_thresh.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with their original
        index labels. An empty frame yields an empty frame.
    """
    # A flat boolean mask with one entry per row. Built with numpy so that it
    # stays boolean even for a zero-row frame.
    mask = np.ones(len(df), dtype=bool)

    equality_filters = (
        ('user', user),
        ('item', item),
        ('week', week),
        ('price', price),
        ('advertised', advertised),
    )
    for column, wanted in equality_filters:
        if wanted is not None:
            mask &= (df[column] == wanted).values

    if price_up_thresh is not None:
        mask &= (df['price'] <= price_up_thresh).values
    if price_low_thresh is not None:
        mask &= (df['price'] >= price_low_thresh).values
    if week_up_thresh is not None:
        mask &= (df['week'] <= week_up_thresh).values

    return df[mask]



### Evaluation functions

In [6]:
def get_y_true(df, week):
    """Build the dense ground-truth matrix for one week, for use in evaluation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user', 'item' and 'week' columns.
    week : scalar
        Week whose rows are marked in the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape (N_USER, N_ITEM). Entry [u, i] is 1 when df has a
        row for that week with user u and item i, and 0 otherwise.
    """
    df_week = df_filter(df, week=week)
    y_true = np.zeros((N_USER, N_ITEM), 'float')
    # Mark all (user, item) pairs in a single vectorised assignment instead of
    # a Python-level iterrows() loop; duplicate pairs simply set the same cell.
    users = df_week['user'].astype(int).values
    items = df_week['item'].astype(int).values
    y_true[users, items] = 1
    return y_true

def evaluation_score(df, week, y_predict, metric='auc'):
    """Score a prediction matrix against the ground truth of a given week.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to get_y_true to build the ground truth.
    week : scalar
        Week to evaluate.
    y_predict : numpy.ndarray
        Predictions; flattened before scoring, so it must flatten to the same
        length as the ground-truth matrix.
    metric : str
        One of 'auc', 'precision', 'recall', 'f1', 'log loss', 'accuracy'.

    Returns
    -------
    The value returned by the matching sklearn.metrics function.

    Raises
    ------
    AssertionError
        If metric is not one of the supported names.
    """
    scorers = (
        ('auc', roc_auc_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('log loss', log_loss),
        ('accuracy', accuracy_score),
    )
    truth = get_y_true(df, week).flatten()
    flat_prediction = y_predict.flatten()
    for name, scorer in scorers:
        if metric == name:
            return scorer(truth, flat_prediction)
    # Unknown metric name.
    assert False

def baseline_zeros(df, week, metric):
    """Score the trivial baseline that predicts ZERO for every (user, item) pair.

    Builds an (N_USER, N_ITEM) float matrix of zeros and evaluates it for the
    given week with evaluation_score using the requested metric.
    """
    all_zero = np.zeros(shape=(N_USER, N_ITEM), dtype=float)
    return evaluation_score(df, week=week, y_predict=all_zero, metric=metric)

def baseline_ones(df, week, metric):
    """Score the trivial baseline that predicts ONE for every (user, item) pair.

    Builds an (N_USER, N_ITEM) float matrix of ones and evaluates it for the
    given week with evaluation_score using the requested metric.
    """
    all_one = np.ones(shape=(N_USER, N_ITEM), dtype=float)
    return evaluation_score(df, week=week, y_predict=all_one, metric=metric)

In [7]:
#baseline_ones(df, week=0, metric='precision')

### Feature engineering

Feature engineering helper functions

In [8]:
def df_init(n):
    """Create a column-less DataFrame with n rows, indexed 0 .. n-1."""
    row_labels = range(n)
    empty_frame = pd.DataFrame(index=row_labels)
    return empty_frame

def df_add_column(df, label, arr):
    """Attach arr to df as the column named label and return df.

    The values are assigned positionally (they are wrapped in a Series that
    carries df's own index). df is modified in place; an existing column with
    the same label is overwritten.
    """
    column = pd.Series(np.array(arr), index=df.index)
    df[label] = column
    return df

def df_del_column(df, label):
    """Return a copy of df without the column named label.

    The frame passed in is left untouched.
    """
    return df.drop(columns=[label])

### Model training begins here

In [9]:
# Load the raw training data; df_read renames the i/j/t columns to
# user/item/week.
df = df_read(TRAIN_FILE)
# Print the (rows, columns) shape, then preview the first rows as cell output.
print(df.shape)
df.head()

(76502, 5)


Unnamed: 0,user,item,week,price,advertised
0,4,7,0,2.137451,0
1,6,1,0,0.863341,0
2,8,6,0,0.799155,0
3,8,25,0,3.023893,0
4,9,6,0,0.799155,0


Let's read some saved features

In [10]:
def get_feature_df(df, week):
    """Read the saved features of one week and merge them into a single frame.

    Three CSV files are read for the week (user, item and user-item features,
    located via the FEATURE_*_PREFIX constants). The user-item frame is
    inner-joined with the item frame on 'USR_ITM_CURR_item' and then with the
    user frame on 'USR_ITM_CURR_user'. Finally a 'PREDICT' column holding the
    flattened get_y_true matrix of week + 1 is appended.

    NOTE(review): the labels are attached positionally, so this presumably
    expects the merged rows to be ordered by user and then item with exactly
    N_USER * N_ITEM rows -- confirm against the saved feature files.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to get_y_true to build the labels.
    week : int
        Week whose feature files are read; labels come from week + 1.

    Returns
    -------
    pandas.DataFrame
        Merged features plus the 'PREDICT' label column.
    """
    suffix = str(week) + '.csv'

    user_feats = df_read(FEATURE_USER_PREFIX + suffix)
    item_feats = df_read(FEATURE_ITEM_PREFIX + suffix)
    pair_feats = df_read(FEATURE_USIT_PREFIX + suffix)

    # Align the key column names with those used in the user-item frame.
    user_feats = user_feats.rename(columns={'USR_ID_user': 'USR_ITM_CURR_user'})
    item_feats = item_feats.rename(columns={'ITM_ID_item': 'USR_ITM_CURR_item'})

    # user-item features + item features, then + user features
    merged = pair_feats.merge(item_feats, on='USR_ITM_CURR_item', how='inner')
    merged = merged.merge(user_feats, on='USR_ITM_CURR_user', how='inner')

    # The per-source frames are no longer needed.
    del user_feats, item_feats, pair_feats
    gc.collect()

    # Attach the output class for the following week.
    next_week_labels = get_y_true(df, week=week+1).flatten()
    return df_add_column(merged, 'PREDICT', next_week_labels)

def get_full_feature_df():
    """Read the full feature file (FEATURE_FULL_FILE) into a DataFrame.

    Per the original notes, that CSV is the concatenation of the per-week
    frames produced by get_feature_df.
    """
    full_features = df_read(FEATURE_FULL_FILE)
    gc.collect()
    return full_features

In [11]:
# Load the saved feature table (read from FEATURE_FULL_FILE) and preview it.
df_full = get_full_feature_df()
df_full.head()

Unnamed: 0,USR_ITM_CURR_user,USR_ITM_CURR_item,USR_ITM_CURR_buy,USR_ITM_PAST_past_order_count,USR_ITM_PAST_last_reorder_length,USR_ITM_PAST_avg_reorder_length,NEXT_week_id,NEXT_advertised,ITM_ID_price,ITM_ID_disc_price,...,USR_PAST_n_time,USR_PAST_n_item_per_n_time,USR_PAST_old_purchase_ratio,USR_CURR_old_purchase_ratio,USR_PAST_ad_worked_ratio,USR_PAST_total_money_spent,USR_PAST_avg_money_per_item,USR_PAST_avg_money_per_n_time,USR_PAST_avg_money_per_week,PREDICT
0,0,0,0,0,0,0.0,2,0,1.717944,1.717944,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0.0,2,0,0.863341,0.863341,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,0,0,0,0.0,2,0,3.36606,3.36606,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,3,0,0,0,0.0,2,0,0.699985,0.699985,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,4,0,0,0,0.0,2,1,2.01219,2.01219,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Create the train/validation split and separate the features from the labels to predict

In [12]:
# Fixed seed so the random hold-out split (and therefore the reported
# validation scores) is the same on every run.
SPLIT_SEED = 42

if VALIDATION:
    # Hold out SPLIT_SIZE of the rows for validation.
    df_train, df_valid = train_test_split(df_full, test_size=SPLIT_SIZE, random_state=SPLIT_SEED)
else:
    # Train on every available row.
    df_train = df_full

# Separate the features from the 'PREDICT' label column.
X_train = df_train.drop(['PREDICT'], axis=1).values
y_train = np.array(df_train["PREDICT"])
# Free the large frames; only the numpy arrays are needed from here on.
del df_full, df_train
gc.collect()

if VALIDATION:
    X_valid = df_valid.drop(['PREDICT'], axis=1).values
    y_valid = np.array(df_valid["PREDICT"])
    del df_valid
    gc.collect()

print('train valid set ready for training')

Train: Using xgboost

In [None]:
import xgboost as xgb

# Booster configuration for xgboost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.4,
    'max_depth': 10,     # set it as 3 or 4
}

# The training set is always monitored; the validation set is added to the
# watchlist only in validation mode.
d_train = xgb.DMatrix(X_train, label=y_train)
watchlist = [(d_train, 'train')]
if VALIDATION:
    d_valid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist.append((d_valid, 'valid'))

# Up to 500 boosting rounds, stopping early after 10 rounds without improvement.
gbm = xgb.train(params, d_train, num_boost_round=500, evals=watchlist,
                early_stopping_rounds=10, verbose_eval=1)
gc.collect()

[0]	train-auc:0.812112	valid-auc:0.809993
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[1]	train-auc:0.815725	valid-auc:0.812342
[2]	train-auc:0.817268	valid-auc:0.813109
[3]	train-auc:0.819185	valid-auc:0.813053
[4]	train-auc:0.821136	valid-auc:0.81381
[5]	train-auc:0.823166	valid-auc:0.814045
[6]	train-auc:0.825136	valid-auc:0.814014
[7]	train-auc:0.827042	valid-auc:0.813911
[8]	train-auc:0.829303	valid-auc:0.813914
[9]	train-auc:0.831725	valid-auc:0.813717
[10]	train-auc:0.834142	valid-auc:0.813515
[11]	train-auc:0.83575	valid-auc:0.813264
[12]	train-auc:0.837439	valid-auc:0.813086
[13]	train-auc:0.839225	valid-auc:0.812901
[14]	train-auc:0.841015	valid-auc:0.812397


Reference result from another run with eta of about 0.4 and max_depth of 6: [37]	train-auc:0.820113	valid-auc:0.815991

### Getting testing predictions based on past training

In [None]:
# test_week=week+2
# df_test = get_feature_df(df, week=test_week)
# X_test = df_test.drop(['PREDICT'], axis=1).values
# y_test = np.array(df_test["PREDICT"])

# pred = np.array(gbm.predict(xgb.DMatrix(X_test)))
# roc_auc_score(y_test, pred)

Save model

In [None]:
#gbm.save_model('simple_model-test-full.model')

## Getting predictions on final week test data

In [None]:
# Safety stop: the cells below build the final submission, which should come
# from a model trained with VALIDATION=False.
if VALIDATION:
    print("ALERT!!!!!!!!!!!")
    print("YOU ARE TRYING TO GENERATE PREDICTIONS IN VALIDATION MODE")
    print("PLEASE SET VALIDATION=False AND TRY AGAIN")
    # don't go beyond here with Run All
    # An explicit exception is used instead of `assert False` because assert
    # statements are removed when Python runs with optimisation enabled.
    raise RuntimeError("Refusing to generate predictions while VALIDATION is True")

Generating submission output

In [None]:
# Features of the final week, used to predict the following week.
df_test = get_feature_df(df, week=48)
# get_feature_df appends a 'PREDICT' label column. It is dropped here, as it is
# for the training matrix, so that the model receives features only.
X_test = df_test.drop(columns=['PREDICT'], errors='ignore').values
pred = np.array(gbm.predict(xgb.DMatrix(X_test)))

# One predicted probability per row of df_test, written with the row index.
sub = pd.DataFrame()
sub['CLASS'] = pred
sub.to_csv('simple_model-test-single-all.csv', index=True)