In [5]:
import csv
import datetime
import random
from operator import sub
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing, ensemble

In [6]:
mapping_dict = {'Gender':{'male':0, 'female':1, 'no_gender':2},
                'State':{'DELHI':0, 'WEST BENGAL':1, 'UTTAR PRADESH':2, 'MAHARASHTRA':3, 'TAMIL NADU':4, 'PUNJAB':5,
                         'JHARKHAND':6, 'KARNATAKA':7, 'no_state':8, 'MADHYA PRADESH':9}}

In [7]:
cat_cols = list(mapping_dict.keys())

In [8]:
target_cols = ['108000707', '108005676', '108037568', '108100290', '108100294',
       '108100306', '108100308', '108100352', '108100362', '108100382',
       '300070102', '300111517', '300481673', '300776409', '300776410',
       '300776411', '300781593', '300785148', '300785150', '300840018']

In [9]:
def getTarget(row):
    tlist = []
    for col in target_cols:
        if row[col].strip() in ['', 'NA']:
            target = 0
        else:
            target = int(float(row[col]))
        tlist.append(target)
    return tlist

In [10]:
def getIndex(row, col):
    val = row[col].strip()
    if val not in ['','NA']:
        ind = mapping_dict[col][val]
    else:
        ind = mapping_dict[col][-99]
    return ind

In [11]:
def getMonth(row):
    return int(row['transactionDate'].split('/')[0])

In [16]:
def processDataMK(in_file_name, cust_dict, lag_cust_dict):
    x_vars_list = []
    y_vars_list = []
    with open(in_file_name, 'r') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            
            if row['transactionDate'] not in ['4/28/2016', '5/28/2016', '6/28/2016', '4/28/2017', '5/28/2017', '6/28/2017']:
                continue
            #Leave out first month
            cust_id = (row['customerID'])
            if (row['transactionDate'] in ['4/28/2016', '4/28/2017'] ):
                target_list = getTarget(row)
                lag_cust_dict[cust_id] =  target_list[:]
                continue

            if (row['transactionDate'] in ['5/28/2016', '5/28/2016'] ):
                target_list = getTarget(row)
                cust_dict[cust_id] =  target_list[:]
                continue

            x_vars = []
            for col in cat_cols:
                x_vars.append( getIndex(row, col) )
            sex = getIndex(row, 'Gender')
            state = getIndex(row, 'State')
            x_vars.append(getMonth(row))
            #x_vars.append(getMarriageIndex(row, age, sex, income) )
            if row['transactionDate'] == '6/28/2017':
                prev_target_list = cust_dict.get(cust_id, [0]*20)
                lag_target_list = lag_cust_dict.get(cust_id, [0]*20)
                x_vars_list.append(x_vars + prev_target_list + lag_target_list)
            elif row['transactionDate'] == '6/28/2016':
                prev_target_list = cust_dict.get(cust_id, [0]*20)
                lag_target_list = lag_cust_dict.get(cust_id, [0]*20)
                target_list = getTarget(row)
                new_products = [max(x1 - x2,0) for (x1, x2) in zip(target_list, prev_target_list)]
                if sum(new_products) > 0:
                    for ind, prod in enumerate(new_products):
                        if prod>0:
                            assert len(prev_target_list) == 20
                            x_vars_list.append(x_vars+prev_target_list+lag_target_list)
                            y_vars_list.append(ind)


        return x_vars_list, y_vars_list, cust_dict, lag_cust_dict

In [17]:
def processData(in_file_name, cust_dict):
    x_vars_list = []
    y_vars_list = []
    for row in csv.DictReader(in_file_name):
        # use only the four months as specified by breakfastpirate #
        if row['transactionDate'] not in ['5/28/2016', '6/28/2016', '5/28/2017', '6/28/2017']:
            continue

        cust_id = int(row['customerID'])
        if row['transactionDate'] in ['5/28/2016', '5/28/2017']:
            target_list = getTarget(row)
            cust_dict[cust_id] =  target_list[:]
            continue
        
        x_vars = []
        for col in cat_cols:
            x_vars.append( getIndex(row, col) )
        sex = getIndex(row, 'Gender')
        state = getIndex(row, 'State')
        #x_vars.append( getMarriageIndex(row, age, sex, income) )

        if row['transactionDate'] == '6/28/2017':
            prev_target_list = cust_dict.get(cust_id, [0]*20)
            x_vars_list.append(x_vars + prev_target_list)
        elif row['transactionDate'] == '6/28/2016':
            prev_target_list = cust_dict.get(cust_id, [0]*20)
            target_list = getTarget(row)
            new_products = [max(x1 - x2,0) for (x1, x2) in zip(target_list, prev_target_list)]
            if sum(new_products) > 0:
                for ind, prod in enumerate(new_products):
                    if prod>0:
                        assert len(prev_target_list) == 20
                        x_vars_list.append(x_vars+prev_target_list)
                        y_vars_list.append(ind)

    return x_vars_list, y_vars_list, cust_dict

In [34]:
def runXGB(train_X, train_y, seed_val=42):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.16
    param['max_depth'] = 5
    param['silent'] = 1
    param['num_class'] = 24
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 12
    param['subsample'] = 0.85
    param['colsample_bytree'] = 0.9
    param['seed'] = seed_val
    num_rounds = 70

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    model = xgb.train(plst, xgtrain, num_rounds)	
    return model

In [35]:
if __name__ == "__main__":
    start_time = datetime.datetime.now()
    #data_path = "../input/"
    train_file = ("train_file_new.csv")
    print('Starting file processing')
    #x_vars_list, y_vars_list, cust_dict = processData(train_file, {})
    x_vars_list, y_vars_list, cust_dict, lag_cust_dict = processDataMK(train_file, {}, {})
    print('Finished file processing')
    train_X = np.array(x_vars_list)
    train_y = np.array(y_vars_list)
    print(np.unique(train_y))
    del x_vars_list, y_vars_list
    #train_file.close()
    print(train_X.shape, train_y.shape)
    print(datetime.datetime.now()-start_time)
    test_file = ("test_file_new.csv")
    x_vars_list, y_vars_list, cust_dict, lag_cust_dict = processDataMK(test_file, cust_dict, lag_cust_dict)
    test_X = np.array(x_vars_list)
    del x_vars_list
    #test_file.close()
    print(test_X.shape)
    print(datetime.datetime.now()-start_time)
    print(train_y)
    print("Building model..")
    model = runXGB(train_X, train_y, seed_val=0)
    del train_X, train_y
    print("Predicting..")
    xgtest = xgb.DMatrix(test_X)
    preds = model.predict(xgtest)
    del test_X, xgtest
    print(datetime.datetime.now()-start_time)

    print("Getting the top products..")
    test_id = np.array(pd.read_csv("sampleSubmission.csv", usecols=['customerID'])['customerID'])
    new_products = []
    for i, idx in enumerate(test_id):
        new_products.append([max(x1 - x2,0) for (x1, x2) in zip(preds[i,:], cust_dict[idx])])
    target_cols = np.array(target_cols)
    preds = np.argsort(np.array(new_products), axis=1)
    preds = np.fliplr(preds)[:,:7]
    final_preds = [" ".join(list(target_cols[pred])) for pred in preds]
    out_df = pd.DataFrame({'customerID':test_id, 'added_products':final_preds})
    out_df.to_csv('sub_xgb_new.csv', index=False)
    print(datetime.datetime.now()-start_time)

Starting file processing
Finished file processing
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
(2700, 43) (542,)
0:00:02.980572
(534, 43)
0:00:03.744083
[ 8 10  9  7  6  3  5 14  7  6 10  9  6  3 15 14 10  9  6  5 17  8 10 11  9
  5 14  8 11  9 15  5 14 17  7 17  2 15  1 14  9  8 10 11  9  3 15  5  2  4
 14 13 10 15  3 15 13  0 15  2  7  9  1  9 17  8  6 14 19 17 15 18  2  1 14
  8 10  9  7 11  6  3 18 18  2 12  1 17 17 14  7  4  8 11 10  9  6 15  5 14
  8  7 10  9 11  6  3  4 17  7  3  5  4 14  9 15  4 15 15 14 14  4 14 19 15
 14 10 15 14 15 18 14 14 17 13  8  9  6  3  4 17 18  3  5 17 14  8  9  7 11
  6  1 18  5  2 14  8 10 11  9  6 12 19 12 17 17 18  1  6 11 17 19 19 14  2
 14 14  5  8 11  9 19 17 17 12 19 17 17  6  3 15 14 14 14 13 15 14 14 17  8
  7  9  5  4 13 19 17 18  9  6  5  4 15 19 14 13 13  2  1  4 14  8 11  3 19
 19 17  7  9 18  2 15 15  3  4 12 19 16 16 16  7  3  5 16 16 15 13 16  9  6
  3  5 16  4 16  9 16 17 16 16 16 16 16 13  8  7 10  9 11  3 15  5 16 

XGBoostError: b'[14:19:20] C:\\xgboost\\src\\objective\\multiclass_obj.cc:43: Check failed: preds.size() == (static_cast<size_t>(param_.num_class) * info.labels.size()) SoftmaxMultiClassObj: label size and pred size does not match'

In [None]:
train_file =  open("all_clean_new.csv")
for row in csv.DictReader(train_file):
    if row['transactionDate'] in ['4/28/2016', '5/28/2016', '6/28/2016', '4/28/2017', '5/28/2017', '6/28/2017']:
        print(row['transactionDate'])