In [6]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt 
from tqdm import tqdm
from sklearn.metrics import f1_score
import os 
import lightgbm as lgb
import gc

# Prep stage 

In [7]:
# Name of this run; it is also used as the name of the run's folder.
model_name = 'stack_1'
# Base directory under which the data folder and the per-model folders live.
MainPth = 'C:/Users/denis/Machine_Learning_Competitions/indian_hack/'
# Folder for this run's artifacts (the trailing slash is kept).
ModelFolder = '{}{}/'.format(MainPth, model_name)

In [8]:
def check_empy(pth):
    """Refuse to continue when ``pth`` already contains entries.

    Used as a guard so that results written by an earlier run are not
    overwritten by accident.

    Parameters
    ----------
    pth : str
        Path of the folder to inspect.

    Returns
    -------
    None
        Returned when ``pth`` does not exist or exists and is empty.

    Raises
    ------
    Exception
        If ``pth`` exists and ``os.listdir(pth)`` reports at least one entry.
        The message names the offending folder.
    """
    # A missing folder is fine: the caller creates it afterwards.
    if os.path.exists(pth) and len(os.listdir(pth)) > 0:
        raise Exception('Folder is not empty: ' + str(pth))

In [9]:
# Output folders of this run (train side and test side).
train_pth = '{}/train/'.format(ModelFolder)
test_pth = '{}/test/'.format(ModelFolder)

scores = []

output_dirs = (train_pth, test_pth)

# Stop right away if either folder already holds files, so that results
# from an earlier run are never overwritten.
for out_dir in output_dirs:
    check_empy(out_dir)

# Create whichever folder is still missing.
for out_dir in output_dirs:
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Read data 

In [10]:
# Folder that holds the competition CSV files.
data_pth = ''.join([MainPth, 'data/'])

In [11]:
# Training rows, ordered by their DateTime value with a fresh 0..n-1 index.
train = (
    pd.read_csv(data_pth + 'train.csv')
    .sort_values(by='DateTime')
    .reset_index(drop=True)
)

# Remaining inputs are loaded unchanged.
test, ssub, history = (
    pd.read_csv(data_pth + csv_name)
    for csv_name in ('test.csv', 'sample_submission.csv', 'historical_user_logs.csv')
)

# Give every row a running id: train rows first, test rows continue
# where train stops.
train['session_id'] = np.arange(len(train))
test['session_id'] = len(train) + np.arange(len(test))

In [12]:
# Earliest DateTime value in train, test and history (in that order).
tuple(frame['DateTime'].min() for frame in (train, test, history))

('2017-07-02 00:00', '2017-07-08 00:00', '2017-05-28 15:44')

In [13]:
# Latest DateTime value in train, test and history (in that order).
tuple(frame['DateTime'].max() for frame in (train, test, history))

('2017-07-07 23:59', '2017-07-09 21:29', '2017-07-01 23:59')

In [14]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,session_id,DateTime,user_id,product,campaign_id,webpage_id,product_category_1,product_category_2,user_group_id,gender,age_level,user_depth,city_development_index,var_1,is_click
0,0,2017-07-02 00:00,858557,C,359520,13787,4,,10.0,Female,4.0,3.0,3.0,0,0
1,1,2017-07-02 00:00,243253,C,105960,11085,5,,8.0,Female,2.0,2.0,,0,0
2,2,2017-07-02 00:00,243253,C,359520,13787,4,,8.0,Female,2.0,2.0,,0,0
3,3,2017-07-02 00:00,1097446,I,359520,13787,3,,3.0,Male,3.0,3.0,2.0,1,0
4,4,2017-07-02 00:01,663656,C,405490,60305,3,,2.0,Male,2.0,3.0,2.0,1,0


# Read per-model predictions

In [44]:
def _min_max_scale(values):
    """Rescale ``values`` linearly so the minimum maps to 0 and the maximum to 1.

    When all values are equal the denominator is zero and the result is NaN.
    """
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest)


def read_data(model_name):
    """Load one model's stored predictions and turn them into scaled ranks.

    Files are read from ``./<model_name>/train/`` and ``./<model_name>/test/``,
    i.e. relative to the current working directory.

    Train side: all CSV files are stacked row-wise, ordered by their
    ``DF_index`` column, the ``pred`` column is renamed to ``model_name``,
    replaced by its rank and min-max scaled; ``DF_index`` is then dropped.

    Test side: each CSV file must contain exactly one column. The files are
    placed side by side, ranked column-wise, the ranks are averaged per row
    and the average is min-max scaled.

    Parameters
    ----------
    model_name : str
        Name of the model folder; also the name of the resulting column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_frame, test_frame)``. ``test_frame`` has the single column
        ``model_name``; ``train_frame`` has ``model_name`` plus any other
        columns present in the train CSV files.

    Raises
    ------
    ValueError
        If the train or the test folder contains no files.
    """
    model_dir = './' + str(model_name)

    # train prep
    train_dir = model_dir + '/train/'
    # Sorted listing makes the read order independent of the file system.
    train_files = sorted(os.listdir(train_dir))
    if not train_files:
        raise ValueError('No prediction files found in ' + train_dir)
    # Read everything first and concatenate once instead of growing a frame
    # inside the loop (and without seeding from an empty DataFrame).
    temp_train = pd.concat(
        [pd.read_csv(train_dir + f) for f in train_files], axis=0
    )
    temp_train = temp_train.sort_values(by='DF_index').reset_index(drop=True)
    temp_train = temp_train.rename({'pred': model_name}, axis=1)
    temp_train[model_name] = _min_max_scale(temp_train[model_name].rank())
    temp_train = temp_train.drop('DF_index', axis=1)

    # test prep
    test_dir = model_dir + '/test/'
    test_files = sorted(os.listdir(test_dir))
    if not test_files:
        raise ValueError('No prediction files found in ' + test_dir)
    per_file_preds = []
    for f in test_files:
        temp = pd.read_csv(test_dir + f)
        # Name the single prediction column after its file so columns stay distinct.
        temp.columns = [f]
        per_file_preds.append(temp)
    mean_rank = pd.concat(per_file_preds, axis=1).rank().mean(axis=1)
    test_preds = _min_max_scale(mean_rank).to_frame(model_name)

    return temp_train, test_preds

In [45]:
# Collect the scaled predictions of every base model, one column per model.
train_parts = []
test_parts = []
for m in tqdm(['model_22', 'model_18', 'model_21']):
    train_temp, test_temp = read_data(m)
    # Train-side predictions go into the train matrix. The earlier version
    # appended test_temp here, so train_st silently held test predictions.
    train_parts.append(train_temp)
    test_parts.append(test_temp)

# Concatenate once, column-wise, instead of growing the frames in the loop.
train_st = pd.concat(train_parts, axis=1)
test_st = pd.concat(test_parts, axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.33s/it]


In [52]:
# Pairwise Spearman rank correlation between the models' test predictions.
test_st.corr(method='spearman')

Unnamed: 0,model_22,model_18,model_21
model_22,1.0,0.977332,0.988878
model_18,0.977332,1.0,0.987
model_21,0.988878,0.987,1.0


In [48]:
# Blend the models by averaging their per-column ranks row by row.
# The stored values are mean ranks, not values scaled to [0, 1].
blended_rank = test_st.rank().mean(axis=1)
ssub['is_click'] = blended_rank

submit_file = ModelFolder + model_name + '_submit.csv'
ssub.to_csv(submit_file, index=False)

In [50]:
# Preview the first rows of the submission frame.
ssub.head()

Unnamed: 0,session_id,is_click
0,411705,118034.666667
1,208263,17369.5
2,239450,15017.666667
3,547761,46232.666667
4,574275,84327.333333
