In [1]:
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell #Show all consecutive outputs
InteractiveShell.ast_node_interactivity = "all"

#Removes all unnecessary warnings by Python
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
import json

import scipy as sp

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.shape

(766787, 3)

In [4]:
test.shape

(3000, 1)

In [4]:
train['PID'].value_counts()

1011411    2476
1005873    2469
1006015    2339
1023020    2329
1000793    2296
1025840    2011
1011099    1976
1012771    1869
1013779    1861
1029173    1855
1010524    1741
1031190    1740
1030454    1595
1004294    1580
1007271    1537
1027290    1517
1009171    1502
1020435    1480
1003793    1474
1013333    1433
1010610    1378
1009466    1375
1013430    1373
1007640    1371
1028094    1328
1002708    1318
1031617    1306
1016736    1303
1008272    1286
1027831    1224
           ... 
1018393      28
1012414      28
1026168      27
1017196      26
1029864      26
1014239      26
1019299      26
1025645      25
1020497      25
1015666      23
1020115      21
1018233      20
1002208      20
1027369      20
1017984      19
1006122      19
1022905      18
1010074      18
1022979      18
1006395      18
1009665      17
1016810      17
1001510      17
1027946      16
1001508      15
1021496      15
1009474      14
1016210      14
1029677      14
1025352      11
Name: PID, Length: 3000,

In [5]:
train.head()

Unnamed: 0,PID,Date,Event
0,1028890,201101,2186
1,1028890,201101,7087
2,1028890,201101,4848
3,1028890,201101,2214
4,1028890,201102,7087


In [6]:
test.head()

Unnamed: 0,PID
0,1028890
1,1021443
2,1016194
3,1017298
4,1006098


In [7]:
train.isnull().any()

PID      False
Date     False
Event    False
dtype: bool

In [10]:
test['PID'].isin(train['PID']).value_counts()

True    3000
Name: PID, dtype: int64

In [12]:
train['PID'].isin(test['PID']).value_counts()

True    766787
Name: PID, dtype: int64

In [13]:
train['PID'].value_counts()

1011411    2476
1005873    2469
1006015    2339
1023020    2329
1000793    2296
1025840    2011
1011099    1976
1012771    1869
1013779    1861
1029173    1855
1010524    1741
1031190    1740
1030454    1595
1004294    1580
1007271    1537
1027290    1517
1009171    1502
1020435    1480
1003793    1474
1013333    1433
1010610    1378
1009466    1375
1013430    1373
1007640    1371
1028094    1328
1002708    1318
1031617    1306
1016736    1303
1008272    1286
1027831    1224
           ... 
1018393      28
1012414      28
1026168      27
1017196      26
1029864      26
1014239      26
1019299      26
1025645      25
1020497      25
1015666      23
1020115      21
1018233      20
1002208      20
1027369      20
1017984      19
1006122      19
1022905      18
1010074      18
1022979      18
1006395      18
1009665      17
1016810      17
1001510      17
1027946      16
1001508      15
1021496      15
1009474      14
1016210      14
1029677      14
1025352      11
Name: PID, Length: 3000,

In [14]:
test['PID'].value_counts()

1028094    1
1031483    1
1011017    1
1025352    1
1019207    1
1020862    1
1012596    1
1017152    1
1027391    1
1021246    1
1015727    1
1017146    1
1002396    1
1019193    1
1000349    1
1015095    1
1024687    1
1022014    1
1000756    1
1010995    1
1020404    1
1027377    1
1025354    1
1019211    1
1017164    1
1000782    1
1015145    1
1027431    1
1001979    1
1013090    1
          ..
1024615    1
1008229    1
1026660    1
1002262    1
1008225    1
1030750    1
1020539    1
1002387    1
1006234    1
1020543    1
1030808    1
1005360    1
1006230    1
1006228    1
1016467    1
1024733    1
1000079    1
1017093    1
1014782    1
1012363    1
1026698    1
1030792    1
1024647    1
1022598    1
1003511    1
1028739    1
1012788    1
1028737    1
1000043    1
1015808    1
Name: PID, Length: 3000, dtype: int64

In [11]:
train.head()

Unnamed: 0,PID,Date,Event,year,month
0,1028890,2011-01-01,2186,2011,1
1,1028890,2011-01-01,7087,2011,1
2,1028890,2011-01-01,4848,2011,1
3,1028890,2011-01-01,2214,2011,1
4,1028890,2011-02-01,7087,2011,2


In [10]:
train['Date']=train['Date'].astype(str)
from datetime import datetime
train['Date']=train['Date'].apply(lambda x: datetime(day=int(1), month=int(x[4:6]),year=int(x[0:4])))
train['year']=train['Date'].apply(lambda x: x.year)
train['month']=train['Date'].apply(lambda x: x.month)

ValueError: month must be in 1..12

In [17]:
grouped_df=train.groupby('PID')
ntrain=pd.DataFrame()
ntest=pd.DataFrame()
count=0
for row,group in grouped_df:
    if group.shape[0]<20:
        continue
    ntest=pd.concat([ntest,group[:][-10:]],axis=0)
    ntrain=pd.concat([ntrain,group[:][:-10]],axis=0)
    if count%1000==0:
        print(count)
    count+=1

0
1000
2000


In [18]:
ntest=ntest.sort_values(by='Date')
grouped_by=ntest.groupby("PID")
test_dict={}
for name,group in grouped_by:
    test_dict[name]=group['Event'].tolist()
nd=pd.DataFrame(columns=['PID','Event1','Event2','Event3','Event4','Event5','Event6','Event7','Event8','Event9','Event10'])
n=0
for i in (ntest['PID'].value_counts().index).tolist():
    nd.loc[n,'PID']=i
    nd.loc[n,'Event1']=test_dict[i][0]
    nd.loc[n,'Event2']=test_dict[i][1]
    nd.loc[n,'Event3']=test_dict[i][2]
    nd.loc[n,'Event4']=test_dict[i][3]
    nd.loc[n,'Event5']=test_dict[i][4]
    nd.loc[n,'Event6']=test_dict[i][5]
    nd.loc[n,'Event7']=test_dict[i][6]
    nd.loc[n,'Event8']=test_dict[i][7]
    nd.loc[n,'Event9']=test_dict[i][8]
    nd.loc[n,'Event10']=test_dict[i][9]
    n+=1
    if n%1000==0:
        print(n)

1000
2000


In [20]:
grouped_df=train.groupby('PID')
first_ten={}
for name,group in grouped_df:
    first_ten[name]=group['Event'].value_counts().index[0:10].tolist()
nd=pd.DataFrame(columns=['Event1','Event2','Event3','Event4','Event5','Event6','Event7','Event8','Event9','Event10'])
submit=pd.concat([test,nd],axis=1)
n=0
for i in test['PID']:
    submit.loc[n,'Event1']=first_ten[i][0]
    submit.loc[n,'Event2']=first_ten[i][1]
    submit.loc[n,'Event3']=first_ten[i][2]
    submit.loc[n,'Event4']=first_ten[i][3]
    submit.loc[n,'Event5']=first_ten[i][4]
    submit.loc[n,'Event6']=first_ten[i][5]
    submit.loc[n,'Event7']=first_ten[i][6]
    submit.loc[n,'Event8']=first_ten[i][7]
    submit.loc[n,'Event9']=first_ten[i][8]
    submit.loc[n,'Event10']=first_ten[i][9]
    n+=1
    if n%1000==0:
        print(n)

1000
2000
3000


In [22]:
# submit.to_csv("sub01.csv",index=False)

In [12]:
train['ordered_months']=train['month']
train['ordered_months'][train['year']==2012]=12+train['month'][train['year']==2012]
train['ordered_months'][train['year']==2013]=24+train['month'][train['year']==2013]

In [13]:
train.shape

(766787, 6)

In [14]:
train_data=train.copy()

In [204]:
train_data=pd.DataFrame()
grouped_by=train.groupby('PID')
for name,group in grouped_by:
    if group.shape[0]>1000:
        group=group[:][-1000:]
    train_data=pd.concat([train_data,group],axis=0)

In [205]:
train_data=train_data.reset_index()
#
np.power((train_data['ordered_months'])/8,3)

In [55]:
train_data['weightage']=np.exp(-(37-train_data['ordered_months'])/8)#+np.sin(train_data['month']+12)/24

In [56]:
# train_data['weightage']=np.log(-(37-train_data['ordered_months'])/8)+np.sin(train_data['month']+12)/24

In [57]:
sample=train_data.groupby(['PID','Event'])['weightage'].apply(lambda x: sum(x))

In [58]:
sample2=sample.unstack(fill_value=0)

In [59]:
sub09=sample2.apply(lambda x: pd.Series(x.sort_values(ascending=False).iloc[:10].index, index=['Event'+str(x) for x in range(1,11)]),axis=1).reset_index()

In [62]:
sub09.drop(['Event6','Event7','Event8','Event9','Event10'],axis=1,inplace=True)

In [64]:
sub09['Event6']=sub09['Event1']
sub09['Event7']=sub09['Event2']
sub09['Event8']=sub09['Event3']
sub09['Event9']=sub09['Event4']
sub09['Event10']=sub09['Event5']

In [65]:
sub09

Unnamed: 0,PID,Event1,Event2,Event3,Event4,Event5,Event6,Event7,Event8,Event9,Event10
0,1000001,2632,3074,3263,3680,2221,2632,3074,3263,3680,2221
1,1000011,8502,9928,8100,9921,8470,8502,9928,8100,9921,8470
2,1000019,9928,9920,7194,7682,8340,9928,9920,7194,7682,8340
3,1000025,2674,2635,3641,3419,3410,2674,2635,3641,3419,3410
4,1000029,2533,3638,2773,9921,4011,2533,3638,2773,9921,4011
5,1000043,9080,3092,3082,3591,9921,9080,3092,3082,3591,9921
6,1000068,2334,9928,2801,9476,4660,2334,9928,2801,9476,4660
7,1000075,1963,2777,3317,2809,9921,1963,2777,3317,2809,9921
8,1000079,9928,V586,4140,3320,4019,9928,V586,4140,3320,4019
9,1000082,3324,2627,3272,3729,7153,3324,2627,3272,3729,7153


In [66]:
sub09.to_csv("sub14.csv",index=False)

In [28]:
check=pd.read_csv("benchmark.csv")

In [29]:
import math
def ndcg_at_k(r,d, k):
#     r = np.asfarray(r)[:k]
    new_arr=[0]*10
    r=list(np.asfarray(r))
    d=list(np.asfarray(d))
    for i in range(10):
        if r[i]==d[i]:
            new_arr[i]=1
        elif r[i] in d:
            pos=d.index(r[i])
            new_arr[i]=min(((pos+1)/(i+1)),((i+1)/(pos+1)))
        else:
            new_arr[i]=0
    rel=np.asfarray(new_arr)
    result=0
    for i in range(10):
        result+= (pow(2,rel[i])-1)/(math.log((i+2),2))
    return result/4.54355

In [30]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le.fit(train['Event'])

LabelEncoder()

In [36]:
result=0
for i in check['PID']:
    for row in sub09[:][sub09['PID']==i].iterrows():
          index1,data1 = row
    for row in check[:][check['PID']==i].iterrows():
          index2,data2 = row        
    reference_list =data2.tolist()
    predicted_list = data1.tolist()
    predicted_list=predicted_list[1:]
    reference_list=reference_list[1:]
    predicted_list=le.transform(predicted_list)
    reference_list=le.transform(reference_list)
    result+=ndcg_at_k(predicted_list,reference_list,k=10)
print result/3000

0.0


In [None]:
#18 replaced by 9 is better - 0.690048
#18 replaced by 8 is better - 0.7426816
#18 replaced by 7 - 0.71394
#cube by 8 is -0.61
#cube by 8 + exponential gives -0.61
#cube by 8 + exponential gives + month - 0.63
#cube by 8 + exponential gives + sin(month)
#exponential+sin(month)-0.63