In [1]:
import os
import numpy as np
import pandas as pd
import math
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

data_path = 'data/'

In [132]:
dfoff = pd.read_csv(os.path.join(data_path,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(data_path,'test_offline.csv'))
dfoff = dfoff[~dfoff.Coupon_id.isna()]
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(746969, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,
11,253750,8390,7531.0,20:5,0.0,20160327.0,


In [133]:
dftest.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
2,1439408,2632,8591.0,20:1,0.0,20160516.0
3,2029232,450,1532.0,30:5,0.0,20160530.0
4,2029232,6459,12737.0,20:1,0.0,20160519.0
5,2747744,6901,1097.0,50:10,,20160606.0
6,196342,1579,10698.0,20:1,1.0,20160606.0
7,253750,6901,2366.0,30:5,0.0,20160518.0
8,343660,4663,11002.0,150:20,,20160528.0
9,1113008,3621,2705.0,20:5,0.0,20160524.0


In [134]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

0    710665
1     36304
Name: label, dtype: int64

In [135]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [136]:
dfoff.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0


In [137]:
dfoff['User_id'].value_counts()

5054119    92
6641735    87
2839484    80
4917111    77
501441     73
2520933    69
1605133    60
2507268    57
6655171    52
2940145    45
2190887    45
5787896    42
2709037    40
2956333    38
1350500    37
5244936    36
2181092    35
649762     34
2591861    34
1583968    33
3925478    33
5291080    32
3118313    32
1485320    32
4119260    32
1836772    32
5603895    31
1579770    30
900738     29
2751537    29
           ..
1967161     1
6443109     1
857230      1
5088408     1
4438386     1
6128796     1
3949732     1
6935020     1
810151      1
4992169     1
1836204     1
717938      1
1725542     1
543781      1
2673132     1
2634794     1
6874160     1
6859833     1
748609      1
5983301     1
2480276     1
5977162     1
4281481     1
4920398     1
783442      1
2882643     1
771156      1
6020183     1
2858079     1
6089188     1
Name: User_id, Length: 392743, dtype: int64

In [138]:
dfoff['User_id'].value_counts().describe()

count    392743.000000
mean          1.901928
std           1.786997
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          92.000000
Name: User_id, dtype: float64

In [139]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [140]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

### 自己多加的特徵

#### 距離加上指數函數

In [141]:
dfoff['exp_Distance'] = np.exp(dfoff['Distance'])
dftest['exp_Distance'] = np.exp(dftest['Distance'])

#### 距離的折抵率次方，代表即使距離近，也會因折抵率有所影響

In [142]:
dfoff['Distance^Discount_rate'] = dfoff['Distance']**dfoff['discount_rate']
dftest['Distance^Discount_rate'] = dftest['Distance']**dftest['discount_rate']

#### 以Discount_rate為基準來跟User_id拿到的情況做編碼

In [143]:
# 加上 'Cabin' 欄位的計數編碼
count_dfoff = dfoff.groupby(['Discount_rate'])['User_id'].agg({'Discount_rate_Count':'size'}).reset_index()
dfoff = pd.merge(dfoff, count_dfoff, on=['Discount_rate'], how='left')
count_dfoff.sort_values(by=['Discount_rate_Count'], ascending=False).head(5)

is deprecated and will be removed in a future version
  


Unnamed: 0,Discount_rate,Discount_rate_Count
38,30:5,172567
10,100:10,166693
24,200:20,106875
30,20:5,47990
43,50:5,36809


In [144]:
count_dftest= dftest.groupby(['Discount_rate'])['User_id'].agg({'Discount_rate_Count':'size'}).reset_index()
dftest = pd.merge(dftest, count_dftest, on=['Discount_rate'], how='left')
count_dftest.sort_values(by=['Discount_rate_Count'], ascending=False).head(5)

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


Unnamed: 0,Discount_rate,Discount_rate_Count
37,30:5,98145
29,20:5,43023
27,20:1,36719
9,100:10,15861
17,150:20,13026


In [145]:
dftest.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekday_type,weekday_1,weekday_2,...,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,exp_Distance,Distance^Discount_rate,Discount_rate_Count
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,0,0,0,...,0,1,0,0.866667,150,20,1,2.718282,1.0,13026
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0,1,0,...,0,0,0,0.95,20,1,1,1.0,0.0,36719
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0,1,0,...,0,0,0,0.95,20,1,1,1.0,0.0,36719
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0,1,0,...,0,0,0,0.833333,30,5,1,1.0,0.0,98145
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0,0,0,...,0,0,0,0.95,20,1,1,1.0,0.0,36719


In [146]:
dfoff.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,exp_Distance,Distance^Discount_rate,Discount_rate_Count
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,...,0,0,0,0.95,20,1,1,1.0,0.0,14986
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,0,...,0,1,0,0.95,20,1,1,1.0,0.0,14986
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0,...,1,0,0,0.9,200,20,1,1.0,0.0,106875
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,...,1,0,0,0.9,200,20,1,2.718282,1.0,106875
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,...,1,0,0,0.5,10,5,1,7.389056,1.414214,20085


### 自己多加的特徵

In [147]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [148]:
train.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'exp_Distance',
       'Distance^Discount_rate', 'Discount_rate_Count', 'is_train'],
      dtype='object')

In [149]:
len(train.columns)

25

In [150]:
valid.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'exp_Distance',
       'Distance^Discount_rate', 'Discount_rate_Count', 'is_train'],
      dtype='object')

In [151]:
len(valid.columns)

25

In [152]:
len(dftest.columns)

22

In [153]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type',
                    'exp_Distance', 
                    'Distance^Discount_rate',
                    'Discount_rate_Count'
                    ]+ weekdaycols
print(len(original_feature),original_feature)

17 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'exp_Distance', 'Distance^Discount_rate', 'Discount_rate_Count', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [154]:
predictors = original_feature
print(predictors)

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'exp_Distance', 'Distance^Discount_rate', 'Discount_rate_Count', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [155]:
def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

In [156]:
train

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,exp_Distance,Distance^Discount_rate,Discount_rate_Count,is_train
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,...,0,0,0.950000,20,1,1,1.000000e+00,0.000000,14986,True
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,0,...,1,0,0.950000,20,1,1,1.000000e+00,0.000000,14986,True
2,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,...,0,0,0.900000,200,20,1,2.718282e+00,1.000000,106875,True
3,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,...,0,0,0.500000,10,5,1,7.389056e+00,1.414214,20085,True
4,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7,0,...,0,1,0.900000,100,10,1,9.889030e+42,62.527588,166693,True
5,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6,0,...,1,0,0.900000,200,20,1,2.202647e+04,7.943282,106875,True
6,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2,0,...,0,0,0.900000,200,20,1,7.389056e+00,1.866066,106875,True
7,253750,8390,7531.0,20:5,0.0,20160327.0,,0,7,0,...,0,1,0.750000,20,5,1,1.000000e+00,0.000000,47990,True
8,376492,1041,13490.0,30:5,2.0,20160127.0,,0,3,0,...,0,0,0.833333,30,5,1,7.389056e+00,1.781797,172567,True
9,1964720,7884,6704.0,20:1,10.0,20160215.0,,0,1,0,...,0,0,0.950000,20,1,1,2.202647e+04,8.912509,14986,True


In [157]:
len(train.columns)

25

In [158]:
len(dftest.columns)

22

In [159]:
len(predictors)

17

In [160]:
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   13.6s finished


In [161]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

In [162]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.784, Accuracy: 0.952


In [163]:
predictors

['discount_rate',
 'discount_type',
 'discount_man',
 'discount_jian',
 'Distance',
 'weekday',
 'weekday_type',
 'exp_Distance',
 'Distance^Discount_rate',
 'Discount_rate_Count',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6',
 'weekday_7']

In [164]:
targetset.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7',
       'discount_rate', 'discount_man', 'discount_jian', 'discount_type',
       'exp_Distance', 'Distance^Discount_rate', 'Discount_rate_Count_x',
       'Distance_Count', 'Distance_discount_type', 'Discount_rate_Count_y'],
      dtype='object')

In [165]:
dftest.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7',
       'discount_rate', 'discount_man', 'discount_jian', 'discount_type',
       'exp_Distance', 'Distance^Discount_rate', 'Discount_rate_Count'],
      dtype='object')

In [166]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

(306313, 22)


In [167]:
testset[predictors]

Unnamed: 0,discount_rate,discount_type,discount_man,discount_jian,Distance,weekday,weekday_type,exp_Distance,Distance^Discount_rate,Discount_rate_Count,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,0.866667,1,150,20,1.0,6,0,2.718282e+00,1.000000,13026,0,0,0,0,0,1,0
1,0.950000,1,20,1,0.0,1,0,1.000000e+00,0.000000,36719,1,0,0,0,0,0,0
2,0.950000,1,20,1,0.0,1,0,1.000000e+00,0.000000,36719,1,0,0,0,0,0,0
3,0.833333,1,30,5,0.0,1,0,1.000000e+00,0.000000,98145,1,0,0,0,0,0,0
4,0.950000,1,20,1,0.0,4,0,1.000000e+00,0.000000,36719,0,0,0,1,0,0,0
5,0.800000,1,50,10,99.0,1,0,9.889030e+42,39.491912,2908,1,0,0,0,0,0,0
6,0.950000,1,20,1,1.0,1,0,2.718282e+00,1.000000,36719,1,0,0,0,0,0,0
7,0.833333,1,30,5,0.0,3,0,1.000000e+00,0.000000,98145,0,0,1,0,0,0,0
8,0.866667,1,150,20,99.0,6,0,9.889030e+42,53.647625,13026,0,0,0,0,0,1,0
9,0.750000,1,20,5,0.0,2,0,1.000000e+00,0.000000,43023,0,1,0,0,0,0,0


In [168]:
testset.columns

Index(['discount_rate', 'discount_type', 'discount_man', 'discount_jian',
       'Distance', 'weekday', 'weekday_type', 'exp_Distance',
       'Distance^Discount_rate', 'Discount_rate_Count', 'weekday_1',
       'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6',
       'weekday_7'],
      dtype='object')

In [169]:
len(testset.columns)

17

In [170]:
y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 18)


In [171]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [172]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
# out.to_csv("baseline_example.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.135023
1,1000020_8192_20160513,0.10937
2,1000065_1455_20160527,0.100243
3,1000085_8067_20160513,0.083289
4,1000086_2418_20160613,0.070221


In [173]:
output.shape

(306313, 5)

In [174]:
output.head()

Unnamed: 0,User_id,Coupon_id,Date_received,pred_prob,uid
0,1439408,11002,20160528,0.019307,1439408_11002_20160528
1,1439408,8591,20160613,0.088933,1439408_8591_20160613
2,1439408,8591,20160516,0.088933,1439408_8591_20160516
3,2029232,1532,20160530,0.070221,2029232_1532_20160530
4,2029232,12737,20160519,0.129886,2029232_12737_20160519


In [175]:
output_ = output.drop(columns=['User_id', 'Coupon_id', 'Date_received']).rename(columns={'pred_prob':'label'}).reindex(columns=['uid','label'])

In [176]:
output_.shape

(306313, 2)

In [177]:
a=output_['uid'].unique().shape

In [178]:
output_.to_csv('Result.csv',index=False)

In [179]:
output_ = output_.drop_duplicates(keep='first', inplace=False)  # 刪除重複

In [180]:
output_.to_csv('Result.csv',index=False)