In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
pd.set_option('display.max_columns', 500)

In [2]:
data_path = 'data/coupon/'
df_train = pd.read_csv(data_path+'train_offline.csv')
df_test = pd.read_csv(data_path+'test_offline.csv') 

In [3]:
print(df_train.shape,df_test.shape)

(1160742, 7) (594142, 6)


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1160742 entries, 0 to 1160741
Data columns (total 7 columns):
User_id          1160742 non-null int64
Merchant_id      1160742 non-null int64
Coupon_id        746969 non-null float64
Discount_rate    746969 non-null object
Distance         1090916 non-null float64
Date_received    746969 non-null float64
Date             456709 non-null float64
dtypes: float64(4), int64(2), object(1)
memory usage: 62.0+ MB


In [5]:
df_train.describe()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Date
count,1160742.0,1160742.0,746969.0,1090916.0,746969.0,456709.0
mean,3690549.0,4103.626,7090.106124,2.726183,20160220.0,20160290.0
std,2123784.0,2364.912,4159.04235,3.687066,114.2962,111.74
min,4.0,1.0,4.0,0.0,20160100.0,20160100.0
25%,1845197.0,2146.0,3304.0,0.0,20160130.0,20160200.0
50%,3697362.0,3532.0,7610.0,1.0,20160200.0,20160320.0
75%,5532561.0,6412.0,10323.0,4.0,20160320.0,20160400.0
max,7361032.0,8856.0,14045.0,10.0,20160430.0,20160630.0


In [6]:
# 檢查 DataFrame 空缺值的狀態
def na_check(df_test):
    data_na = (df_test.isna().sum()/len(df_test))*100
    data_na = (data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False))
    missing_data = pd.DataFrame({'Missing Ratio' : data_na})
    display(missing_data.head(10))
na_check(df_test)
df_test.head(10)

Unnamed: 0,Missing Ratio
Date_received,48.44448
Discount_rate,48.44448
Coupon_id,48.44448
Distance,6.088948


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
2,1439408,2632,,,0.0,
3,1439408,2632,8591.0,20:1,0.0,20160516.0
4,2029232,450,1532.0,30:5,0.0,20160530.0
5,2029232,6459,12737.0,20:1,0.0,20160519.0
6,2029232,6459,,,0.0,
7,2029232,6459,,,0.0,
8,2747744,6901,1097.0,50:10,,20160606.0
9,196342,1579,,,1.0,


In [7]:
# 檢查欄位缺值數量 (去掉.head()可以顯示全部)
df_test.isnull().sum().sort_values(ascending=False).head()

Date_received    287829
Discount_rate    287829
Coupon_id        287829
Distance          36177
Merchant_id           0
dtype: int64

In [8]:
#當在最前方加上「~」符號，即表示為「不」
#通过~取反，选取不包含数字na的行
df_train = df_train[~df_train['Coupon_id'].isna()]
df_train.reset_index(drop=True, inplace=True)


df_test = df_test[~df_test['Coupon_id'].isna()]
df_test.reset_index(drop=True, inplace=True)
print(df_train.shape)
print(df_test.shape)
df_test.head(10)

(746969, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
2,1439408,2632,8591.0,20:1,0.0,20160516.0
3,2029232,450,1532.0,30:5,0.0,20160530.0
4,2029232,6459,12737.0,20:1,0.0,20160519.0
5,2747744,6901,1097.0,50:10,,20160606.0
6,196342,1579,10698.0,20:1,1.0,20160606.0
7,253750,6901,2366.0,30:5,0.0,20160518.0
8,343660,4663,11002.0,150:20,,20160528.0
9,1113008,3621,2705.0,20:5,0.0,20160524.0


「預測用戶在2016年6月領取優惠券後15天以內的使用情況」

意思是要預測用戶會不會「在六月份使用優惠券消費且在領取後15天之內」的意思嗎？

意即：

1.在六月份無消費：Label = False

2.在六月份有消費但沒使用優惠券：Label = False

3.在六月份有使用優惠券消費，但超過領取後15天：Label = False

4.在六月份有使用優惠券消費，且在領取後15天之內：Label = True


如果以上題意解讀是正確的

就代表test_offline.csv裡：

Date_received = NaN者可直接判定為False，因為他根本沒有優惠券
Date_received在2019-05-16之前者可直接判定為False，因為即使他在六月有用優惠券消費，但也不會在15天之內了 

In [9]:
DataTestTemp = df_test.copy()
DataTestTemp.loc[:, "User_id"] = DataTestTemp["User_id"].apply(lambda x:str(int(x)))
DataTestTemp.loc[:, "Coupon_id"] =DataTestTemp["Coupon_id"].apply(lambda x:str(int(x)))
DataTestTemp.loc[:, "Date_received"] = DataTestTemp["Date_received"].apply(lambda x:str(int(x)))
DataTestTemp["uid"] = DataTestTemp[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
uid = DataTestTemp['uid']

In [10]:
#建立target lable
target = []
for ind in df_train.index:
    if math.isnan(df_train.loc[ind, 'Date']):
        target.append(0.0)
    else:
        target.append(1.0)
TargetTrain = pd.Series({'CouponUsed':target})

In [11]:
try:
    df_train = df_train.drop(['Date'],axis=1)
except:
    df_train = df_train
    
DataSet = pd.concat([df_train, df_test],axis=0, ignore_index=True)
print("Shape of Data Set : ",DataSet.shape)
DataSet.sample(10)

Shape of Data Set :  (1053282, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
561053,1634346,2709,2840.0,100:10,10.0,20160127.0
995166,4867605,2436,12462.0,20:5,0.0,20160601.0
635593,4770354,8614,11571.0,0.95,2.0,20160102.0
528722,483131,7909,13333.0,100:10,0.0,20160129.0
1023232,4597938,3117,3343.0,20:1,0.0,20160605.0
413901,6812686,1569,5054.0,200:30,2.0,20160408.0
573292,2692738,5341,11539.0,30:5,0.0,20160206.0
723990,354721,4660,1480.0,100:10,10.0,20160125.0
24520,335992,2709,2840.0,100:10,6.0,20160126.0
226792,3401484,7011,8785.0,20:5,0.0,20160325.0


In [12]:
print(DataSet.shape)
len(TargetTrain.values[0])

(1053282, 6)


746969

In [13]:
for col in DataSet:
    print(col,' : ',DataSet[col].dtype)

User_id  :  int64
Merchant_id  :  int64
Coupon_id  :  float64
Discount_rate  :  object
Distance  :  float64
Date_received  :  float64


### 填補空缺值

In [14]:
DataSet['Distance'] = DataSet['Distance'].fillna(DataSet['Distance'].median())
DataSet['DiscountTemp'] = DataSet['Discount_rate'].map(lambda s:s.split(':'))

In [15]:
DataSet['Discount_off'] = DataSet['DiscountTemp'].map(lambda s: 1-float(s[0]) if len(s)==1 else float(s[1])/float(s[0]))
DataSet['Discount_type'] = DataSet['DiscountTemp'].map(lambda s: 'Any' if len(s)==1 else 'Full')
DataSet['Discount_Full'] = DataSet['DiscountTemp'].map(lambda s: 0.0 if len(s)==1 else float(s[0]))
DataSet = DataSet.drop(['DiscountTemp','Discount_rate'],axis=1)
DataSet.sample(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Discount_off,Discount_type,Discount_Full
156256,4507502,3381,1807.0,1.0,20160123.0,0.1,Full,300.0
103477,534204,4660,1480.0,8.0,20160126.0,0.1,Full,100.0
250242,378861,2099,12034.0,0.0,20160207.0,0.1,Full,100.0
207520,4908979,7019,3887.0,10.0,20160128.0,0.1,Full,100.0
56260,4159544,2099,12034.0,0.0,20160207.0,0.1,Full,100.0


In [16]:
print(np.where(DataSet.isna()))
DataSet.sample(5)

(array([], dtype=int64), array([], dtype=int64))


Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Discount_off,Discount_type,Discount_Full
891472,2842356,2709,13165.0,0.0,20160518.0,0.1,Full,100.0
15900,2466206,5341,11539.0,0.0,20160131.0,0.166667,Full,30.0
577092,156777,3381,1807.0,4.0,20160126.0,0.1,Full,300.0
831617,4914079,3621,2705.0,0.0,20160511.0,0.25,Full,20.0
427342,7257322,2099,12034.0,0.0,20160127.0,0.1,Full,100.0


In [17]:
# 時間特徵拆解
# DataSet['Month_received'] = DataSet['Date_received'].map(lambda m:int((m%10000)/100))
# DataSet['Month_received'] = DataSet['Month_received'].apply(lambda x:math.sin(x/3*math.pi))
# DataSet['Weeks_received'] = DataSet['Date_received'].map(lambda m: int(datetime.strptime(str(int(m)),'%Y%m%d').strftime("%W")))
# DataSet['Weeks_received'] = DataSet['Weeks_received'].apply(lambda x:math.sin(x/13*math.pi))
# DataSet['Day_received'] = DataSet['Date_received'].map(lambda m: int(m%100))
# DataSet['Day_received'] = DataSet['Day_received'].apply(lambda x:math.cos(x/31*math.pi))
DataSet['Weekday_received'] = DataSet['Date_received'].map(lambda m: int(datetime.strptime(str(int(m)),'%Y%m%d').weekday()))
DataSet['Weekday_received'] = DataSet['Weekday_received'].apply(lambda x:math.cos((x+1)/7*math.pi+math.pi))
# 星期數周期化

DataSet.sample(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Discount_off,Discount_type,Discount_Full,Weekday_received
746001,889431,6485,10323.0,0.0,20160325.0,0.25,Full,20.0,0.62349
825290,6771134,797,9362.0,1.0,20160612.0,0.166667,Full,30.0,1.0
559959,5468320,1041,13490.0,7.0,20160126.0,0.166667,Full,30.0,-0.62349
633619,5715474,6284,7379.0,10.0,20160125.0,0.2,Full,50.0,-0.900969
1035041,5048135,2709,13165.0,0.0,20160520.0,0.1,Full,100.0,0.62349


In [18]:
#折扣均值編碼
# meanMerchant = DataSet.groupby('Merchant_id')['Discount_off'].mean().reset_index()
# DataSet = pd.merge(DataSet,meanMerchant,how='left',on=['Merchant_id'])
# DataSet = DataSet.rename(columns={'Discount_off_x':'Discount_off', 'Discount_off_y':'Discount_MeanByMer'})

meanMerchant = DataSet.groupby('Merchant_id')['Distance'].mean().reset_index()
DataSet = pd.merge(DataSet,meanMerchant,how='left',on=['Merchant_id'])
DataSet = DataSet.rename(columns={'Distance_x':'Distance', 'Distance_y':'Distance_MeanByMer'})


meanCoupon = DataSet.groupby('Coupon_id')['Discount_off'].mean().reset_index()
DataSet = pd.merge(DataSet,meanCoupon,how='left',on=['Coupon_id'])
DataSet = DataSet.rename(columns={'Discount_off_x':'Discount_off', 'Discount_off_y':'Discount_off_MeanByCop'})


# meanDistance = DataSet.groupby('Distance')['Discount_Full'].mean().reset_index()
# DataSet = pd.merge(DataSet,meanDistance,how='left',on=['Distance'])
# DataSet = DataSet.rename(columns={'Discount_Full_x':'Discount_Full', 'Discount_Full_y':'Discount_Full_MeanByDist'})

DataSet.sample(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Distance,Date_received,Discount_off,Discount_type,Discount_Full,Weekday_received,Distance_MeanByMer,Discount_off_MeanByCop
281901,2153442,4833,7802.0,10.0,20160131.0,0.1,Full,200.0,1.0,6.957217,0.1
417950,6273106,7896,1430.0,8.0,20160414.0,0.05,Full,20.0,0.222521,4.75,0.05
27007,390649,6424,1315.0,0.0,20160129.0,0.1,Full,100.0,0.62349,4.232532,0.1
713250,5296331,3710,8680.0,0.0,20160126.0,0.1,Full,100.0,-0.62349,3.756123,0.1
208885,7264231,2709,2840.0,6.0,20160128.0,0.1,Full,100.0,0.222521,4.630286,0.1


In [19]:
DataSet = DataSet.drop(['User_id'], axis=1)
DataSet.head(5)

Unnamed: 0,Merchant_id,Coupon_id,Distance,Date_received,Discount_off,Discount_type,Discount_Full,Weekday_received,Distance_MeanByMer,Discount_off_MeanByCop
0,2632,8591.0,0.0,20160217.0,0.05,Full,20.0,-0.222521,1.372093,0.05
1,2632,1078.0,0.0,20160319.0,0.05,Full,20.0,0.900969,1.372093,0.05
2,3381,7610.0,0.0,20160429.0,0.1,Full,200.0,0.62349,2.887906,0.1
3,3381,11951.0,1.0,20160129.0,0.1,Full,200.0,0.62349,2.887906,0.1
4,3381,9776.0,2.0,20160129.0,0.5,Full,10.0,0.62349,2.887906,0.5


In [20]:
# DataSet['DivisionDoffDist'] = DataSet['Discount_off']/(DataSet['Distance']+1e-3) 
# DataSet.sample(5)

In [21]:
DataSet["Merchant_id_"] = DataSet["Merchant_id"].map(lambda i: np.log(i) if i > 0 else 0)
DataSet["Coupon_id_"] = DataSet["Coupon_id"].map(lambda i: np.log(i) if i > 0 else 0)
DataSet["Discount_Full_"] = DataSet["Discount_Full"].map(lambda i: np.log(i) if i > 0 else 0)

In [22]:
DataSet_dum = pd.get_dummies(DataSet)

In [23]:
print(DataSet_dum.shape)
DataSet_dum.sample(10)

(1053282, 14)


Unnamed: 0,Merchant_id,Coupon_id,Distance,Date_received,Discount_off,Discount_Full,Weekday_received,Distance_MeanByMer,Discount_off_MeanByCop,Merchant_id_,Coupon_id_,Discount_Full_,Discount_type_Any,Discount_type_Full
799620,1941,9860.0,2.0,20160522.0,0.166667,30.0,1.0,0.573034,0.166667,7.570959,9.196241,3.401197,0,1
383111,3381,11951.0,3.0,20160129.0,0.1,200.0,0.62349,2.887906,0.1,8.125927,9.38857,5.298317,0,1
416347,2146,11173.0,10.0,20160129.0,0.1,100.0,0.62349,4.500546,0.1,7.671361,9.321255,4.60517,0,1
349079,1569,5054.0,10.0,20160427.0,0.15,200.0,-0.222521,5.848542,0.15,7.358194,8.527935,5.298317,0,1
819042,760,2418.0,5.0,20160602.0,0.166667,30.0,0.222521,1.512584,0.166667,6.633318,7.790696,3.401197,0,1
285994,3679,8784.0,8.0,20160129.0,0.1,100.0,0.62349,5.009607,0.1,8.210396,9.080687,4.60517,0,1
755455,8856,5495.0,0.0,20160614.0,0.05,20.0,-0.62349,0.131579,0.05,9.08885,8.611594,2.995732,0,1
387631,7019,3887.0,0.0,20160131.0,0.1,100.0,1.0,4.964653,0.1,8.856376,8.265393,4.60517,0,1
220579,3381,9481.0,0.0,20160123.0,0.1,300.0,0.900969,2.887906,0.1,8.125927,9.157045,5.703782,0,1
63305,4660,1480.0,10.0,20160125.0,0.1,100.0,-0.900969,6.239715,0.1,8.446771,7.299797,4.60517,0,1


In [24]:
trainNumber = len(target)
TrainSet = DataSet_dum[:trainNumber]
TestSet = DataSet_dum[trainNumber:]

In [25]:
x_train, x_test, y_train, y_test = train_test_split(TrainSet,target, test_size=0.2, random_state=42)

normalizer = StandardScaler()

normalizer.fit(DataSet_dum)

x_train_n, x_test_n = normalizer.transform(x_train), normalizer.transform(x_test)

In [28]:
x_test_n

array([[ 1.03209921e+00,  1.72858845e+00, -1.21880039e-03, ...,
         6.46282416e-01, -1.80608106e-01,  1.80608106e-01],
       [ 9.61933938e-01,  1.35017930e-01,  2.70242505e-01, ...,
         3.49168451e-02, -1.80608106e-01,  1.80608106e-01],
       [-2.36241258e-01, -1.19982497e+00,  1.89901034e+00, ...,
         1.61527392e+00, -1.80608106e-01,  1.80608106e-01],
       ...,
       [ 1.02384447e+00,  7.74889678e-01, -8.15602717e-01, ...,
         3.49168451e-02, -1.80608106e-01,  1.80608106e-01],
       [ 1.48652253e+00, -1.62864205e+00, -2.72680106e-01, ...,
        -4.15638546e-01, -1.80608106e-01,  1.80608106e-01],
       [-2.36241258e-01,  1.90356895e-01, -8.15602717e-01, ...,
         1.25764799e+00, -1.80608106e-01,  1.80608106e-01]])

In [29]:
RFCF = RandomForestClassifier(n_estimators=30, max_depth=21, min_samples_leaf=5, min_samples_split=5, random_state=1)

x=RFCF.fit(x_train_n, y_train)

y_pred = x.predict_proba(x_test_n)[:,1]

In [30]:
acc = metrics.mean_squared_error(y_pred, y_test)

print("Accuracy : %.5f"%acc)

Accuracy : 0.04404


# Test Data Prediction

In [31]:
TrainSet_n = normalizer.transform(TrainSet)

TestSet_n = normalizer.transform(TestSet)

In [32]:
#查詢內含超參數
from pprint import pprint
print('Parameters currently in use:\n')
pprint(RFCF.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 21,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 30,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [33]:
import numpy as np
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 10)] #np.linspace 在指定的间隔内返回均匀间隔的数字。

max_depth = [int(x) for x in np.linspace(10, 150, num = 11)]
max_depth.append(None)

min_samples_leaf = [1, 2, 4, 9]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth,  min_samples_leaf=min_samples_leaf)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(RFCF, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)



Fitting 3 folds for each of 480 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
# 印出最佳結果與最佳參數
#print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
#grid_result.best_params_

In [None]:
# 使用最佳參數重新建立模型
#RFCF_bestparam= RandomForestClassifier(n_estimators=grid_result.best_params_['n_estimators'],  #引用best_params_['']
#                                          max_depth=grid_result.best_params_['max_depth'],
#                                          min_samples_leaf = grid_result.best_params_['min_samples_leaf'] ,random_state=42)
# 訓練模型
#RFCF_bestparam.fit(x_train, y_train)

# 預測測試集
#y_pred = RFCF_bestparam.predict(x_test)

In [34]:
 LRCF = LogisticRegression(tol=0.001, C=1.0, max_iter=1000, l1_ratio=0.5, penalty='l1')
 LRCF_proba = LRCF.fit(TrainSet_n, target)
 y_predLRCF = LRCF_proba.predict_proba(TestSet_n)[:,1]
 print("Logictic Regression Done")

 RFCF = RandomForestClassifier(n_estimators=300, max_depth=21, min_samples_leaf=9, min_samples_split=5, random_state=1)
 RFCF_proba = RFCF.fit(TrainSet_n, target)
 y_predRFCF = RFCF_proba.predict_proba(TestSet_n)[:,1]
 print("Random Forest Done")

 GBCF = GradientBoostingClassifier(n_estimators=150, subsample=0.8, min_samples_leaf=5, max_depth=7,  tol=0.001,verbose=1, learning_rate=0.01)
 GBCF_proba = GBCF.fit(TrainSet_n, target)
 y_predGBCF = GBCF_proba.predict_proba(TestSet_n)[:,1]
 print("Gradient Boosting Done")

  "(penalty={})".format(self.penalty))


Logictic Regression Done
Random Forest Done
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.4367           0.0028           31.65m
         2           0.4341           0.0026           31.42m
         3           0.4312           0.0025           31.05m
         4           0.4293           0.0025           31.06m
         5           0.4281           0.0022           30.78m
         6           0.4242           0.0023           30.62m
         7           0.4228           0.0021           30.52m
         8           0.4208           0.0020           30.31m
         9           0.4190           0.0019           30.12m
        10           0.4179           0.0019           29.85m
        20           0.4012           0.0014           27.56m
        30           0.3900           0.0011           25.33m
        40           0.3790           0.0009           23.16m
        50           0.3707           0.0007           21.02m
        60           0.36

In [37]:
y_pred = y_predGBCF*0.1 + y_predLRCF*0.05 + y_predRFCF*0.85
#y_pred = y_predRFCF*1.00

In [38]:
 uid_list = list(uid.values)
 output = pd.DataFrame({'uid':uid_list, 'label':y_pred})
 out = output.groupby("uid", as_index=False).mean()
 out.to_csv("outcome.csv",index=False)