In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score,auc,confusion_matrix,f1_score, \
    precision_score,recall_score,roc_curve
from imblearn.over_sampling import SMOTE
import prettytable
from datetime import date
import matplotlib.pyplot as plt

In [2]:
# Raw competition files, read in this order: offline training log,
# offline test set, online training log.
dfoff, dftest, dfon = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/ccf_offline_stage1_train.csv',
        './data/ccf_offline_stage1_test_revised.csv',
        './data/ccf_online_stage1_train.csv',
    )
)

# One-row frame (index label 1) mapping each column name to its Chinese
# description -- presumably a template for the submission file; confirm.
submission_columns = {
    'User_id': '用户ID',
    'Coupon_id': '优惠券ID',
    'Date_received': '领取优惠券日期',
    'Probability': '预测值',
}
dfsub = pd.DataFrame(submission_columns, index=[1])

# 只用线下数据，不做特征，找几个基本的模型，然后提交

## 1、预处理
正负样本标签(优惠券消费1，普通消费-1，领券不消费0)  
过采样  
训练和测试子集划分   
还是要处理一下特征，优惠力度、消费日期等    

### 1.1、打标签并且划出优惠券相关集
这个子集要怎么划分呢？题目是预测领取优惠券后是否核销，也就是说全部是1（用券消费）和0（有券不消费），不存在-1（普通消费）  
那么训练子集也要排除 -1（只保留标签为 1 和 0 的样本）

In [3]:
#def get_label_off(dfoff):
#    """给样本打标签"""
#    # 拿券，消费
#    dfoff.loc[(dfoff.Coupon_id.notna()) & (dfoff.Date.notna()),'label'] = 1
#    # 拿券，不消费
#    dfoff.loc[(dfoff.Coupon_id.notna()) & (dfoff.Date.isna()),'label'] = 0
#    #不拿券，消费
#    dfoff.loc[(dfoff.Coupon_id.isna()) & (dfoff.Date.notna()),'label'] = -1
#    #不拿券，不消费
#    dfoff.loc[(dfoff.Coupon_id.isna()) & (dfoff.Date.isna()),'label'] = 'error'
#    return dfoff

In [4]:
##打标签
#dfoff = get_label_off(dfoff)
##划分子集
#dfall = dfoff[dfoff.label!=-1]

In [5]:
#dfoff.label.value_counts()
##看起来，整体核销率就是，5%左右

### 1.2、处理特征


#### 1.2.1 Discount_rate
1. 将满 xx 减 yy 类型（`xx:yy`）的券转换成折扣率 `1 - yy/xx`，同时建立折扣相关的特征 `discount_rate`、`discount_man`、`discount_jian`、`discount_type`
2. 将距离 `Distance` 的缺失值填充为 2，并转为 `int`

即：转换 `Discount_rate` 和 `Distance` 两列（convert Discount_rate and Distance）。

In [6]:

def getDiscountType(row):
    """Classify one raw ``Discount_rate`` value by coupon type.

    Args:
        row: A single ``Discount_rate`` cell: a string such as ``'150:20'``
            or ``'0.95'``, a bare number, or a missing value.

    Returns:
        ``np.nan`` when the value is missing, ``1`` when its text contains
        ``':'`` (the ``xx:yy`` "spend xx, save yy" form), otherwise ``0``.
    """
    if pd.isnull(row):
        return np.nan
    # Go through str() like the sibling helpers do, so a numeric cell
    # (e.g. 0.95 parsed as float) is classified instead of raising TypeError.
    return 1 if ':' in str(row) else 0

def convertRate(row):
    """Turn a raw ``Discount_rate`` value into a numeric discount rate.

    A missing value maps to ``1.0``; an ``'xx:yy'`` value maps to
    ``1 - yy/xx``; anything else is converted with ``float``.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    parts = row.split(':')
    reduction = float(parts[1])
    threshold = float(parts[0])
    return 1.0 - reduction / threshold

def getDiscountMan(row):
    """Return the spending threshold ``xx`` of an ``'xx:yy'`` coupon as an int.

    Values whose text has no ``':'`` (including missing values) give ``0``.
    """
    if ':' not in str(row):
        return 0
    threshold_text = row.split(':')[0]
    return int(threshold_text)

def getDiscountJian(row):
    """Return the reduction amount ``yy`` of an ``'xx:yy'`` coupon as an int.

    Values whose text has no ``':'`` (including missing values) give ``0``.
    """
    if ':' not in str(row):
        return 0
    reduction_text = row.split(':')[1]
    return int(reduction_text)

def processData(df, distance_fill=2):
    """Add the discount features to ``df`` and clean its ``Distance`` column.

    The frame is modified in place and the same object is returned, so
    ``df = processData(df)`` and a bare ``processData(df)`` are equivalent.

    Args:
        df: Frame with a ``Discount_rate`` column and a ``Distance`` column.
        distance_fill: Value substituted for missing ``Distance`` entries
            before the column is cast to ``int``. Defaults to ``2``, the
            value this notebook has always used.

    Returns:
        ``df`` with ``discount_rate``, ``discount_man``, ``discount_jian``
        and ``discount_type`` added and ``Distance`` as a gap-free int column.
    """
    raw_discount = df['Discount_rate']
    # Plain column assignment replaces the column wholesale, so re-running
    # the cell on an already processed frame is safe.
    df['discount_rate'] = raw_discount.apply(convertRate)
    df['discount_man'] = raw_discount.apply(getDiscountMan)
    df['discount_jian'] = raw_discount.apply(getDiscountJian)
    df['discount_type'] = raw_discount.apply(getDiscountType)
    # astype(int) cannot represent NaN, hence the fill first.
    df['Distance'] = df['Distance'].fillna(distance_fill).astype(int)
    return df

In [7]:
# Add the discount features and fill/cast Distance on the offline training
# frame (processData mutates the frame and returns that same object).
dfoff = processData(dfoff)

In [8]:
# Apply the identical feature processing to the test frame so that both
# frames carry the same derived columns.
dftest = processData(dftest)

#### 1.2.2 领券以及核销日期

In [6]:
#date_received = dfoff['Date_received'].unique()
#date_received = sorted(date_received[pd.notnull(date_received)])

#date_buy = dfoff['Date'].unique()
#date_buy = sorted(date_buy[pd.notnull(date_buy)])
#date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])


In [10]:
#couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
#couponbydate.columns = ['Date_received','count']
#buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
#buybydate.columns = ['Date_received','count']

In [11]:
#这俩不是一样的吗

In [12]:
##统计不同领券日期，各自核销的订单数
#couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
#couponbydate.columns = ['Date_received','count']

In [13]:
#buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
## 优惠券消费
#buybydate.columns = ['Date_received','count']

#### 1.2.3 周末

In [9]:
def getWeekday(row):
    """Map a ``YYYYMMDD...`` string to its weekday number, Monday=1 .. Sunday=7.

    The literal string ``'nan'`` (what ``astype(str)`` yields for a missing
    value) maps to ``np.nan``. Only the first eight characters are read.
    """
    if row == 'nan':
        return np.nan
    year, month, day = (int(row[lo:hi]) for lo, hi in ((0, 4), (4, 6), (6, 8)))
    # date.weekday() is 0-based with Monday=0; shift to 1-based.
    return date(year, month, day).weekday() + 1

# Names of the seven one-hot columns: weekday_1 (Monday) .. weekday_7 (Sunday).
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]


def add_weekday_features(df):
    """Add the weekday-derived columns to ``df`` in place and return it.

    Columns written:
        weekday: Weekday of ``Date_received`` (1=Monday .. 7=Sunday), NaN
            when no coupon was received.
        weekday_type: 1 for Saturday/Sunday, 0 otherwise (NaN counts as 0).
        weekday_1 .. weekday_7: One-hot encoding of ``weekday``; all zeros
            for rows without a coupon.
    """
    df['weekday'] = df['Date_received'].astype(str).apply(getWeekday)
    df['weekday_type'] = df['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
    # Build each indicator explicitly instead of renaming get_dummies output:
    # all seven columns always exist (even if some weekday never occurs in
    # the frame) and are plain 0/1 integers on every pandas version.
    for day_number, column in enumerate(weekdaycols, start=1):
        df[column] = (df['weekday'] == day_number).astype(int)
    return df


# Same feature construction for the training and the test frame.
dfoff = add_weekday_features(dfoff)
dftest = add_weekday_features(dftest)


def _parse_yyyymmdd(value):
    """Parse a YYYYMMDD value given as int, float (20160528.0) or string."""
    # Columns containing NaN are read as float, so normalise to an 8-digit
    # string first; passing a float straight to a strptime-style format is
    # not accepted by every pandas version.
    return pd.to_datetime(str(int(float(value))), format='%Y%m%d')


def label(row):
    """Compute the training label for one offline record.

    Args:
        row: Mapping/Series with ``Date_received`` and ``Date`` entries in
            YYYYMMDD form (missing values allowed).

    Returns:
        ``-1`` when no coupon was received; ``1`` when the coupon was used
        within 15 days of receipt; ``0`` otherwise (coupon never used, or
        used more than 15 days after receipt).
    """
    received = row['Date_received']
    if pd.isnull(received):
        return -1
    used = row['Date']
    if pd.isnull(used):
        return 0
    elapsed = _parse_yyyymmdd(used) - _parse_yyyymmdd(received)
    return 1 if elapsed <= pd.Timedelta(15, 'D') else 0

# Label every offline row with -1 / 1 / 0 (see label()); axis=1 makes apply
# call label once per row.
dfoff['label'] = dfoff.apply(label, axis = 1)


### 1.3 分割数据集

In [10]:
# Keep only coupon rows (label 1 or 0); -1 marks purchases without a coupon.
dfall = dfoff[dfoff['label'] != -1].copy()
# Assign the filled column back: an in-place fillna on an attribute-accessed
# column is chained assignment, which is deprecated in pandas 2.x and has no
# effect once copy-on-write is enabled.
dfall['Date'] = dfall['Date'].fillna(0)

#### 1.3.1 过采样
1、Discount_rate无法识别（150：20）  
2、不能有nan( Distance Date_received )  
不能有nan,但是Date_received必须要保留nan  


In [11]:
# Oversample the minority class with SMOTE. The raw Discount_rate strings
# (e.g. '150:20') are dropped because SMOTE needs numeric, NaN-free input.
# fit_resample replaces fit_sample, which recent imbalanced-learn releases no
# longer provide; the fixed seed makes the synthetic rows reproducible.
# NOTE(review): resampling happens before the train/valid split and also
# interpolates the ID and date columns, so synthetic rows land in the
# validation window and validation scores are likely optimistic -- consider
# resampling only the training features after splitting.
model_smote = SMOTE(random_state=42)
dfall_X, dfall_y = model_smote.fit_resample(dfall.drop(['label','Discount_rate'],axis = 1),dfall.label)
dfall = pd.concat([dfall_X,dfall_y],axis=1)

In [12]:
# Time-based hold-out on the coupon receipt date (YYYYMMDD as a number):
# everything before 2016-05-16 trains, 2016-05-16 .. 2016-06-15 validates.
received_day = dfall['Date_received']
in_train_window = received_day < 20160516
in_valid_window = (received_day >= 20160516) & (received_day <= 20160615)
train = dfall[in_train_window].copy()
valid = dfall[in_valid_window].copy()

In [13]:
# Model inputs: the discount features, distance, the weekday number and
# weekend flag, plus the seven one-hot weekday columns.
# (Smaller alternative tried earlier:
#  ['discount_rate','discount_man', 'discount_jian','Distance','weekday_type'])
base_features = [
    'discount_rate',
    'discount_type',
    'discount_man',
    'discount_jian',
    'Distance',
    'weekday',
    'weekday_type',
]
original_feature = base_features + weekdaycols

x_train = train[original_feature]
y_train = train['label']
x_test = valid[original_feature]
y_test = valid['label']

### 1.4 模型以及得分

In [14]:
def get_model_scores(model,x_test,y_test,pre_y,show_confusion=False):
    """Print the core binary-classification metrics and return the ROC AUC.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        x_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True labels for ``x_test``.
        pre_y: Hard class predictions for ``x_test``; used for accuracy,
            precision, recall, F1 and the confusion matrix.
        show_confusion: When true, also print the confusion matrix as a
            table. Defaults to False, which prints only the metrics table.

    Returns:
        Area under the ROC curve, computed from the second column of
        ``model.predict_proba(x_test)``.
    """
    if show_confusion:
        # For binary labels sklearn lays the matrix out as
        # [[tn, fp], [fn, tp]]: rows are actual classes, columns predicted.
        tn, fp, fn, tp = confusion_matrix(y_test, pre_y).ravel()
        confusion_matrix_table = prettytable.PrettyTable(['','prediction-0','prediction-1'])
        confusion_matrix_table.add_row(['actual-0',tn,fp])
        confusion_matrix_table.add_row(['actual-1',fn,tp])
        print('confusion matrix \n',confusion_matrix_table)

    # Ranking metric from the predicted probability of the second class.
    y_score = model.predict_proba(x_test)
    fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
    auc_s = auc(fpr, tpr)

    # Threshold metrics from the hard predictions, rounded for display;
    # the AUC goes first and is left unrounded.
    threshold_metrics = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [round(metric(y_test, pre_y), 3) for metric in threshold_metrics]
    scores.insert(0,auc_s)

    core_metrics = prettytable.PrettyTable()
    core_metrics.field_names = ['auc', 'accuracy', 'precision', 'recall', 'f1']
    core_metrics.add_row(scores)
    print('core metrics\n',core_metrics)
    return auc_s

In [38]:
# Model 1: logistic regression fitted by SGD with an elastic-net penalty,
# trained on x_train / y_train (the original_feature columns).
print("----train-----")
sgd_params = dict(
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha=0.01,
    l1_ratio=0.01,
    n_jobs=1,
    class_weight=None,
)
model = SGDClassifier(**sgd_params)
model.fit(x_train, y_train)
print("---pred------")

# Predict on the validation window and report the metrics.
pre_y = model.predict(x_test)
get_model_scores(model,x_test,y_test,pre_y)

----train-----
---pred------
core metrics
 +-------------------+----------+-----------+--------+-------+
|        auc        | accuracy | precision | recall |   f1  |
+-------------------+----------+-----------+--------+-------+
| 0.861600576925673 |  0.696   |   0.649   | 0.881  | 0.748 |
+-------------------+----------+-----------+--------+-------+


0.861600576925673

In [28]:
#for i in range(80,121,10):
#    model = SGDClassifier(loss='log',penalty='elasticnet',fit_intercept=True,max_iter=i,shuffle=True,\
#                          alpha = 0.01,l1_ratio = 0.01,n_jobs=1,class_weight=None)
#    model.fit(x_train, y_train)
#    pre_y = model.predict(x_test)
#    cc = get_model_scores(model,x_test,y_test,pre_y)
#    print('-----',i,cc)

In [32]:
# Model 2: XGBoost binary classifier.
# (Default-parameter alternative: xgb.XGBClassifier())
param_dist = dict(
    objective='binary:logistic',
    n_estimators=17,
    subsample=0.8,
    max_depth=13,
    n_jobs=-1,
)
model_xgb = xgb.XGBClassifier(**param_dist)
model_xgb.fit(x_train, y_train)

# x_test is valid[original_feature], i.e. the validation-window features.
pre_y = model_xgb.predict(x_test)
get_model_scores(model_xgb,x_test,y_test,pre_y)

core metrics
 +--------------------+----------+-----------+--------+-------+
|        auc         | accuracy | precision | recall |   f1  |
+--------------------+----------+-----------+--------+-------+
| 0.9413959007668731 |  0.899   |   0.946   | 0.851  | 0.896 |
+--------------------+----------+-----------+--------+-------+


0.9413959007668731

In [44]:
# 先这样调参吧，非常简单的特征

In [45]:
# Search grid for the XGBoost sweep; subsample is held at a single value.
subsample_list = [0.8]
n_estimators_list = [10, 13, 15, 17]
max_depth_list = [9, 11, 13, 15]

In [46]:
# Parallel result lists, one entry per grid point:
# x = subsample, y = n_estimators, z = max_depth, sc = validation AUC.
x, y, z, sc = [], [], [], []
grid = [
    (sub, est, dep)
    for sub in subsample_list
    for est in n_estimators_list
    for dep in max_depth_list
]
for sub, est, dep in grid:
    param_dist = dict(
        objective='binary:logistic',
        n_estimators=est,
        subsample=sub,
        max_depth=dep,
        n_jobs=-1,
    )
    # model_xgb is rebound on every pass, so after the loop it holds the
    # model fitted for the last grid point.
    model_xgb = xgb.XGBClassifier(**param_dist)
    model_xgb.fit(x_train, y_train)

    pre_y = model_xgb.predict(x_test)
    tsc = get_model_scores(model_xgb,x_test,y_test,pre_y)

    for column, value in zip((x, y, z, sc), (sub, est, dep, tsc)):
        column.append(value)
    print(sub,est,dep,tsc)

core metrics
 +--------------------+----------+-----------+--------+-------+
|        auc         | accuracy | precision | recall |   f1  |
+--------------------+----------+-----------+--------+-------+
| 0.9397281188861489 |  0.897   |   0.943   | 0.849  | 0.894 |
+--------------------+----------+-----------+--------+-------+
0.8 10 9 0.9397281188861489
core metrics
 +--------------------+----------+-----------+--------+-------+
|        auc         | accuracy | precision | recall |   f1  |
+--------------------+----------+-----------+--------+-------+
| 0.9414163268385572 |   0.9    |   0.946   | 0.853  | 0.897 |
+--------------------+----------+-----------+--------+-------+
0.8 10 11 0.9414163268385572
core metrics
 +--------------------+----------+-----------+--------+-------+
|        auc         | accuracy | precision | recall |   f1  |
+--------------------+----------+-----------+--------+-------+
| 0.9411118653137099 |  0.901   |   0.947   | 0.852  | 0.897 |
+------------------

In [47]:
# One row per grid point (columns 0..3 = subsample, n_estimators, max_depth,
# AUC), sorted by AUC ascending so the best setting is shown last.
pd.DataFrame([x, y, z, sc]).transpose().sort_values(by=3)

Unnamed: 0,0,1,2,3
0,0.8,10.0,9.0,0.939728
2,0.8,10.0,13.0,0.941112
3,0.8,10.0,15.0,0.941305
1,0.8,10.0,11.0,0.941416
4,0.8,13.0,9.0,0.941591
7,0.8,13.0,15.0,0.941838
8,0.8,15.0,9.0,0.941886
5,0.8,13.0,11.0,0.942038
6,0.8,13.0,13.0,0.942218
9,0.8,15.0,11.0,0.942329


In [51]:
# Average Distance in the training features.
x_train['Distance'].mean()

1.977877196352052

In [39]:
# Refit `model` on every resampled row (training and validation windows
# together) before predicting on the test set.
model.fit(dfall[original_feature], dfall['label'])

SGDClassifier(alpha=0.01, l1_ratio=0.01, loss='log', max_iter=100, n_jobs=1,
              penalty='elasticnet')

In [41]:
# Good enough for today -- submit first.

# Score the test set with the current model and write the submission file
# (no header, no index). The 'label' column holds the probability taken from
# the second predict_proba column.
final_model = model
y_test_pred = final_model.predict_proba(dftest[original_feature])
id_columns = ['User_id','Coupon_id','Date_received']
dftest1 = dftest[id_columns].copy()
dftest1['label'] = y_test_pred[:, 1]
dftest1.to_csv('submit_m1_all.csv', index=False, header=False)
dftest1.head()

Unnamed: 0,User_id,Coupon_id,Date_received,label
0,4129537,9983,20160712,0.55164
1,6949378,3429,20160706,0.471232
2,2166529,6928,20160727,0.066245
3,2166529,1808,20160727,0.164736
4,6172162,6500,20160708,0.435161


In [43]:
# Class probabilities on the test set from both models, for blending.
test_features = dftest[original_feature]
y_test_pred_1 = model.predict_proba(test_features)
y_test_pred_2 = model_xgb.predict_proba(test_features)

array([[0.4483597 , 0.5516403 ],
       [0.52876836, 0.47123164],
       [0.93375519, 0.06624481],
       ...,
       [0.56710119, 0.43289881],
       [0.53773179, 0.46226821],
       [0.70233485, 0.29766515]])

In [74]:
# Two columns of second-class probabilities: 0 = `model`, 1 = `model_xgb`.
dfscores = pd.DataFrame([y_test_pred_1[:, 1], y_test_pred_2[:, 1]]).transpose()

In [77]:
def ls(x):
    """Return the larger of ``x[0]`` and ``x[1]``.

    ``x[0]`` is returned only when it is strictly greater than ``x[1]``;
    in every other case (including ties) ``x[1]`` is returned.
    """
    # A conditional expression avoids shadowing the builtins max/min and the
    # unused local the previous version assigned.
    return x[0] if x[0] > x[1] else x[1]

def ls2(x):
    """Return the smaller of ``x[0]`` and ``x[1]``.

    ``x[1]`` is returned when ``x[0]`` is strictly greater than ``x[1]``;
    in every other case (including ties) ``x[0]`` is returned.
    """
    # A conditional expression avoids shadowing the builtins max/min and the
    # unused local the previous version assigned.
    return x[1] if x[0] > x[1] else x[0]

# Row-wise larger / smaller of the two models' probabilities (columns 0, 1).
dfscores['max'] = dfscores.apply(ls, axis=1)
dfscores['min'] = dfscores.apply(ls2, axis=1)

In [83]:
# Blend submission: for each test row use the smaller of the two models'
# probabilities, written without header or index.
y_test_pred = dfscores['min']
id_columns = ['User_id','Coupon_id','Date_received']
dftest1 = dftest[id_columns].copy()
dftest1['label'] = y_test_pred
dftest1.to_csv('submit_m12_all_min.csv', index=False, header=False)
dftest1.head()

Unnamed: 0,User_id,Coupon_id,Date_received,label
0,4129537,9983,20160712,0.142272
1,6949378,3429,20160706,0.121859
2,2166529,6928,20160727,0.008821
3,2166529,1808,20160727,0.021716
4,6172162,6500,20160708,0.129874


### 1.3 过采样
正样本比例过少，需要过采样

In [None]:
# Class balance of the label column, then a positional split of dfall:
# every column but the last becomes X, the last column becomes y.
dfall['label'].value_counts()
X, y = dfall.iloc[:, :-1], dfall.iloc[:, -1]

In [None]:
# Unfitted SMOTE oversampler; the resampling call below is left disabled.
model_smote = SMOTE()
#X,y = model_smote.fit_sample(X,y)

# 记录
0.5514 m1_half  
0.5493 m2_half  
0.5536 m2_all  
0.5553 m1_all  
0.5536 submit_m12_all_max.csv  
0.5542 submit_m12_all_min.csv