In [None]:
# 查看当前挂载的数据集目录
!ls /home/kesci/input/

In [None]:
# 查看个人持久化工作区文件
!ls /home/kesci/work/

In [None]:
# List the packages installed in the current kernel
!pip list --format=columns

In [None]:
# 显示cell运行时长
%load_ext klab-autotime

In [None]:
# lgb模型

In [None]:
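# Helper that reads the feature files and concatenates everything it reads into one DataFrame.
# Note: featurecol_h5 controls which feature columns are used for training.
# Note: featuremap_h5 (inside ReadData) controls which file each feature group is read from.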

# 数据转换函数工具

# 计算qauc函数工具

In [None]:
# -*- coding: utf-8 -*- 
import pandas as pd
import numpy as np
import gc

# 显示cell运行时长
%load_ext klab-autotime

# Registry of feature groups: each key names a feature group and maps to the
# list of column names taken from that group. The insertion order of the
# groups, and the order of the names inside each list, is kept exactly as
# before because code iterating this dict sees the columns in this order.
featurecol_h5 = dict([
    ("cross_feat", ["query_in_title", "query_title_pos"]),
    # query_pos_1 ... query_pos_10
    ("query_pos_feat", ["query_pos_%d" % pos for pos in range(1, 11)]),
    ("sim_feat", [
        "jaccard_q3_t3",
        "jaccard_q3_t5",
        "jaccard_q5_t5", "levenshtein_q5_t5",
        "jaccard_q5_t10", "levenshtein_q5_t10",
        "jaccard_q10_t10", "levenshtein_q10_t10",
        "jaccard_q15_t25", "levenshtein_q15_t25",
        "jaccard", "levenshtein",
    ]),
    ("fuzz", [
        "token_sort_ratio", "token_set_ratio",
        "partial_ratio", "partial_token_sort_ratio",
        "partial_token_set_ratio", "QRatio", "WRatio",
    ]),
    ("textpair", [
        "total_unique_words", "wc_ratio_unique",
        "wc_diff_unique", "token_set_diff", "same_start",
    ]),
    ("len_feat", [
        "titlekw_num", "titlekw_querykw_diff",
        "titlekw_querykw_sum", "titlekw_querykw_rate",
    ]),
    ("nunique_feat", ["title_nunique_query"]),
    ("title_score_count_feat", [
        "title_score_count", "title_score_click_num", "title_click_rate",
    ]),
    ("title_code_score_feat", ["title_code_score"]),
    ("title_convert_feat", ["title_code_convert", "title_code_label_count"]),
    ("count_feat", ["title_count"]),
    ("tag", ["tag"]),
    ("tag_score_feat", ["tag_score"]),
    ("tag_convert_feat", ["tag_convert", "tag_label_count"]),
    ("match_feat", [
        "count_match", "blockcount_match", "proximity", "maxMatchBlockLen",
        "q1_match_start", "q1_match_end",
    ]),
    ("editdistance_relativepos", ["editdistance", "relative_pos"]),
    ("BM25", ["BM25"]),
    ("sif_feat", ["sif_cos"]),
    ("NN_SIM", ["NN_SIM"]),
    ("sen_dis", [
        "sent_cosine", "sent_cityblock",
        "sent_canberra", "sent_euclidean", "sent_minkowski",
        "sent_braycurtis",
    ]),
    ("sen_dis2", ["skew_q", "skew_t", "kurtosis_q", "kurtosis_t"]),
])

# Lower the memory footprint of a DataFrame by downcasting numeric columns.
def reduce_mem_usage(D,verbose=True):
    """Downcast the numeric columns of ``D`` in place to the smallest fitting dtype.

    Float columns are downcast with ``downcast='float'``, signed integer
    columns with ``downcast='signed'`` and unsigned integer columns with
    ``downcast='unsigned'``. Columns of any other dtype kind are left alone.

    Parameters
    ----------
    D : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # dtype.kind -> value for the ``downcast`` argument of pd.to_numeric.
    downcast_by_kind = {'f': 'float', 'i': 'signed', 'u': 'unsigned'}

    start_mem = D.memory_usage().sum() / 1024**2
    for c, d in zip(D.columns, D.dtypes):
        target = downcast_by_kind.get(d.kind)
        if target is not None:
            D[c] = pd.to_numeric(D[c], downcast=target)
    end_mem = D.memory_usage().sum() / 1024**2

    if verbose:
        # A frame that reports 0 bytes would otherwise divide by zero here.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return D

def ReadData(datatype='train',start=0,nrows=100000000):
    """Read the id/label CSV for one split and append every configured feature group.

    The CSV supplies the identifying columns (``query_id`` plus ``label`` for
    ``'train'`` or ``query_title_id`` for the test splits). For every key of
    the module-level ``featurecol_h5`` the matching HDF5 file is read (key
    ``'data'``, rows ``start`` to ``start + nrows``) and concatenated
    column-wise onto the CSV columns.

    Parameters
    ----------
    datatype : {'train', 'test1', 'test2'}
        Which split to load.
    start : int, default 0
        Number of leading CSV lines to skip; also the first HDF5 row to read.
    nrows : int, default 100000000
        Maximum number of rows to read from the CSV and from each HDF5 file.

    Returns
    -------
    pandas.DataFrame
        The CSV columns followed by the columns of every feature file.

    Raises
    ------
    ValueError
        If ``datatype`` is not one of the supported splits.
    """
    # split name -> (csv path, csv column positions, column names, feature directory)
    sources = {
        'train': ('/home/kesci/input/bytedance/train_final.csv',
                  [0, 4], ['query_id', 'label'],
                  "/home/kesci/work/pre_3billion_data/train/"),
        'test1': ('/home/kesci/input/bytedance/test_final_part1.csv',
                  [0, 2], ['query_id', 'query_title_id'],
                  "/home/kesci/work/post_4kw_data/test1/"),
        'test2': ('/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
                  [0, 2], ['query_id', 'query_title_id'],
                  "/home/kesci/work/post_4kw_data/test2/"),
    }
    if datatype not in sources:
        # Previously an unknown split fell through and crashed later on an
        # unbound local variable.
        raise ValueError(
            "unknown datatype {!r}; expected one of {}".format(datatype, sorted(sources)))
    id_feature, usecols, names, path_h5 = sources[datatype]

    print("正在读取：",id_feature)
    DataSet = pd.read_csv(id_feature,
                          header=None,
                          skiprows=start,
                          nrows=nrows,
                          usecols=usecols,
                          names=names
                          )
    print("length:",len(DataSet))
    DataSet = reduce_mem_usage(DataSet, verbose=True)

    # Feature-group name -> HDF5 file. Only the groups that are keys of
    # featurecol_h5 are actually read below.
    featuremap_h5={
        'cross_feat':path_h5+f'cross_{datatype}_feat.h5',

        'query_pos_feat':path_h5+f'query_pos_{datatype}_feat.h5',
        'title_pos_feat':path_h5+f'title_pos_{datatype}_feat.h5',

        'match_feat':path_h5+f'query_match_{datatype}_feat.h5',
        'editDistance_feat':path_h5+f'editDistance_{datatype}_feat.h5',

        'sim_feat':path_h5+f'sim_{datatype}_feat.h5',
        'tag_score_feat':path_h5+f'tag_score_10foldtime_{datatype}_feat.h5',
        'title_score_count_feat':path_h5+f'title_score_count_{datatype}_feat.h5',
        'title_code_score_feat':path_h5+f'title_code_score_10foldtime_{datatype}_feat.h5',
        'title_convert_feat':path_h5+f'title_convert_{datatype}.h5',
        'sif_feat':path_h5+f'sif_{datatype}_post_4kw.h5',
        'len_feat':path_h5+f'len_{datatype}_feat.h5',
        'count_feat':path_h5+f'count_feature_{datatype}.h5',
        "nunique_feat":path_h5+f'nunique_feature_{datatype}.h5',

        'tag' :path_h5+f'tag_{datatype}.h5',
        # path_h5 already ends with "/", so this path contains a doubled slash.
        "tag_convert_feat":path_h5+f"/tag_convert_{datatype}.h5",
        "query_convert":path_h5+f"query_convert_{datatype}.h5",

        "M_cosine":path_h5+f"M_sim_{datatype}_feat.h5",
        "M_tfidf_cosine":path_h5+f"M_tfidf_sim_{datatype}_feat.h5",
        "BM25":path_h5+f'BM25_{datatype}_feat.h5',
        'NN_SIM':path_h5+'nn_sim_feature.h5',

        'editdistance_relativepos':path_h5+f'editdistance_relativepos_{datatype}_feat.h5',
        'fuzz' :path_h5+f"fuzz_{datatype}_feat.h5",
        'textpair':path_h5+f"textpair_{datatype}_feat.h5",

        'sen_dis':path_h5+f"sen_dis_{datatype}_200.h5",
        'sen_dis2':path_h5+f"sen_dis2_{datatype}_200.h5",
    }

    # Collect every piece and concatenate once: concatenating inside the loop
    # re-copies the ever-growing frame on each iteration.
    frames = [DataSet]
    for featurefile in featurecol_h5:
        print("正在读取：",featuremap_h5[featurefile])
        feature_set = pd.read_hdf(featuremap_h5[featurefile], key='data',
                                  start=start, stop=start+nrows).reset_index(drop=True)
        print("length:",len(feature_set))
        frames.append(feature_set)
    DataSet = pd.concat(frames, axis=1)
    del frames

    print("Data Read Finish!")
    return DataSet

# *************************************** Data conversion ************************************
def getCalcfeat(FeatureData):
    """Add derived columns to ``FeatureData`` in place and downcast the frame.

    * When both ``querykw_num`` and ``titlekw_num`` are present, adds
      ``titlekw_querykw_rate`` (ratio), ``titlekw_querykw_diff`` (difference)
      and ``titlekw_querykw_sum`` (sum).
    * When both ``title_score_count`` and ``title_score_click_num`` are
      present, adds ``title_click_rate`` = clicks / count, with 0 wherever the
      count equals 0.

    The frame is finally passed through ``reduce_mem_usage``.

    Parameters
    ----------
    FeatureData : pandas.DataFrame
        Feature table; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The value returned by ``reduce_mem_usage(FeatureData)``.
    """
    # Snapshot of the columns present on entry; the derived columns added
    # below must not influence the checks.
    columns = list(FeatureData.columns)

    if "querykw_num" in columns and "titlekw_num" in columns:
        print("计算长度比率特征..")
        title_len = FeatureData["titlekw_num"]
        query_len = FeatureData["querykw_num"]
        FeatureData["titlekw_querykw_rate"] = title_len / query_len
        FeatureData["titlekw_querykw_diff"] = title_len - query_len
        FeatureData["titlekw_querykw_sum"] = title_len + query_len

    if "title_score_count" in columns and "title_score_click_num" in columns:
        print("计算title点击率特征..")
        has_count = FeatureData["title_score_count"] != 0
        # Start from a float column: the rates written below are floats, and
        # writing floats into an integer column through .loc is an upcast
        # that recent pandas versions warn about.
        FeatureData["title_click_rate"] = 0.0
        FeatureData.loc[has_count, "title_click_rate"] = (
            FeatureData.loc[has_count, "title_score_click_num"]
            / FeatureData.loc[has_count, "title_score_count"]
        )

    FeatureData = reduce_mem_usage(FeatureData, verbose=True)
    print("calc feat done")
    return FeatureData

def preDataXGBoost(FeatureData,hasLabel=True,datatype=None):
    """Convert a feature DataFrame into the numpy arrays used for training/prediction.

    The frame is first passed through ``getCalcfeat``. The feature matrix is
    then built from the columns listed in ``featurecol_h5``, in the order the
    groups and their columns appear there.

    Parameters
    ----------
    FeatureData : pandas.DataFrame
        Feature table; must contain ``label`` when ``hasLabel`` is true.
    hasLabel : bool, default True
        When true, also return the query-group sizes and the labels.
    datatype : str or None
        Selects the group file
        ``/home/kesci/work/pre_3billion_data/groups_{datatype}.npy``.
        Required when ``hasLabel`` is true.

    Returns
    -------
    numpy.ndarray or tuple
        ``featuredata`` when ``hasLabel`` is false, otherwise
        ``(group, featuredata, target)``.

    Raises
    ------
    ValueError
        If ``hasLabel`` is true and ``datatype`` is ``None``.
    """
    if hasLabel and datatype is None:
        # Previously this only printed a message and then crashed on an
        # undefined ``group`` after all the work had been done.
        raise ValueError("datatype must be set when hasLabel=True")

    FeatureData = getCalcfeat(FeatureData)

    cols = [col for featurefile in featurecol_h5 for col in featurecol_h5[featurefile]]
    # ``.values`` instead of ``.get_values()``, which is not available in
    # pandas 1.0 and later.
    featuredata = FeatureData[cols].values

    if not hasLabel:
        return featuredata

    group_path = f'/home/kesci/work/pre_3billion_data/groups_{datatype}.npy'
    group = np.load(group_path)
    target = FeatureData.label.values
    return group,featuredata,target

# *************************************** AUC computation ************************************
def calAUC(labels,prob):
    """Rank-based AUC of ``prob`` against binary ``labels``.

    Samples are ranked by ascending score; samples that share a score all get
    the average of the ranks they occupy, so the result does not depend on
    the input order of tied samples. The value is
    ``(sum of positive ranks - P*(P+1)/2) / (P*N)`` where ``P`` is the number
    of samples whose label equals 1 and ``N`` the number of all others.

    Parameters
    ----------
    labels : sequence
        Labels; entries equal to 1 are positives, everything else negatives.
    prob : sequence of float
        Predicted scores, same length as ``labels``.

    Returns
    -------
    float
        The AUC.

    Raises
    ------
    ZeroDivisionError
        If there is no positive or no negative sample.
    """
    pairs = sorted(zip(prob, labels), key=lambda pair: pair[0])
    total = len(pairs)

    pos_rank_sum = 0.0
    pos_num = 0
    first = 0
    while first < total:
        # [first, last) is a run of samples sharing the same score.
        last = first + 1
        while last < total and pairs[last][0] == pairs[first][0]:
            last += 1
        # 1-based ranks first+1 .. last, averaged over the run.
        avg_rank = (first + 1 + last) / 2
        tied_pos = sum(1 for k in range(first, last) if pairs[k][1] == 1)
        pos_rank_sum += avg_rank * tied_pos
        pos_num += tied_pos
        first = last

    neg_num = total - pos_num
    auc = (pos_rank_sum - (pos_num * (pos_num + 1)) / 2) / (pos_num * neg_num)
    return auc

# Sum the per-query AUC values over one chunk of query groups.
def sumAUC(mycombinedata):
    """Return the sum of per-group AUCs for one chunk of data.

    Parameters
    ----------
    mycombinedata : sequence of length 3
        ``[grouplist, y_true, y_pred]`` where ``grouplist`` holds the number
        of consecutive rows belonging to each query, and ``y_true`` /
        ``y_pred`` hold the labels and scores of those rows in the same order.

    Returns
    -------
    float or int
        Sum over all groups of the group AUC. A group that does not contain
        both a 0 and a 1 label contributes 0.5.

    Raises
    ------
    ValueError
        If ``len(y_true)`` differs from ``sum(grouplist)``.
    """
    grouplist, y_true, y_pred = mycombinedata[0], mycombinedata[1], mycombinedata[2]

    expected_rows = sum(grouplist)
    if len(y_true) != expected_rows:
        # Returning None here (the previous behaviour) only surfaced later as
        # an unrelated TypeError when the partial sums were added up.
        raise ValueError(
            "len(y_true) != sum(group): {} != {}".format(len(y_true), expected_rows))

    start = 0
    sum_AUC = 0
    for group in grouplist:
        stop = start + group
        group_labels = y_true[start:stop]
        if 0 in group_labels and 1 in group_labels:
            roc_auc = calAUC(group_labels, y_pred[start:stop])
        else:
            # AUC is undefined for a single-class group; count it as 0.5.
            roc_auc = 0.5
        sum_AUC = sum_AUC + roc_auc
        start = stop
    return sum_AUC

from joblib import Parallel, delayed
from sklearn.metrics import roc_curve, auc
def qAUC(y_true,y_pred,group,groupnum=16):
    """Mean per-query AUC, computed in parallel over chunks of query groups.

    ``group`` is split into at most ``groupnum`` consecutive chunks; every
    chunk, together with the matching rows of ``y_true`` / ``y_pred``, is
    handed to ``sumAUC`` in a joblib worker. The partial sums are added and
    divided by the number of groups.

    Parameters
    ----------
    y_true : sequence
        Labels of all rows, ordered group by group.
    y_pred : sequence
        Scores of all rows, in the same order as ``y_true``.
    group : sequence of int
        Number of consecutive rows belonging to each query.
    groupnum : int, default 16
        Maximum number of chunks, also passed to joblib as ``n_jobs``.

    Returns
    -------
    tuple
        ``("qAUC", value)``.

    Raises
    ------
    ValueError
        If ``group`` is empty or ``groupnum`` is smaller than 1.
    """
    import math

    if groupnum < 1:
        raise ValueError("groupnum must be at least 1")
    if len(group) == 0:
        # Previously ended in a ZeroDivisionError after spawning the workers.
        raise ValueError("group is empty; qAUC is undefined")

    chunk_len = math.ceil(len(group) / groupnum)

    # Stepping by chunk_len yields only the non-empty chunks, so no worker is
    # started for an empty slice (an empty slice would contribute 0 anyway).
    tasks = []
    offset = 0
    for first in range(0, len(group), chunk_len):
        chunk = group[first:first + chunk_len]
        rows = sum(chunk)
        tasks.append([chunk, y_true[offset:offset + rows], y_pred[offset:offset + rows]])
        offset = offset + rows

    partial_sums = Parallel(n_jobs=groupnum)(delayed(sumAUC)(task) for task in tasks)

    return "qAUC",sum(partial_sums)/len(group)


In [None]:
# 读取数据,并将数据划分数据集和验证集

In [None]:
# Load the feature table built from the first 100,000,000 rows of the training CSV.
train_query_title = ReadData(datatype='train',nrows=100000000)
print("数据读取完毕")

train_query_title["title_code_score"] = train_query_title["title_code_score"].fillna(0)
train_query_title["tag_score"] = train_query_title["tag_score"].fillna(0)

# Split by query_id: ids 1..VAL_QUERY_COUNT form the validation set, larger
# ids form the training set.
# NOTE(review): the previous comments mentioned different split sizes than the
# code used; the sizes below are the ones the code has always applied.
VAL_QUERY_COUNT = 1000000

query_ids = train_query_title.query_id
max_query_id = query_ids.max()

# Plain comparisons select the same rows as the former
# isin(list(range(1, max_query_id))[...]) without building a Python list with
# one entry per query id.
# NOTE(review): rows of the largest query_id end up in neither set, exactly as
# before (range(1, max) excludes max). Presumably deliberate because the row
# limit can cut the last query short - confirm.
val_rows = (query_ids >= 1) & (query_ids <= min(VAL_QUERY_COUNT, max_query_id - 1))
train_rows = (query_ids > VAL_QUERY_COUNT) & (query_ids < max_query_id)

val_query_title=train_query_title.loc[val_rows]
print("val len:",len(val_query_title))
train_query_title=train_query_title.loc[train_rows]
print("train len:",len(train_query_title))

del query_ids,val_rows,train_rows
gc.collect()

print("测试集验证集划分完毕")

In [None]:
# 对训练集按照queryid分组放入lgb.Dataset()中

In [None]:
import gc
import lightgbm as lgb

Train_flag=True
Valdation_Flag=True

if Train_flag:
    datatype="train"
    group_path = f'/home/kesci/work/pre_3billion_data/groups_{datatype}.npy'
    # Build the feature matrix before reading the labels, as before:
    # preDataXGBoost adds derived columns to the frame in place.
    train_features = preDataXGBoost(train_query_title,hasLabel=False)
    # ``.values`` instead of ``.get_values()``, which is not available in
    # pandas 1.0 and later.
    train_data = lgb.Dataset(data=train_features,
                             label=train_query_title.label.values,
                             group=np.load(group_path))
    del train_query_title, train_features
    gc.collect()
    print("train OK")

if Valdation_Flag:
    datatype="validation"
    # Keep the raw validation arrays: the training cell predicts on ``dval``
    # and scores with ``dvaltarget`` / ``dvalgroup`` after fitting, and the
    # DataFrame they come from is deleted below.
    dvalgroup,dval,dvaltarget = preDataXGBoost(val_query_title,hasLabel=True,datatype=datatype)

    val_data = lgb.Dataset(data=dval,
                           label=dvaltarget,
                           group=dvalgroup,
                           reference=train_data)

    del val_query_title
    gc.collect()
    print("test OK")
print("数据转换完毕")

gc.collect()

In [None]:
# lgb模型训练

In [None]:
import os
import time

import lightgbm as lgb

# LightGBM parameters for the lambdarank model.
lgb_rank_params = {
    'boosting_type' : 'gbdt',
    'objective' : 'lambdarank',
    'metric': 'map',
    'random_state' : 2019,
    'n_jobs' : 13,
    'num_leaves' : 195,
    'max_depth' : 12,
    'learning_rate':0.05,
    'max_bin':200,
    'subsample_for_bin':200000,
    'min_split_gain':0.0,
    'min_child_weight':0.001,
    'min_child_samples':20,
    'subsample':0.8,
    'subsample_freq':1,
    'colsample_bytree':0.8,
    'reg_alpha':3.0,
    'reg_lambda':2.0,

    # 'device':'gpu',
    # 'gpu_platform_id':0,
    # 'gpu_device_id':0
}
valdation_flag = True

if valdation_flag:
    model_path = "/home/kesci/work/LGBModel/LGBRank_pre1e_807_final_baseline_3000_1.model"

    # Fail fast: everything needed after training is checked before the long
    # training run starts instead of crashing once it has finished.
    for _required in ("dval", "dvaltarget", "dvalgroup"):
        if _required not in globals():
            raise NameError(
                "{} is not defined; the validation arrays (dval, dvaltarget, "
                "dvalgroup) must exist before training starts".format(_required))
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    print("Train 开始")
    start=time.time()
    # NOTE(review): verbose_eval / early_stopping_rounds are no longer accepted
    # by lgb.train in LightGBM 4.x; switch to the log_evaluation /
    # early_stopping callbacks when upgrading.
    rankModel = lgb.train(lgb_rank_params, train_data, num_boost_round=3000,
                          valid_sets=[train_data,val_data],
                          verbose_eval=1,
                          early_stopping_rounds=300,
                          )
    print("训练完成，训练时间为：",time.time()-start)

    rankModel.save_model(model_path)
    print("save model finish.")

    start=time.time()
    test_pred=rankModel.predict(dval)
    print("预测完成，预测时间为：",time.time()-start)

    start=time.time()
    score_cur=qAUC(dvaltarget,test_pred,dvalgroup)
    print("计算qAUC完成，计算时间为：",time.time()-start)
    print("qAUC值为：",score_cur)


In [None]:
# 模型test1集预测

In [None]:
import os

print("begin")

# Load the test1 features and fill the two score columns that may be missing.
test_query_title = ReadData(datatype='test1')
test_query_title["title_code_score"] = test_query_title["title_code_score"].fillna(0)
test_query_title["tag_score"] = test_query_title["tag_score"].fillna(0)
print("数据读取完毕")

dtest = preDataXGBoost(test_query_title,hasLabel=False)
print("dtest OK")

# Reload the model saved by the training cell, so this cell does not depend
# on the in-memory model object.
from lightgbm import Dataset,Booster
rankModel=Booster(model_file="/home/kesci/work/LGBModel/LGBRank_pre1e_807_final_baseline_3000_1.model")

test_pred=rankModel.predict(dtest)
print("test_pred OK")

# Copy the id columns so the new column is written to an independent frame
# rather than to a slice of test_query_title, and assign the predictions
# positionally instead of through an index-aligned DataFrame.
submission=test_query_title[["query_id","query_title_id"]].copy()
submission["prediction"]=test_pred

SaveFile="/home/kesci/work/Submission/LGBRank_pre1e_807_final_baseline_3000_1.csv"
# Make sure the output directory exists before writing.
os.makedirs(os.path.dirname(SaveFile), exist_ok=True)
submission.to_csv(path_or_buf=SaveFile,header=False,index=False,encoding="utf-8")
print("submission OK")