In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# 一、特征工程

## 1、合并训练集和测试集

In [2]:
def merge_data():
    """Load the phase-1 CSV files and stack train and test queries into one frame.

    Training queries are left-joined with their plans and clicks on 'sid';
    'click_time' is dropped and a missing 'click_mode' becomes 0. Test queries
    are left-joined with their plans and given click_mode = -1 as a marker.

    Returns:
        DataFrame with train rows first, then test rows, 'plan_time' removed
        and a fresh 0..n-1 index.
    """
    # Labels
    clicks_train = pd.read_csv("data_set_phase1//train_clicks.csv")

    # Feature tables
    plans_train = pd.read_csv("data_set_phase1//train_plans.csv")
    queries_train = pd.read_csv("data_set_phase1//train_queries.csv")
    plans_test = pd.read_csv("data_set_phase1//test_plans.csv")
    queries_test = pd.read_csv("data_set_phase1//test_queries.csv")

    # Training part: queries + plans + clicks; rows without a click get mode 0
    labelled = (
        queries_train
        .merge(plans_train, on='sid', how='left')
        .merge(clicks_train, on='sid', how='left')
        .drop(['click_time'], axis=1)
        .assign(click_mode=lambda frame: frame['click_mode'].fillna(0))
    )

    # Test part: queries + plans; -1 marks "label unknown"
    unlabelled = (
        queries_test
        .merge(plans_test, on='sid', how='left')
        .assign(click_mode=-1)
    )

    # Stack train on top of test
    stacked = pd.concat([labelled, unlabelled], axis=0)
    return stacked.drop(['plan_time'], axis=1).reset_index(drop=True)

## 2、抽取o、d的特征

In [3]:
def gen_od_feature(all_data):
    """Split the 'o' and 'd' "first,second" strings into float columns.

    Adds 'o1', 'o2' (the two comma-separated parts of 'o') and 'd1', 'd2'
    (same for 'd'), then removes the original 'o' and 'd' columns.

    The input frame is left untouched; a new DataFrame is returned. Missing
    values, or strings with no second part, yield NaN instead of raising.

    Args:
        all_data: DataFrame with string columns 'o' and 'd'.

    Returns:
        New DataFrame without 'o'/'d' and with 'o1', 'o2', 'd1', 'd2' appended.
    """
    coordinate_columns = {}
    for source in ('o', 'd'):
        parts = all_data[source].str.split(',')
        coordinate_columns[source + '1'] = parts.str[0].astype(float)
        coordinate_columns[source + '2'] = parts.str[1].astype(float)

    # drop() returns a new frame, so the caller's DataFrame is never modified.
    return all_data.drop(['o', 'd'], axis=1).assign(**coordinate_columns)

## 3、抽取plans的特征

### 提取plans特征
### 1、max_distance、min_distance、mean_distance、std_distance
### 2、max_price、min_price、mean_price、std_price
### 3、max_eta、min_eta、mean_eta、std_eta
### 4、max_dis_mode、min_dis_mode、max_price_mode、min_price_mode、max_eta_mode、min_eta_mode
### 5、first_mode

In [4]:
def gen_plan_feature(all_data):
    """Expand the JSON 'plans' column into numeric per-row features.

    Each 'plans' value is parsed as a JSON list of plan dicts carrying the
    keys 'distance', 'price', 'eta' and 'transport_mode'. The 'plans' column
    is dropped and these columns are appended, in this order:

    * svd_mode_0..9   - TruncatedSVD of a TF-IDF (1- and 2-gram) encoding of
                        the recommended mode sequence
    * mode_feas_0..11 - multi-hot flags of the recommended modes; column 0 is
                        set when the row has no plans
    * max/min/mean/std of distance, price and eta
    * max_dis_mode, min_dis_mode, max_price_mode, min_price_mode,
      max_eta_mode, min_eta_mode - mode of the plan holding that extreme
    * first_mode      - mode of the first plan in the list

    Rows whose plans are missing, unparseable or empty get -1 in every
    statistic and *_mode column, and 0 in first_mode.

    Args:
        all_data: DataFrame with a 'plans' column of JSON strings.

    Returns:
        New DataFrame with 'plans' replaced by the features above. The new
        columns are aligned on all_data's own index.
    """
    n_rows = all_data.shape[0]
    n_modes = 12       # width of the multi-hot block (mode_feas_0..11)
    n_components = 10  # number of SVD components kept (svd_mode_0..9)

    # (key in the plan dict, suffix of the stat columns, infix of the *_mode columns)
    plan_fields = [('distance', 'distance', 'dis'),
                   ('price', 'price', 'price'),
                   ('eta', 'eta', 'eta')]
    stat_cols = ['{}_{}'.format(stat, suffix)
                 for _, suffix, _ in plan_fields
                 for stat in ('max', 'min', 'mean', 'std')]
    mode_cols = ['{}_{}_mode'.format(stat, infix)
                 for _, _, infix in plan_fields
                 for stat in ('max', 'min')]
    feature_cols = stat_cols + mode_cols + ['first_mode']

    mode_list_feas = np.zeros((n_rows, n_modes))
    features = {col: np.zeros((n_rows,)) for col in feature_cols}

    # One "sentence" per row describing the order of the recommended modes
    mode_texts = []

    # Wrapping the array (not the enumerate object) lets tqdm show the total.
    for i, plan in enumerate(tqdm(all_data['plans'].values)):
        try:
            user_plan_list = json.loads(plan)
        except (TypeError, ValueError):
            # TypeError: missing value (e.g. NaN); ValueError: malformed JSON
            user_plan_list = []

        if not user_plan_list:
            mode_list_feas[i, 0] = 1
            for col in stat_cols + mode_cols:
                features[col][i] = -1
            # first_mode keeps its initial 0
            mode_texts.append('word_null')
            continue

        modes = [int(tmp_dict['transport_mode']) for tmp_dict in user_plan_list]
        mode_texts.append(' '.join(['word_{}'.format(mode) for mode in modes]))

        mode_list = np.array(modes, dtype='int')
        # Flag every mode that was recommended for this row
        mode_list_feas[i, mode_list] = 1
        features['first_mode'][i] = mode_list[0]

        for key, suffix, infix in plan_fields:
            # An empty-string price is counted as 0
            values = np.array([
                0 if (key == 'price' and tmp_dict[key] == '') else int(tmp_dict[key])
                for tmp_dict in user_plan_list
            ])
            # argsort (rather than argmax/argmin) keeps the original tie-breaking
            order = np.argsort(values)

            features['max_' + suffix][i] = values[order[-1]]
            features['min_' + suffix][i] = values[order[0]]
            features['mean_' + suffix][i] = np.mean(values)
            features['std_' + suffix][i] = np.std(values)

            features['max_{}_mode'.format(infix)][i] = mode_list[order[-1]]
            features['min_{}_mode'.format(infix)][i] = mode_list[order[0]]

    # TF-IDF over the mode sequences, reduced with truncated SVD
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf.fit_transform(mode_texts)
    svd = TruncatedSVD(n_components=n_components, n_iter=20, random_state=2019)
    mode_svd = svd.fit_transform(tfidf_vec)

    # Build every generated frame on all_data's index so the column-wise concat
    # lines up row by row even when that index is not 0..n-1.
    row_index = all_data.index
    mode_svd = pd.DataFrame(
        mode_svd, index=row_index,
        columns=['svd_mode_{}'.format(i) for i in range(n_components)])
    mode_flags = pd.DataFrame(
        mode_list_feas, index=row_index,
        columns=['mode_feas_{}'.format(i) for i in range(n_modes)])
    plan_stats = pd.DataFrame(features, index=row_index, columns=feature_cols)

    return pd.concat(
        [all_data.drop(['plans'], axis=1), mode_svd, mode_flags, plan_stats],
        axis=1)

## 4、抽取profiles数据集特征

In [20]:
def gen_profiles_feature(all_data):
    """Left-join the user profile table onto all_data by 'pid'.

    Rows with a missing 'pid' are mapped to a placeholder profile with
    pid == -1 whose other columns are all 0.

    The input frame is left untouched; a new DataFrame is returned.

    Args:
        all_data: DataFrame with a 'pid' column (may contain NaN).

    Returns:
        New DataFrame: all_data with NaN pids replaced by -1 and the columns
        of profiles.csv merged in.
    """
    profiles = pd.read_csv("data_set_phase1//profiles.csv")

    # Placeholder profile for rows without a pid. Its width follows the CSV
    # instead of a hard-coded column count, and 'pid' is addressed by name.
    profiles_na = pd.DataFrame(np.zeros((1, profiles.shape[1])),
                               columns=profiles.columns)
    profiles_na['pid'] = -1.0
    # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
    profiles = pd.concat([profiles, profiles_na], axis=0, ignore_index=True)

    # NOTE: a TruncatedSVD reduction of the profile columns was prototyped here
    # earlier and left disabled; the raw profile columns are merged as-is.

    # Merge on a copy so the caller's 'pid' column is not overwritten.
    filled = all_data.assign(pid=all_data['pid'].fillna(-1))
    return filled.merge(profiles, on='pid', how='left')

## 5、抽取时间特征（req_time）

In [13]:
def gen_time_feature(all_data):
    """Replace 'req_time' with day-of-week and hour-of-day columns.

    'req_time' is parsed with pd.to_datetime; 'dayofweek' (Monday == 0) and
    'hourofday' are appended and 'req_time' is removed.

    The input frame is left untouched; a new DataFrame is returned.

    Args:
        all_data: DataFrame with a 'req_time' column parseable as datetimes.

    Returns:
        New DataFrame without 'req_time' and with 'dayofweek' and 'hourofday'
        appended.
    """
    req_time = pd.to_datetime(all_data['req_time'])
    return all_data.drop(['req_time'], axis=1).assign(
        dayofweek=req_time.dt.dayofweek,
        hourofday=req_time.dt.hour,
    )

## 6、切分训练集和测试集

In [7]:
def train_test_split(all_data):
    """Separate the combined frame back into model inputs.

    Rows with click_mode == -1 are test rows; all others are training rows.

    Args:
        all_data: DataFrame holding 'sid', 'pid', 'click_mode' and the
            feature columns.

    Returns:
        (train_x, train_y, test_x, submit) where train_x/test_x are the
        feature columns without 'sid', 'pid' and 'click_mode', train_y is the
        training 'click_mode' column, and submit is a copy of the test 'sid'
        column as a one-column DataFrame.
    """
    id_columns = ['sid', 'pid']
    is_test_row = all_data['click_mode'] == -1

    labelled = all_data[~is_test_row]
    unlabelled = all_data[is_test_row].drop(['click_mode'], axis=1)

    # Keep the test ids for the submission file before they are dropped
    submit = unlabelled[['sid']].copy()

    labelled = labelled.drop(id_columns, axis=1)
    train_y = labelled['click_mode']
    train_x = labelled.drop(['click_mode'], axis=1)
    test_x = unlabelled.drop(id_columns, axis=1)

    return train_x, train_y, test_x, submit

In [18]:
# Run the feature pipeline end to end: load and merge the raw tables, then
# apply each feature step in order before splitting back into train and test.
all_data = (
    merge_data()
    .pipe(gen_od_feature)
    .pipe(gen_plan_feature)
    .pipe(gen_profiles_feature)
    .pipe(gen_time_feature)
)
train_x, train_y, test_x, submit = train_test_split(all_data)

594358it [04:02, 2447.44it/s]


# 7、训练模型

In [19]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import f1_score
from time import gmtime, strftime
from sklearn.model_selection import StratifiedKFold

def eval_f(y_pred, train_data):
    """Custom LightGBM eval function: weighted F1 for a multiclass model.

    Accepts the class scores either as a flat, class-major vector (all rows'
    scores for class 0, then class 1, ...) or as a 2-D (n_samples, n_classes)
    array. The number of classes is derived from the number of labels instead
    of being hard-coded.
    NOTE(review): which layout LightGBM passes depends on its version -
    confirm against the installed release.

    Args:
        y_pred: class scores in one of the two layouts above.
        train_data: dataset object exposing the true labels as `.label`.

    Returns:
        ('weighted-f1-score', score, True); the final True marks the metric
        as higher-is-better.
    """
    y_true = train_data.label
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 1:
        # Flat class-major layout -> (n_classes, n_samples) -> (n_samples, n_classes)
        y_pred = y_pred.reshape(-1, len(y_true)).T
    y_pred = np.argmax(y_pred, axis=1)
    f1 = f1_score(y_true, y_pred, average='weighted')
    return 'weighted-f1-score', f1, True

# 5-fold stratified cross-validation; the test-set probabilities of the fold
# models are averaged for the final prediction.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)

lgb_paras = {
    'objective': 'multiclass',
    'metrics': 'multiclass',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'lambda_l1': 0.01,
    'lambda_l2': 10,
    'num_class': 12,
    'seed': 2019,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4
}

categorical_feature = ['max_dis_mode', 'min_dis_mode', 'max_price_mode', 'min_price_mode',
                       'max_eta_mode', 'min_eta_mode', 'first_mode','dayofweek','hourofday']
scores = []
result_proba = []
for tra_idx, val_idx in kfold.split(train_x, train_y):
    # kfold.split yields positional indices, so both the features and the
    # labels are sliced with .iloc (train_y[tra_idx] would be a label lookup
    # and only works while the index happens to be 0..n-1).
    tra_x, val_x = train_x.iloc[tra_idx], train_x.iloc[val_idx]
    tra_y, val_y = train_y.iloc[tra_idx], train_y.iloc[val_idx]

    train_set = lgb.Dataset(tra_x, tra_y, categorical_feature=categorical_feature)
    val_set = lgb.Dataset(val_x, val_y, categorical_feature=categorical_feature)
    lgb_model = lgb.train(lgb_paras, train_set, valid_sets=[val_set],
                          early_stopping_rounds=50, num_boost_round=40000,
                          verbose_eval=50, feval=eval_f)

    # Score this fold on its validation part at the best iteration
    val_pred = np.argmax(lgb_model.predict(val_x, num_iteration=lgb_model.best_iteration), axis=1)
    val_score = f1_score(val_y, val_pred, average='weighted')
    result_proba.append(lgb_model.predict(test_x, num_iteration=lgb_model.best_iteration))
    scores.append(val_score)
print('cv f1_score:', np.mean(scores))

# Average the fold probabilities, then take the most likely class
pred_test = np.argmax(np.mean(result_proba, axis=0), axis=1)

# Write the submission file (timestamp is in UTC)
now_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
submit['recommend_mode'] = pred_test
submit.to_csv('submission_{}.csv'.format(now_time), index=False)

  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 1.01795	valid_0's weighted-f1-score: 0.669467
[100]	valid_0's multi_logloss: 0.923867	valid_0's weighted-f1-score: 0.673266
[150]	valid_0's multi_logloss: 0.901965	valid_0's weighted-f1-score: 0.67495
[200]	valid_0's multi_logloss: 0.895609	valid_0's weighted-f1-score: 0.675337
[250]	valid_0's multi_logloss: 0.893235	valid_0's weighted-f1-score: 0.67557
[300]	valid_0's multi_logloss: 0.892134	valid_0's weighted-f1-score: 0.675779
[350]	valid_0's multi_logloss: 0.891532	valid_0's weighted-f1-score: 0.675988
[400]	valid_0's multi_logloss: 0.89113	valid_0's weighted-f1-score: 0.676224
[450]	valid_0's multi_logloss: 0.890888	valid_0's weighted-f1-score: 0.676503
[500]	valid_0's multi_logloss: 0.890752	valid_0's weighted-f1-score: 0.676508
Early stopping, best iteration is:
[485]	valid_0's multi_logloss: 0.890778	valid_0's weighted-f1-score: 0.676696
Training until validation scores don't improve for

### 总结：对profiles特征做SVD降维，得分为0.675