In [16]:
import os
import numpy as np
import pandas as pd
import gc
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from numpy.random import RandomState

In [2]:
train = pd.read_csv('./dataset/preprocess/train.csv')
test = pd.read_csv('./dataset/preprocess/test.csv')

In [3]:
train.shape

(201917, 1742)

In [4]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,authorized_flag&1&purchase_amount,authorized_flag&1&installments,city_id&19&purchase_amount,city_id&19&installments,...,category_4_var,category_4_skew,category_4_sum,city_id_nunique,merchant_category_id_nunique,merchant_id_nunique,state_id_nunique,subsector_id_nunique,card_id_size,card_id_count
0,67,C_ID_92a2005557,5,2,1,-0.820283,-170.641218,0.0,-1.422815,0.0,...,0.054623,-3.811953,261.0,9,46,118,3,21,283,283
1,62,C_ID_3d0044924f,4,1,0,0.392913,-213.239185,507.0,-4.782308,7.0,...,0.075036,-3.073118,327.0,9,58,148,3,24,356,356
2,57,C_ID_d639edf6cd,2,2,0,0.688056,-28.528749,0.0,-0.705405,0.0,...,0.065011,-3.54848,41.0,5,9,14,2,8,44,44
3,70,C_ID_186d6a6901,4,3,0,0.142495,-54.145736,89.0,-0.707839,1.0,...,0.023523,-6.36111,82.0,7,28,57,5,15,84,84
4,72,C_ID_cdbd2c0db2,1,3,0,-0.159749,-88.966702,179.0,0.0,0.0,...,0.091496,-2.668681,151.0,7,37,103,7,19,169,169


In [5]:
np.count_nonzero(train) / train.size

0.22607061885876684

In [7]:
features = train.columns.tolist()
features.remove('card_id')
features.remove('target')
featureSelect = features[:]

corr = []
for fea in featureSelect:
    corr.append(abs(train[[fea, 'target']].fillna(0).corr().values[0][1]))

se = pd.Series(corr, index=featureSelect).sort_values(ascending=False)
feature_select = ['card_id'] + se[:300].index.tolist()

train_RF = train[feature_select + ['target']]
test_RF = test[feature_select]

In [8]:
train_RF.shape

(201917, 302)

In [24]:
test_RF.shape

(123623, 301)

In [10]:
def feature_select_pearson(train, test):
    """
    Feature selection based on the pearson coefficient
    :param train:  training set
    :param test: test set
    :return: training and test sets after selection
    """
    print('feature_select...')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")
    featureSelect = features[:]

    # 去掉缺失值比例超过0.99的
    for fea in features:
        if train[fea].isnull().sum() / train.shape[0] >= 0.99:
            featureSelect.remove(fea)

    # 进行pearson相关性计算
    corr = []
    for fea in featureSelect:
        corr.append(abs(train[[fea, 'target']].fillna(0).corr().values[0][1]))

    # 取top300的特征进行建模，具体数量可选
    se = pd.Series(corr, index=featureSelect).sort_values(ascending=False)
    feature_select = ['card_id'] + se[:300].index.tolist()
    print('done')
    return train[feature_select + ['target']], test[feature_select]

In [14]:
def param_grid_search(train):
    """
    网格搜索参数调优
    :param train:训练集
    :return:网格搜索训练结果
    """

    print('param_grid_search')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")
    parameter_space = {
        "n_estimators": [81], 
        "min_samples_leaf": [31],
        "min_samples_split": [2],
        "max_depth": [10],
        "max_features": [80]
    }
    

    print("Tuning hyper-parameters for mse")

    clf = RandomForestRegressor(
        criterion="squared_error",
        n_jobs=15,
        random_state=22)

    grid = GridSearchCV(clf, parameter_space, cv=2, scoring="neg_mean_squared_error")
    grid.fit(train[features].values, train['target'].values)
    

    print("best_params_:")
    print(grid.best_params_)
    means = grid.cv_results_["mean_test_score"]
    stds = grid.cv_results_["std_test_score"]
    # 此处额外考虑观察交叉验证过程中不同超参数的
    for mean, std, params in zip(means, stds, grid.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    return grid

In [15]:
grid = param_grid_search(train_RF)

param_grid_search
Tuning hyper-parameters for mse
best_params_:
{'max_depth': 10, 'max_features': 80, 'min_samples_leaf': 31, 'min_samples_split': 2, 'n_estimators': 81}
-13.617 (+/-0.088) for {'max_depth': 10, 'max_features': 80, 'min_samples_leaf': 31, 'min_samples_split': 2, 'n_estimators': 81}


In [17]:
np.sqrt(-grid.best_score_)

3.690154811274698

# Cross validation

In [28]:
def train_predict(train, test, best_clf):
    """
    To train and predict
    :param train: training set
    :param test: test set
    :param best_clf: the best classifier
    :return:
    """

    # step 1: feature selection
    print('train_predict...')
    features = train.columns.tolist()
    features.remove("card_id")
    features.remove("target")

    # step 2: Cross Validation
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series()

    kf = KFold(n_splits=5, random_state=22, shuffle=True)
    for train_part_index, eval_index in kf.split(train[features], train['target']):
        # train the model on training set
        best_clf.fit(train[features].loc[train_part_index].values, train['target'].loc[train_part_index].values)
        # Add the prediction result
        prediction_test += best_clf.predict(test[features].values)
        # prediction on validation test
        eval_pre = best_clf.predict(train[features].loc[eval_index].values)
        # evaluate on validation prediction with MSE
        score = np.sqrt(mean_squared_error(train['target'].loc[eval_index].values, eval_pre))
        # Put MSE score into cv_score list
        cv_score.append(score)
        print(score)
        # Put prediction on validation set into prediction_train
        prediction_train = prediction_train.append(pd.Series(best_clf.predict(train[features].loc[eval_index]), index=eval_index))
    
    print(cv_score, sum(cv_score)/5)
    pd.Series(prediction_train.sort_index().values).to_csv("./dataset/preprocess/train_randomforest.csv", index=False)
    pd.Series(prediction_test / 5).to_csv("./dataset/preprocess/test_randomforest.csv", index=False)
    # Append the label 'target' to the test set
    test['target'] = prediction_test / 5
    test[['card_id', 'target']].to_csv("result/submission_randomforest_5kf.csv", index=False)
    return

In [29]:
train_predict(train_RF, test_RF, grid.best_estimator_)

train_predict...


  prediction_train = pd.Series()


3.675458048156077


  prediction_train = prediction_train.append(pd.Series(best_clf.predict(train[features].loc[eval_index]), index=eval_index))


3.7098960303168167


  prediction_train = prediction_train.append(pd.Series(best_clf.predict(train[features].loc[eval_index]), index=eval_index))


3.7175960057854875


  prediction_train = prediction_train.append(pd.Series(best_clf.predict(train[features].loc[eval_index]), index=eval_index))


3.682888749975916


  prediction_train = prediction_train.append(pd.Series(best_clf.predict(train[features].loc[eval_index]), index=eval_index))


3.646825949050688
[3.675458048156077, 3.7098960303168167, 3.7175960057854875, 3.682888749975916, 3.646825949050688] 3.686532956656997


  prediction_train = prediction_train.append(pd.Series(best_clf.predict(train[features].loc[eval_index]), index=eval_index))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['target'] = prediction_test / 5
