In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_absolute_error as MAE
import gc
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import seaborn as sns

%matplotlib inline
sns.set()

In [2]:
# load dataset

df_order = pd.read_csv("dataset/order.csv")
df_group = pd.read_csv("dataset/group.csv")
df_airline = pd.read_csv("dataset/airline.csv")
df_day_schedule = pd.read_csv("dataset/day_schedule.csv")
df_train = pd.read_csv("dataset/training-set.csv")
df_test = pd.read_csv("dataset/testing-set.csv")

In [3]:
# date Conversion

month = {'Jan': '01', 'Feb': '02' , 'Mar':'03' ,'Apr': '04',
'May': '05', 'Jun': '06' , 'Jul': '07' , 'Aug':'08',
'Sep':'09', 'Oct':'10' , 'Nov':'11', 'Dec':'12' }

def Convert_Date(x):
    Year='20'+x[-2:]
    Month=month[x[-6:-3]]
    Day=x[:-7]
    return pd.to_datetime(Year+'-'+Month+'-'+Day)


In [4]:
# group data

df_group['Begin_Date']=df_group.begin_date.apply(lambda x: Convert_Date(x))
df_group['SubLine']= df_group.sub_line.apply(lambda x: int(x[14:]))
df_group['Area']= df_group.area.apply(lambda x: int(x[11:]))
group_used_cols=['group_id','Begin_Date','days','Area','SubLine','price']
df_order_1 = df_order.merge(df_group[group_used_cols], on='group_id')


In [5]:
# for order data

df_order_1['Order_Date']=df_order_1.order_date.apply(lambda x: Convert_Date(x))
df_order_1['Source_1']= df_order_1.source_1.apply(lambda x: int(x[11:]))
df_order_1['Source_2']= df_order_1.source_2.apply(lambda x: int(x[11:]))
df_order_1['Unit']= df_order_1.unit.apply(lambda x: int(x[11:]))
df_order_1['Begin_Date']=pd.to_datetime(df_order_1['Begin_Date'])
df_order_1['Order_Date']=pd.to_datetime(df_order_1['Order_Date'])
df_order_1['PreDays']=(df_order_1['Begin_Date']-df_order_1['Order_Date']).dt.days
df_order_1['Begin_Date_Weekday']= df_order_1['Begin_Date'].dt.dayofweek
df_order_1['Order_Date_Weekday']= df_order_1['Order_Date'].dt.dayofweek
df_order_1['Return_Date_Weekday']= (df_order_1['Begin_Date'].dt.dayofweek+df_order_1['days'])%7

order_used_columns=['order_id', 'group_id','Order_Date', 'Source_1', 'Source_2', 'Unit',
'people_amount', 'Begin_Date', 'days', 'Area', 'SubLine', 'price',
'PreDays','Begin_Date_Weekday', 'Order_Date_Weekday', 'Return_Date_Weekday']

df_order_2=df_order_1[order_used_columns]

In [6]:
# train/test data

df_train_1=df_train.merge(df_order_2,on='order_id')
df_test_1=df_test.merge(df_order_2,on='order_id')

In [7]:
# First predict by lightgbm and by cv

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np
import gc


In [None]:
used_features=['deal_or_not', 'order_id', 'group_id', 'Order_Date', 'Source_1',
'Source_2', 'Unit', 'people_amount', 'Begin_Date', 'days', 'Area',
'SubLine', 'price', 'PreDays', 'Begin_Date_Weekday',
'Order_Date_Weekday', 'Return_Date_Weekday']

folds = StratifiedKFold(n_splits= 10, shuffle=True, random_state=666)

# Create arrays and dataframes to store results
oof_preds = np.zeros(df_train_1.shape[0])
sub_preds = np.zeros(df_test_1.shape[0])
feature_importance_df = pd.DataFrame()

feats = [f for f in df_train_1.columns if f not in ['order_id','deal_or_not','group_id','Order_Date','Begin_Date']]

print ('feats:' + str(len(feats)))

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train_1[feats], df_train_1['deal_or_not'])):
    train_x, train_y = df_train_1[feats].iloc[train_idx], df_train_1['deal_or_not'].iloc[train_idx]
    valid_x, valid_y = df_train_1[feats].iloc[valid_idx], df_train_1['deal_or_not'].iloc[valid_idx]

print("Train Index:",train_idx,",Val Index:",valid_idx)

In [None]:
params = {
'nthread': 32,
'boosting_type': 'dart',
'objective': 'binary',
'metric': 'auc',
'learning_rate': 0.01,
'num_leaves': 70,
'max_depth': 9,
'subsample': 1,
'feature_fraction': 0.9,
'colsample_bytree': 0.08,
'min_split_gain': 0.09,
'min_child_weight': 9.5,
#'reg_alpha': 1,
#'reg_lambda': 50,
'verbose': 1,
# parameters for dart
'drop_rate':0.7,
'skip_drop':0.7,
'max_drop':5,
'uniform_drop':False,
'xgboost_dart_mode':True,
'drop_seed':4
}

In [None]:
if n_fold >= 0:
    dtrain = lgb.Dataset(train_x, label=train_y)
    dval = lgb.Dataset(valid_x, label=valid_y, reference=dtrain)

    bst = lgb.train(
    params, dtrain, num_boost_round=10000,
    valid_sets=[dval], early_stopping_rounds=300, verbose_eval=100)

In [12]:

tmp_valid = bst.predict(valid_x, num_iteration=bst.best_iteration)

tmp_valid.dump('input/kfold_valid_' + str(n_fold) + '.pkl')

oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)

tmp = bst.predict(df_test_1[feats], num_iteration=bst.best_iteration)

tmp.dump('input/kfold_' + str(n_fold) + '.pkl')

sub_preds += bst.predict(df_test_1[feats], num_iteration=bst.best_iteration) / folds.n_splits

# Make the feature importance dataframe

gain = bst.feature_importance('gain')

fold_importance_df = pd.DataFrame({'feature':bst.feature_name(),
'split':bst.feature_importance('split'),
'gain':100*gain/gain.sum(),
'fold':n_fold,
}).sort_values('gain',ascending=False)

feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

del bst, train_x, train_y, valid_x, valid_y

gc.collect()


Fold 10 AUC : 0.679108


2506

In [15]:
app_test = pd.read_csv('dataset/testing-set.csv', usecols=['order_id'])
app_train = pd.read_csv("dataset/training-set.csv")

preds = pd.DataFrame({"order_id":app_test["order_id"], "deal_or_not":sub_preds})

# create output sub-folder

preds.to_csv("output/lgb_dart_" + str(roc_auc_score(app_train['deal_or_not'], oof_preds)) + ".csv", index=False)