In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [2]:
train_df = pd.read_csv("./train.csv")

In [3]:
log_df = pd.read_csv("./view_log.csv")

In [4]:
item_df = pd.read_csv("./item_data.csv")

In [5]:
test_df = pd.read_csv("./test.csv")

In [6]:
impr_id = test_df["impression_id"]

In [7]:
train_df.columns, log_df.columns, item_df.columns

(Index(['impression_id', 'impression_time', 'user_id', 'app_code', 'os_version',
        'is_4G', 'is_click'],
       dtype='object'),
 Index(['server_time', 'device_type', 'session_id', 'user_id', 'item_id'], dtype='object'),
 Index(['item_id', 'item_price', 'category_1', 'category_2', 'category_3',
        'product_type'],
       dtype='object'))

In [7]:
gb = log_df.groupby("user_id").size().reset_index().rename(columns={0:'count'})
def get_session_count(x):
    #print(x)
    _x = list(gb[gb["user_id"]==x]["count"])[0]
    return _x

In [8]:
train_labels = train_df["is_click"]
del train_df["is_click"]

In [9]:
merged_df = pd.concat([train_df, test_df], axis=0)
merged_df.shape, train_df.shape, test_df.shape

((328284, 6), (237609, 6), (90675, 6))

In [10]:
merged_df["impression_time"] = pd.to_datetime(merged_df["impression_time"])
merged_df["impression_day"] = merged_df["impression_time"].dt.day
merged_df["impression_month"] = merged_df["impression_time"].dt.month
# merged_df["impression_year"] = merged_df["impression_time"].dt.year
merged_df["impression_week"] = merged_df["impression_time"].dt.week
merged_df["impression_day_week"] = merged_df["impression_time"].dt.dayofweek
merged_df["impression_hour"] = merged_df["impression_time"].dt.hour
merged_df["impression_minute"] = merged_df["impression_time"].dt.minute

In [11]:
le = LabelEncoder()
merged_df["app_code"] = le.fit_transform(merged_df["app_code"])
merged_df["os_version"] = le.fit_transform(merged_df["os_version"])

In [12]:
%time
merged_df["session_count"] = merged_df["user_id"].apply(lambda x: get_session_count(x))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 8.11 µs


In [13]:
del merged_df["impression_id"]
del merged_df["impression_time"]
del merged_df["user_id"]

In [15]:
train_df = merged_df.iloc[:train_df.shape[0],:]
test_df = merged_df.iloc[train_df.shape[0]:,:]
train_df.shape, test_df.shape, train_labels.shape

((237609, 10), (90675, 10), (237609,))

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train_df, train_labels,
                                                    stratify=train_labels,
                                                    test_size=0.15,
                                                    random_state=1234)

In [None]:
X_train.columns

In [17]:
alg = xgb.XGBClassifier(learning_rate=0.1,
                        n_estimators=140,
                        max_depth=7,
                        min_child_weight=3,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=1.0,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

In [18]:
cv_folds=10
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
# xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=50)
alg.set_params(n_estimators=cvresult.shape[0])
alg.fit(X_train, y_train,verbose=True, eval_metric='auc')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=0.2, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=3, missing=None,
       n_estimators=2, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.6)

In [19]:
val_pred = alg.predict_proba(X_test)[:, 1]
roc_auc_score(y_test,val_pred)

0.6829390531843915

In [20]:
alg = xgb.XGBClassifier(learning_rate=0.1,
                        n_estimators=140,
                        max_depth=7,
                        min_child_weight=3,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=1.0,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

In [21]:
#Train on entire data
cv_folds=10
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(train_df.values, label=train_labels.values)
# xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=50)
alg.set_params(n_estimators=cvresult.shape[0])
alg.fit(train_df, train_labels,verbose=True, eval_metric='auc')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=0.2, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=3, missing=None,
       n_estimators=3, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.6)

In [22]:
pred = alg.predict_proba(test_df)[:,1]
pred.shape

(90675,)

In [25]:
impr_id.shape

(90675,)

In [26]:
sub_df = pd.DataFrame(list(zip(impr_id,pred)), columns=["impression_id", "is_click"])

In [27]:
sub_df.to_csv("sub_05.csv", index=False)