In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from datetime import datetime
from tqdm import tqdm

In [2]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [3]:
train_df = pd.read_csv("./train.csv")

In [4]:
log_df = pd.read_csv("./view_log.csv")

In [5]:
item_df = pd.read_csv("./item_data.csv")

In [6]:
test_df = pd.read_csv("./test.csv")

In [7]:
# log_item_df = pd.merge(log_df, item_df,on="item_id")

In [8]:
train_df.columns, log_df.columns, item_df.columns

(Index(['impression_id', 'impression_time', 'user_id', 'app_code', 'os_version',
        'is_4G', 'is_click'],
       dtype='object'),
 Index(['server_time', 'device_type', 'session_id', 'user_id', 'item_id'], dtype='object'),
 Index(['item_id', 'item_price', 'category_1', 'category_2', 'category_3',
        'product_type'],
       dtype='object'))

In [9]:
num_sessions = log_df.groupby("user_id").size().reset_index().rename(columns={0:'count'})
def get_session_count(x):
    #print(x)
    _x = list(num_sessions[num_sessions["user_id"]==x]["count"])[0]
    return _x

In [10]:
num_unique_sessions = log_df.groupby("user_id").session_id.nunique().reset_index().rename(columns={'session_id':'count'})
def get_num_unique_sessions(x):
    _x = list(num_unique_sessions[num_unique_sessions["user_id"]==x]['count'])[0]
    return _x

In [11]:
num_unique_item = log_df.groupby("user_id").item_id.nunique().reset_index().rename(columns={'item_id':'count'})
def get_num_unique_item(x):
    _x = list(num_unique_sessions[num_unique_sessions["user_id"]==x]['count'])[0]
    return _x

In [12]:
merged_df = pd.concat([train_df, test_df], axis=0)
merged_df.shape, train_df.shape, test_df.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


((328284, 7), (237609, 7), (90675, 6))

In [13]:
impr_id = test_df["impression_id"]

In [14]:
train_labels = train_df["is_click"]


In [15]:
merged_df["impression_time"] = pd.to_datetime(merged_df["impression_time"])
merged_df["impression_day"] = merged_df["impression_time"].dt.day
merged_df["impression_month"] = merged_df["impression_time"].dt.month
# merged_df["impression_year"] = merged_df["impression_time"].dt.year
merged_df["impression_week"] = merged_df["impression_time"].dt.week
merged_df["impression_day_week"] = merged_df["impression_time"].dt.dayofweek
merged_df["impression_hour"] = merged_df["impression_time"].dt.hour
merged_df["impression_minute"] = merged_df["impression_time"].dt.minute

In [16]:
le = LabelEncoder()
merged_df["app_code"] = le.fit_transform(merged_df["app_code"])

In [17]:
# merged_df["os_version"] = le.fit_transform(merged_df["os_version"])


In [18]:
merged_df["session_count"] = merged_df["user_id"].apply(lambda x: get_session_count(x))

In [19]:
merged_df["unique_session"] = merged_df["user_id"].apply(lambda x: get_num_unique_sessions(x))

In [20]:
merged_df["unique_item"] = merged_df["user_id"].apply(lambda x: get_num_unique_item(x))

In [21]:
merged_df.columns, merged_df.shape

(Index(['app_code', 'impression_id', 'impression_time', 'is_4G', 'is_click',
        'user_id', 'impression_day', 'impression_month', 'impression_week',
        'impression_day_week', 'impression_hour', 'impression_minute',
        'session_count', 'unique_session', 'unique_item'],
       dtype='object'), (328284, 15))

In [22]:
merged_df.to_csv("./merged_step_1.csv", index=False)

In [113]:
merged_df = pd.read_csv("./merged_step_1.csv")
merged_df.columns

Index(['app_code', 'impression_id', 'impression_time', 'is_4G', 'is_click',
       'user_id', 'impression_day', 'impression_month', 'impression_week',
       'impression_day_week', 'impression_hour', 'impression_minute',
       'session_count', 'unique_session', 'unique_item'],
      dtype='object')

In [114]:
train_df_shape = (237609,7)

In [115]:
train_df = merged_df.iloc[:train_df_shape[0],:]
test_df = merged_df.iloc[train_df_shape[0]:,:]
merged_df.shape, train_df.shape, test_df.shape

((328284, 15), (237609, 15), (90675, 15))

In [116]:
df1 = pd.merge(train_df,log_df, on="user_id")
df1.shape

(11052882, 19)

In [117]:
df1["impression_date"] = pd.to_datetime(df1["impression_time"]).dt.date
df1["server_date"] = pd.to_datetime(df1["server_time"]).dt.date

In [118]:
df1["diff"] = (df1["impression_date"]-df1["server_date"]).dt.days
df1 = df1[df1["diff"]>0]

In [119]:
df1 = df1.loc[df1.groupby(["impression_time", "user_id"])["diff"].idxmin()]

In [120]:
df1.reset_indext_indext_index(inplace=True, drop=True)

In [121]:
train_df.shape, df1.shape

((237609, 15), (231477, 22))

In [122]:
df2 = pd.merge(test_df,log_df, on="user_id")

In [123]:
df2["impression_date"] = pd.to_datetime(df2["impression_time"]).dt.date
df2["server_date"] = pd.to_datetime(df2["server_time"]).dt.date

In [124]:
df2["diff"] = (df2["impression_date"] - df2["server_date"]).dt.days
df2 = df2[df2["diff"]>0]

In [125]:
df2 = df2.loc[df2.groupby(["impression_time", "user_id"])["diff"].idxmin()]

In [126]:
df2.reset_index(inplace=True, drop=True)

In [127]:
test_df.shape, df2.shape

((90675, 15), (90373, 22))

In [128]:
id_not_present_in_test = list(set(list(test_df.impression_id)) - set(list(df2.impression_id)))

In [129]:
# item_price = log_item_df.groupby("user_id").item_price.mean().reset_index()
len(id_not_present_in_test)

302

In [130]:
merged_step_2_df = pd.concat([df1, df2], axis=0)
merged_step_2_df.shape, df1.shape, df2.shape

((321850, 22), (231477, 22), (90373, 22))

In [131]:
merged_step_2_df.columns

Index(['app_code', 'impression_id', 'impression_time', 'is_4G', 'is_click',
       'user_id', 'impression_day', 'impression_month', 'impression_week',
       'impression_day_week', 'impression_hour', 'impression_minute',
       'session_count', 'unique_session', 'unique_item', 'server_time',
       'device_type', 'session_id', 'item_id', 'impression_date',
       'server_date', 'diff'],
      dtype='object')

In [132]:
merged_step_2_df = pd.merge(merged_step_2_df, item_df, on="item_id", how="left")

In [133]:
set(list(merged_step_2_df.item_id.values)) - set(list(item_df.item_id.values))

{8236,
 13054,
 19919,
 26822,
 35796,
 44868,
 62896,
 63773,
 72294,
 79532,
 91436,
 103132,
 105071,
 114373,
 122449,
 127479}

In [134]:
merged_step_2_df.shape, df1.shape, df2.shape

((321850, 27), (231477, 22), (90373, 22))

In [135]:
# merged_step_2_df.isnull().any()

In [136]:
merged_step_2_df.item_price.fillna(2944,inplace=True)
merged_step_2_df.category_1.fillna(9,inplace=True)
merged_step_2_df.category_2.fillna(38,inplace=True)
merged_step_2_df.category_3.fillna(62,inplace=True)
merged_step_2_df.product_type.fillna(2874,inplace=True)

In [137]:
train_df.shape, train_df.columns, df1.shape, df1.columns

((237609, 15),
 Index(['app_code', 'impression_id', 'impression_time', 'is_4G', 'is_click',
        'user_id', 'impression_day', 'impression_month', 'impression_week',
        'impression_day_week', 'impression_hour', 'impression_minute',
        'session_count', 'unique_session', 'unique_item'],
       dtype='object'),
 (231477, 22),
 Index(['app_code', 'impression_id', 'impression_time', 'is_4G', 'is_click',
        'user_id', 'impression_day', 'impression_month', 'impression_week',
        'impression_day_week', 'impression_hour', 'impression_minute',
        'session_count', 'unique_session', 'unique_item', 'server_time',
        'device_type', 'session_id', 'item_id', 'impression_date',
        'server_date', 'diff'],
       dtype='object'))

In [138]:
temp_df = pd.merge(df1, train_df, on="impression_id", how="left")
_train_labels = temp_df["is_click_x"]

In [139]:
merged_step_2_df.columns

Index(['app_code', 'impression_id', 'impression_time', 'is_4G', 'is_click',
       'user_id', 'impression_day', 'impression_month', 'impression_week',
       'impression_day_week', 'impression_hour', 'impression_minute',
       'session_count', 'unique_session', 'unique_item', 'server_time',
       'device_type', 'session_id', 'item_id', 'impression_date',
       'server_date', 'diff', 'item_price', 'category_1', 'category_2',
       'category_3', 'product_type'],
      dtype='object')

In [140]:
del merged_step_2_df["impression_id"]
del merged_step_2_df["impression_time"]
del merged_step_2_df["user_id"]
del merged_step_2_df["session_id"]
del merged_step_2_df["impression_date"]
del merged_step_2_df["server_date"]
# del merged_step_2_df["diff"]
del merged_step_2_df["server_time"]
del merged_step_2_df["device_type"]
del merged_step_2_df["is_click"]
# del merged_step_2_df["os_version"]

In [141]:
del merged_step_2_df["item_id"]

In [200]:
labels_to_encode = ["category_1", "category_2", "category_3", "product_type"]
for l in labels_to_encode:
    le = LabelEncoder()
    merged_step_2_df[l] = le.fit_transform(merged_step_2_df[l])

In [201]:
merged_step_2_df.reset_index(drop=True,inplace=True)

In [202]:
_df1 = merged_step_2_df.iloc[:df1.shape[0],:]
_df2 = merged_step_2_df.iloc[df1.shape[0]:,:]
merged_step_2_df.shape, _df1.shape, _df2.shape, _train_labels.shape

((321850, 16), (231477, 16), (90373, 16), (231477,))

In [167]:
_df1.tail()

Unnamed: 0,app_code,is_4G,impression_day,impression_month,impression_week,impression_day_week,impression_hour,impression_minute,session_count,unique_session,unique_item,item_price,category_1,category_2,category_3,product_type
231472,207,1,13,12,50,3,7,40,134,18,18,6400.0,14.0,61.0,132.0,9842.0
231473,207,0,13,12,50,3,7,41,94,24,24,21696.0,1.0,7.0,308.0,4986.0
231474,289,0,13,12,50,3,7,42,1,1,1,844.0,11.0,70.0,322.0,4129.0
231475,335,1,13,12,50,3,7,43,84,24,24,18115.0,8.0,40.0,84.0,5036.0
231476,384,0,13,12,50,3,7,43,120,37,37,3955.0,4.0,38.0,62.0,8121.0


In [166]:
_train_labels.tail()

231472    0.0
231473    0.0
231474    0.0
231475    0.0
231476    1.0
Name: is_click_x, dtype: float64

In [146]:
# X_train, X_test, y_train, y_test = train_test_split(df1, _train_labels,
#                                                     stratify=_train_labels,
#                                                     test_size=0.10,
#                                                     random_state=1234)

In [147]:
# X_train.columns

In [203]:
params = {
        'learning_rate':[0.02, 0.05, 0.07, 0.1],
        'min_child_weight': [3, 5, 7, 10],
        'gamma': [0.2, 0.5, 1, 1.5, 2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 7]
        }

In [204]:
# alg = xgb.XGBClassifier(learning_rate=0.1,
#                         n_estimators=140,
#                         max_depth=7,
#                         min_child_weight=3,
#                         gamma=0.2,
#                         subsample=0.6,
#                         colsample_bytree=1.0,
#                         objective='binary:logistic',
#                         nthread=4,
#                         scale_pos_weight=1,
#                         seed=27)
clf = xgb.XGBClassifier(n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=2)

In [205]:
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(clf, param_distributions=params, n_iter=param_comb,
                                   scoring='roc_auc', n_jobs=4, cv=skf.split(_df1,_train_labels),
                                   verbose=3, random_state=1001 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(_df1, _train_labels)
timer(start_time)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:  9.6min finished



 Time taken: 0 hours 10 minutes and 41.3 seconds.


In [206]:
print(random_search.best_params_)

{'subsample': 0.8, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.5, 'colsample_bytree': 0.8}


In [207]:
# print(random_search.cv_results_)

In [208]:
# cv_folds=10
# xgb_param = alg.get_xgb_params()
# xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
# # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
# cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
#                           early_stopping_rounds=50)
# alg.set_params(n_estimators=cvresult.shape[0])
# alg.fit(X_train, y_train,verbose=True, eval_metric='auc')

In [209]:
# val_pred = alg.predict_proba(X_test)[:, 1]
# roc_auc_score(y_test,val_pred)

In [210]:
tuned_clf = xgb.XGBClassifier(learning_rate=0.05,
                        n_estimators=600,
                        max_depth=4,
                        min_child_weight=3,
                        gamma=0.5,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

In [211]:
#Train on entire data
# cv_folds=10
# xgb_param = alg.get_xgb_params()
# xgtrain = xgb.DMatrix(train_df.values, label=train_labels.values)
# # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
# cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
#                           early_stopping_rounds=50)
# alg.set_params(n_estimators=cvresult.shape[0])
tuned_clf.fit(_df1, _train_labels,verbose=True, eval_metric='auc')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.5, learning_rate=0.05,
       max_delta_step=0, max_depth=4, min_child_weight=3, missing=None,
       n_estimators=600, n_jobs=1, nthread=4, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8)

In [212]:
pred = tuned_clf.predict_proba(_df2)[:,1]
pred.shape

(90373,)

In [213]:
pred.mean()

0.04552915

In [214]:
df2.shape

(90373, 22)

In [215]:
impr_id.shape

(90675,)

In [216]:
sub_df = pd.DataFrame(list(zip(df2.impression_id,pred)), columns=["impression_id", "is_click"])

In [217]:
df_with_missing_impr_id = pd.DataFrame(id_not_present_in_test, columns=["impression_id"])
df_with_missing_impr_id["is_click"] = 0.04552915

In [218]:
df_with_missing_impr_id.shape

(302, 2)

In [219]:
_sub_df = sub_df.append(df_with_missing_impr_id)

In [220]:
_sub_df.shape

(90675, 2)

In [221]:
_sub_df.to_csv("sub_15.csv", index=False)