In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier

In [2]:
view_df = pd.read_csv("../view_log.csv", parse_dates=["server_time"])
item_df = pd.read_csv("../item_data.csv")
train = pd.read_csv('../train.csv', parse_dates=['impression_time'])
test = pd.read_csv('../test.csv', parse_dates=['impression_time'])

merged_df = pd.merge(view_df, item_df, on='item_id', how='inner')

In [3]:
print("Number of UUs: %s", (merged_df['user_id'].nunique()))

('Number of UUs: %s', 89152)


In [4]:
merged_df.drop_duplicates(inplace=True)

In [5]:
merged_df["hour_of_day"] = merged_df["server_time"].dt.hour
merged_df["day_of_week"] = merged_df["server_time"].dt.dayofweek
merged_df["day_of_year"] = merged_df["server_time"].dt.dayofyear
merged_df["day_of_month"] = merged_df["server_time"].dt.day

In [6]:
hist_df = merged_df.groupby("user_id").agg({"session_id": "count", "item_id": "nunique", 
                                            "hour_of_day": ["mean", "std"], "day_of_week":["mean", "std"],
                                            "day_of_year": ["min", "max", "mean", "std"], "day_of_month": "mean",
                                            "item_price": ["min", "max", "mean", "std"]})

hist_df.columns = ["item_price_min", "item_price_max", "item_price_mean", "item_price_std", "total_sessions",
                    "dom_mean", "dow_mean", "dow_std", "doy_min", "doy_max", "doy_mean", "doy_std",
                    "hour_mean", "hour_std", "num_uniq_items"]
                   
hist_df = hist_df.reset_index(drop=False)
hist_df.head()

Unnamed: 0,user_id,item_price_min,item_price_max,item_price_mean,item_price_std,total_sessions,dom_mean,dow_mean,dow_std,doy_min,doy_max,doy_mean,doy_std,hour_mean,hour_std,num_uniq_items
0,0,332,92160,7905.837838,17958.103557,37,11.918919,4.0,2.054805,291,341,317.351351,15.468994,11.027027,2.743543,18
1,1,383,12595,3946.75,4412.414232,8,7.875,5.25,2.12132,308,343,334.375,15.972633,2.875,3.482097,8
2,2,128,281536,14289.184049,31436.352164,163,17.159509,2.705521,2.13707,289,343,309.257669,17.381198,17.343558,4.569599,130
3,3,537,16640,7257.375,7805.928863,8,19.0,4.0,0.0,292,292,292.0,0.0,15.625,0.517549,3
4,4,1977,58252,30114.5,39792.434111,2,7.0,4.0,0.0,341,341,341.0,0.0,23.0,0.0,2


In [7]:
merged_df.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id,item_price,category_1,category_2,category_3,product_type,hour_of_day,day_of_week,day_of_year,day_of_month
0,2018-10-15 08:58:00,android,112333,4557,32970,54685,16,56,253,3184,8,0,288,15
1,2018-10-15 09:36:00,android,783457,88320,32970,54685,16,56,253,3184,9,0,288,15
2,2018-10-15 10:59:00,android,6902,1711,32970,54685,16,56,253,3184,10,0,288,15
3,2018-10-15 11:31:00,android,61138,58906,32970,54685,16,56,253,3184,11,0,288,15
4,2018-10-15 12:03:00,android,441653,64221,32970,54685,16,56,253,3184,12,0,288,15


In [8]:
cat = merged_df.groupby(["user_id", "category_1"]).size().unstack().fillna(0).reset_index()
cat.columns = ['user_id'] + ['cat_1_' + str(i) for i in range(merged_df.category_1.nunique())]
cat.head()

Unnamed: 0,user_id,cat_1_0,cat_1_1,cat_1_2,cat_1_3,cat_1_4,cat_1_5,cat_1_6,cat_1_7,cat_1_8,cat_1_9,cat_1_10,cat_1_11,cat_1_12,cat_1_13,cat_1_14,cat_1_15,cat_1_16
0,0,0.0,6.0,0.0,1.0,0.0,5.0,7.0,0.0,0.0,0.0,0.0,0.0,2.0,7.0,1.0,1.0,7.0
1,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0
2,2,1.0,31.0,0.0,6.0,0.0,1.0,10.0,8.0,19.0,8.0,11.0,11.0,8.0,11.0,4.0,8.0,26.0
3,3,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
cat2 = merged_df.groupby(["user_id", "category_2"]).size().unstack().fillna(0).reset_index()
cat2.columns = ['user_id'] + ['cat_2_' + str(i) for i in range(merged_df.category_2.nunique())]
cat2.head()

Unnamed: 0,user_id,cat_2_0,cat_2_1,cat_2_2,cat_2_3,cat_2_4,cat_2_5,cat_2_6,cat_2_7,cat_2_8,...,cat_2_69,cat_2_70,cat_2_71,cat_2_72,cat_2_73,cat_2_74,cat_2_75,cat_2_76,cat_2_77,cat_2_78
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
2,2,0.0,1.0,0.0,4.0,3.0,0.0,0.0,3.0,8.0,...,2.0,0.0,0.0,0.0,3.0,0.0,3.0,1.0,2.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
cat3 = merged_df.groupby(["user_id", "category_3"]).size().unstack().fillna(0).reset_index()
cat3.columns = ['user_id'] + ['cat_3_' + str(i) for i in range(merged_df.category_3.nunique())]
cat3.head()

Unnamed: 0,user_id,cat_3_0,cat_3_1,cat_3_2,cat_3_3,cat_3_4,cat_3_5,cat_3_6,cat_3_7,cat_3_8,...,cat_3_325,cat_3_326,cat_3_327,cat_3_328,cat_3_329,cat_3_330,cat_3_331,cat_3_332,cat_3_333,cat_3_334
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
hod = merged_df.groupby(["user_id", "hour_of_day"]).size().unstack().fillna(0).reset_index()
hod.columns = ['user_id'] + ['hr_' + str(i) for i in range(merged_df.hour_of_day .nunique())]
hod.head()

Unnamed: 0,user_id,hr_0,hr_1,hr_2,hr_3,hr_4,hr_5,hr_6,hr_7,hr_8,...,hr_14,hr_15,hr_16,hr_17,hr_18,hr_19,hr_20,hr_21,hr_22,hr_23
0,0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,...,5.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,17.0,4.0,2.0,0.0,1.0,14.0,31.0,8.0,43.0,1.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [12]:
dow = merged_df.groupby(["user_id", "day_of_week"]).size().unstack().fillna(0).reset_index()
dow.columns = ['user_id'] + ['dow_' + str(i) for i in range(merged_df.day_of_week .nunique())]
dow.head()

Unnamed: 0,user_id,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,0,2.0,2.0,7.0,6.0,3.0,0.0,17.0
1,1,1.0,0.0,0.0,0.0,0.0,0.0,7.0
2,2,30.0,26.0,29.0,31.0,9.0,0.0,38.0
3,3,0.0,0.0,0.0,0.0,8.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [13]:
user_timespent = merged_df.groupby(["user_id", "server_time"]).size()
user_timespent.name = "user_time_counts"
user_timespent = user_timespent.reset_index()
user_timespent = user_timespent.groupby("user_id").agg({"user_time_counts": ["mean", "std", "max", "median", "skew", "rank"]})
user_timespent.columns = ["user_time_counts_mean", "user_time_counts_std", "user_time_counts_max",
                     "user_time_counts_median", "user_time_counts_skew", "user_time_counts_rank"]
user_timespent = user_timespent.reset_index()
user_timespent.fillna(0, inplace=True)
user_timespent.head()

Unnamed: 0,index,user_time_counts_mean,user_time_counts_std,user_time_counts_max,user_time_counts_median,user_time_counts_skew,user_time_counts_rank
0,0,1.0,0.0,1.0,1.0,0.0,19.0
1,1,1.142857,0.377964,2.0,1.0,2.645751,19.0
2,2,1.13986,0.386414,3.0,1.0,2.811489,19.0
3,3,1.333333,0.516398,2.0,1.0,0.968246,19.0
4,4,1.0,0.0,1.0,1.0,0.0,19.0


In [14]:
user_session = merged_df.groupby(["user_id", "session_id"]).size()
user_session.name = "user_session_counts"
user_session = user_session.reset_index()
user_session = user_session.groupby("user_id").agg({"user_session_counts": ["mean", "std", "max", "median", "skew", "rank"]})
user_session.columns = ["user_session_counts_mean", "user_session_counts_std", "user_session_counts_max",
                     "user_session_counts_median", "user_session_counts_skew", "user_session_counts_rank"]
user_session = user_session.reset_index()
user_session.fillna(0, inplace=True)
user_session.head()

Unnamed: 0,index,user_session_counts_mean,user_session_counts_std,user_session_counts_max,user_session_counts_median,user_session_counts_skew,user_session_counts_rank
0,0,3.363636,4.717472,17.0,2.0,2.871689,3.0
1,1,2.666667,2.886751,6.0,1.0,1.732051,11.0
2,2,4.405405,3.825786,16.0,3.0,1.567816,7.0
3,3,8.0,0.0,8.0,8.0,0.0,3.0
4,4,2.0,0.0,2.0,2.0,0.0,9.0


In [15]:
user_timespent.rename(columns={'index':'user_id'}, inplace=True)
user_session.rename(columns={'index':'user_id'}, inplace=True)

In [16]:
train["hour_of_day"] = train["impression_time"].dt.hour
train["day_of_week"] = train["impression_time"].dt.dayofweek
train["day_of_year"] = train["impression_time"].dt.dayofyear
train["day_of_month"] = train["impression_time"].dt.day

test["hour_of_day"] = test["impression_time"].dt.hour
test["day_of_week"] = test["impression_time"].dt.dayofweek
test["day_of_year"] = test["impression_time"].dt.dayofyear
test["day_of_month"] = test["impression_time"].dt.day

In [17]:
train.sort_values(by=['user_id', 'impression_time'], inplace=True)
train['Nth_impression'] = train.groupby('user_id').cumcount() + 1
train['time_since_last_ad'] = train.groupby('user_id')['impression_time'].diff()
train['time_since_last_ad'] = train['time_since_last_ad'].dt.total_seconds()

test.sort_values(by=['user_id', 'impression_time'], inplace=True)
test['Nth_impression'] = test.groupby('user_id').cumcount() + 1
test['time_since_last_ad'] = test.groupby('user_id')['impression_time'].diff()
test['time_since_last_ad'] = test['time_since_last_ad'].dt.total_seconds()

In [18]:
train_df = pd.merge(train, hist_df, how='left', on='user_id')
test_df = pd.merge(test, hist_df, how='left', on='user_id')

train_df = pd.merge(train_df, cat, how='left', on='user_id')
test_df = pd.merge(test_df, cat, how='left', on='user_id')

train_df = pd.merge(train_df, cat2, how='left', on='user_id')
test_df = pd.merge(test_df, cat2, how='left', on='user_id')

train_df = pd.merge(train_df, cat3, how='left', on='user_id')
test_df = pd.merge(test_df, cat3, how='left', on='user_id')

train_df = pd.merge(train_df, user_timespent, how='left', on='user_id')
test_df = pd.merge(test_df, user_timespent, how='left', on='user_id')

train_df = pd.merge(train_df, user_session, how='left', on='user_id')
test_df = pd.merge(test_df, user_session, how='left', on='user_id')

train_df = pd.merge(train_df, hod, how='left', on='user_id')
test_df = pd.merge(test_df, hod, how='left', on='user_id')

train_df = pd.merge(train_df, dow, how='left', on='user_id')
test_df = pd.merge(test_df, dow, how='left', on='user_id')

In [19]:
train_df.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,hour_of_day,day_of_week,day_of_year,...,hr_21,hr_22,hr_23,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,5f98c2c31a8006e510448c02ec74d50f,2018-11-26 23:30:00,0,207,old,0,0,23,0,330,...,0.0,0.0,0.0,2.0,2.0,7.0,6.0,3.0,0.0,17.0
1,2b12c0d47f5821a5adb3bfd973d0f708,2018-11-19 20:49:00,2,190,intermediate,0,0,20,0,323,...,8.0,43.0,1.0,30.0,26.0,29.0,31.0,9.0,0.0,38.0
2,f85e4bf0e34f3ee66add56229845e4db,2018-11-20 20:29:00,2,190,intermediate,0,0,20,1,324,...,8.0,43.0,1.0,30.0,26.0,29.0,31.0,9.0,0.0,38.0
3,010ed37e44e2fdc175b4c5c6c930805a,2018-11-20 20:53:00,2,190,intermediate,0,0,20,1,324,...,8.0,43.0,1.0,30.0,26.0,29.0,31.0,9.0,0.0,38.0
4,577c63f9937fa0e8d4650ddf1510a03f,2018-11-21 21:47:00,2,190,intermediate,0,0,21,2,325,...,8.0,43.0,1.0,30.0,26.0,29.0,31.0,9.0,0.0,38.0


In [20]:
feats = [c for c in train_df.columns if c not in ['impression_id', 'is_click', 'impression_time']]

In [21]:
feats

['user_id',
 'app_code',
 'os_version',
 'is_4G',
 'hour_of_day',
 'day_of_week',
 'day_of_year',
 'day_of_month',
 'Nth_impression',
 'time_since_last_ad',
 'item_price_min',
 'item_price_max',
 'item_price_mean',
 'item_price_std',
 'total_sessions',
 'dom_mean',
 'dow_mean',
 'dow_std',
 'doy_min',
 'doy_max',
 'doy_mean',
 'doy_std',
 'hour_mean',
 'hour_std',
 'num_uniq_items',
 'cat_1_0',
 'cat_1_1',
 'cat_1_2',
 'cat_1_3',
 'cat_1_4',
 'cat_1_5',
 'cat_1_6',
 'cat_1_7',
 'cat_1_8',
 'cat_1_9',
 'cat_1_10',
 'cat_1_11',
 'cat_1_12',
 'cat_1_13',
 'cat_1_14',
 'cat_1_15',
 'cat_1_16',
 'cat_2_0',
 'cat_2_1',
 'cat_2_2',
 'cat_2_3',
 'cat_2_4',
 'cat_2_5',
 'cat_2_6',
 'cat_2_7',
 'cat_2_8',
 'cat_2_9',
 'cat_2_10',
 'cat_2_11',
 'cat_2_12',
 'cat_2_13',
 'cat_2_14',
 'cat_2_15',
 'cat_2_16',
 'cat_2_17',
 'cat_2_18',
 'cat_2_19',
 'cat_2_20',
 'cat_2_21',
 'cat_2_22',
 'cat_2_23',
 'cat_2_24',
 'cat_2_25',
 'cat_2_26',
 'cat_2_27',
 'cat_2_28',
 'cat_2_29',
 'cat_2_30',
 'cat_2_31

In [22]:
train_df.fillna(0, inplace=True)
train_df.replace(np.inf, 0, inplace=True)
train_df.replace(-np.inf, 0, inplace=True)

test_df.fillna(0, inplace=True)
test_df.replace(np.inf, 0, inplace=True)
test_df.replace(-np.inf, 0, inplace=True)

In [23]:
train_df['is_test'] = False
test_df['is_test'] = True
full_df = train_df.append(test_df)
full_df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Nth_impression,app_code,cat_1_0,cat_1_1,cat_1_10,cat_1_11,cat_1_12,cat_1_13,cat_1_14,cat_1_15,...,user_session_counts_median,user_session_counts_rank,user_session_counts_skew,user_session_counts_std,user_time_counts_max,user_time_counts_mean,user_time_counts_median,user_time_counts_rank,user_time_counts_skew,user_time_counts_std
0,1,207,0.0,6.0,0.0,0.0,2.0,7.0,1.0,1.0,...,2.0,3.0,2.871689,4.717472,1.0,1.0,1.0,19.0,0.0,0.0
1,1,190,1.0,31.0,11.0,11.0,8.0,11.0,4.0,8.0,...,3.0,7.0,1.567816,3.825786,3.0,1.13986,1.0,19.0,2.811489,0.386414
2,2,190,1.0,31.0,11.0,11.0,8.0,11.0,4.0,8.0,...,3.0,7.0,1.567816,3.825786,3.0,1.13986,1.0,19.0,2.811489,0.386414
3,3,190,1.0,31.0,11.0,11.0,8.0,11.0,4.0,8.0,...,3.0,7.0,1.567816,3.825786,3.0,1.13986,1.0,19.0,2.811489,0.386414
4,4,190,1.0,31.0,11.0,11.0,8.0,11.0,4.0,8.0,...,3.0,7.0,1.567816,3.825786,3.0,1.13986,1.0,19.0,2.811489,0.386414


In [24]:
from scipy import stats

#http://www.jtrive.com/determining-histogram-bin-width-using-the-freedman-diaconis-rule.html
def freedman_diaconis(data, returnas="bins"):
    """
    Use Freedman Diaconis rule to compute optimal histogram bin width. 
    ``returnas`` can be one of "width" or "bins", indicating whether
    the bin width or number of bins should be returned respectively. 


    Parameters
    ----------
    data: np.ndarray
        One-dimensional array.

    returnas: {"width", "bins"}
        If "width", return the estimated width for each histogram bin. 
        If "bins", return the number of bins suggested by rule.
    """
    data = np.asarray(data, dtype=np.float64)
    IQR  = stats.iqr(data, rng=(25, 75), scale="raw", nan_policy="omit")
    N    = data.size
    bw   = (2 * IQR) / np.power(N, 1/3)

    if returnas=="width":
        result = bw
    else:
        datmin, datmax = data.min(), data.max()
        datrng = datmax - datmin
        result = int((datrng / bw) + 1)
    return result

In [25]:
cat_dict = {}
for idx, feat in enumerate(feats):
    try:
        if feat.startswith('cat') or feat.startswith('hr') or feat in ['user_time_counts_median'] or idx <=7:
            continue
        num_bins = freedman_diaconis(full_df[feat].values)
        cat_dict[feat] = num_bins
    except Exception as e:
        print feat, e
        cat_dict[feat] = 2

In [26]:
cat_dict

{'Nth_impression': 8,
 'dom_mean': 3,
 'dow_0': 29,
 'dow_1': 29,
 'dow_2': 26,
 'dow_3': 34,
 'dow_4': 26,
 'dow_5': 25,
 'dow_6': 23,
 'dow_mean': 3,
 'dow_std': 3,
 'doy_max': 16,
 'doy_mean': 13,
 'doy_min': 8,
 'doy_std': 3,
 'hour_mean': 3,
 'hour_std': 3,
 'item_price_max': 7,
 'item_price_mean': 31,
 'item_price_min': 368,
 'item_price_std': 10,
 'num_uniq_items': 12,
 'time_since_last_ad': 14,
 'total_sessions': 30,
 'user_session_counts_max': 11,
 'user_session_counts_mean': 23,
 'user_session_counts_median': 37,
 'user_session_counts_rank': 7,
 'user_session_counts_skew': 4,
 'user_session_counts_std': 9,
 'user_time_counts_max': 8,
 'user_time_counts_mean': 17,
 'user_time_counts_rank': 6,
 'user_time_counts_skew': 3,
 'user_time_counts_std': 6}

In [27]:
for k,v in cat_dict.iteritems():
    full_df[k], bins = pd.cut(full_df[k], v, retbins=True)

In [28]:
def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]

In [29]:
cat_keys = full_df[feats].dtypes[full_df[feats].dtypes != 'int64'][full_df[feats].dtypes != 'float64'].reset_index()['index'].tolist()

In [30]:
cat_keys

['os_version',
 'Nth_impression',
 'time_since_last_ad',
 'item_price_min',
 'item_price_max',
 'item_price_mean',
 'item_price_std',
 'total_sessions',
 'dom_mean',
 'dow_mean',
 'dow_std',
 'doy_min',
 'doy_max',
 'doy_mean',
 'doy_std',
 'hour_mean',
 'hour_std',
 'num_uniq_items',
 'user_time_counts_mean',
 'user_time_counts_std',
 'user_time_counts_max',
 'user_time_counts_skew',
 'user_time_counts_rank',
 'user_session_counts_mean',
 'user_session_counts_std',
 'user_session_counts_max',
 'user_session_counts_median',
 'user_session_counts_skew',
 'user_session_counts_rank',
 'dow_0',
 'dow_1',
 'dow_2',
 'dow_3',
 'dow_4',
 'dow_5',
 'dow_6']

In [31]:
cat_feats = [0,1,3,4,5,6,7] + list(column_index(train_df[feats], cat_keys))

In [32]:
sorted(cat_feats)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 456,
 457,
 458,
 460,
 461,
 462,
 463,
 464,
 465,
 466,
 467,
 492,
 493,
 494,
 495,
 496,
 497,
 498]

In [33]:
from sklearn.preprocessing import LabelEncoder
for feat in cat_keys:
    le = LabelEncoder()
    le.fit(full_df[feat])
    full_df[feat] = le.transform(full_df[feat])

In [34]:
train_df = full_df.loc[full_df['is_test'] == False]
test_df = full_df.loc[full_df['is_test'] == True]

In [35]:
# mask = (train_df['impression_time'] > '2018-12-09')
# dtrain = train_df.loc[~mask]
# val = train_df.loc[mask]
target = 'is_click'

In [36]:
# y_tr = dtrain[target].values
# y_val = val[target].values
# X_tr, X_val = dtrain[feats], val[feats]

In [37]:
cb_model = CatBoostClassifier(iterations=604,
                             learning_rate=0.02,
                             depth=10,
                             eval_metric='AUC',
                             random_seed = 13,
                             bagging_temperature = 0.3,
                             od_type='Iter',
                             metric_period = 10,
                             od_wait=604,
                             l2_leaf_reg=13)

In [38]:
cb_model.fit(train_df[feats], train_df[target].values,
             eval_set=(train_df[feats], train_df[target].values),
             cat_features=cat_feats,
             verbose=True)



0:	test: 0.5278416	best: 0.5278416 (0)	total: 5.36s	remaining: 53m 52s
10:	test: 0.5588622	best: 0.5590214 (8)	total: 42.2s	remaining: 37m 54s
20:	test: 0.6605452	best: 0.6610830 (17)	total: 1m	remaining: 28m 5s
30:	test: 0.6964670	best: 0.6964670 (30)	total: 1m 22s	remaining: 25m 24s
40:	test: 0.6996729	best: 0.6996729 (40)	total: 2m 1s	remaining: 27m 42s
50:	test: 0.7006149	best: 0.7006149 (50)	total: 2m 29s	remaining: 27m 1s
60:	test: 0.7026471	best: 0.7027038 (58)	total: 3m 6s	remaining: 27m 40s
70:	test: 0.7069388	best: 0.7069388 (70)	total: 3m 29s	remaining: 26m 9s
80:	test: 0.7123677	best: 0.7123677 (80)	total: 4m 8s	remaining: 26m 47s
90:	test: 0.7177096	best: 0.7177096 (90)	total: 4m 44s	remaining: 26m 44s
100:	test: 0.7558764	best: 0.7558764 (100)	total: 5m 24s	remaining: 26m 54s
110:	test: 0.8094551	best: 0.8094551 (110)	total: 6m 11s	remaining: 27m 31s
120:	test: 0.8412920	best: 0.8412920 (120)	total: 6m 55s	remaining: 27m 39s
130:	test: 0.8645270	best: 0.8645270 (130)	tota

<catboost.core.CatBoostClassifier at 0x7f6eae258c90>

In [39]:
pred1 = cb_model.predict_proba(test_df[feats])[:,1:]
sub = pd.DataFrame()
sub['impression_id'] = test['impression_id']
sub['is_click'] = pred1
sub.head()

Unnamed: 0,impression_id,is_click
5290,ccf6d380a63293580f2247d840fca638,0.014422
46094,2a1b2179f709dc95fb4d819a8f3eb80d,0.013109
44940,159e11927eab144e1d24e3255978f111,0.013109
18646,e5e233880ea8d2a06943790dc3d37463,0.013241
38746,c80ea471e47bbbd551543ca8c0e102ea,0.04637


In [40]:
sub[['impression_id', 'is_click']].to_csv('cb-most-feats-complete-v5-ultimate.csv', index=False)