In [80]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import Counter
import holidays
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
from catboost import Pool, CatBoostClassifier

In [3]:
#### Read the training tables: labelled rows, the view log and the item metadata.
# The three files share one folder, so the location is kept in a single constant;
# edit TRAIN_DIR to point the notebook at another copy of the data.
TRAIN_DIR = "/Users/s0c02nj/Desktop/WNS2/train_NA17Sgz"

df_train = pd.read_csv(TRAIN_DIR + "/train.csv")
df_log = pd.read_csv(TRAIN_DIR + "/view_log.csv")
df_item = pd.read_csv(TRAIN_DIR + "/item_data.csv")

In [4]:
# Read the test rows; they are concatenated with the training rows further down.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df_test = pd.read_csv("/Users/s0c02nj/Desktop/WNS2/test.csv")

In [5]:
# Attach item metadata to every view-log row; log rows whose item_id has no
# metadata are kept (left join) and get NaN in the item columns.
df_log_item = df_log.merge(df_item, how='left', on='item_id')

In [6]:
# Columns of the joined log that are turned into integer codes.
cat_cols = [
    'device_type',
    'category_1',
    'category_2',
    'category_3',
    'product_type',
    'item_id',
]

for col in cat_cols:
    # A fresh encoder per column; values are stringified first, so missing
    # values are encoded as the literal string "nan".
    le = preprocessing.LabelEncoder()
    col_as_text = df_log_item[col].astype(str)
    df_log_item[col] = le.fit_transform(col_as_text)

In [7]:
##### Server-time calendar features: ISO week, month, day of month, hour, weekend flag
df_log_item['server_time'] = pd.to_datetime(df_log_item['server_time'])

# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the replacement. It returns a nullable UInt32
# column, so cast back to int64 to keep the dtype the old accessor produced.
# Assumes every server_time parsed to a valid timestamp (no NaT) — TODO confirm.
df_log_item['server_wk'] = df_log_item['server_time'].dt.isocalendar().week.astype('int64')
df_log_item['server_mth'] = df_log_item['server_time'].dt.month
df_log_item['server_day'] = df_log_item['server_time'].dt.day
df_log_item['server_hr'] = df_log_item['server_time'].dt.hour
# weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday.
df_log_item['server_weekend'] = (df_log_item['server_time'].dt.weekday >= 5).astype(int)

In [8]:
### Log-transform the item price, then fill missing prices with the median
# np.log1p works on the whole column at once; no per-element lambda is needed.
df_log_item['item_price'] = np.log1p(df_log_item['item_price'])
# The median is taken on the log scale and skips NaN.
price_med = df_log_item['item_price'].median()
# fillna states the intent directly (the earlier replace(np.nan, ..., regex=True)
# carried a regex flag that has no meaning for a NaN pattern).
df_log_item['item_price'] = df_log_item['item_price'].fillna(price_med)

In [9]:
# Category crosses: the encoded category codes are stringified and concatenated.
# NOTE(review): there is no separator between the parts, so different code
# combinations can produce the same string (e.g. "1"+"23" and "12"+"3").
cat_1 = df_log_item["category_1"].astype("str")
cat_2 = df_log_item["category_2"].astype("str")
cat_3 = df_log_item["category_3"].astype("str")

df_log_item["cat1_cat2"] = cat_1 + cat_2
df_log_item["cat1_cat3"] = cat_1 + cat_3
df_log_item["cat2_cat3"] = cat_2 + cat_3
df_log_item["cat1_cat2_cat3"] = cat_1 + cat_2 + cat_3


In [10]:
def q3(x):
    """Return the 75th percentile (upper quartile) of the values in ``x``."""
    upper_quartile = np.percentile(x, q=75)
    return upper_quartile

def q1(x):
    """Return the 25th percentile (lower quartile) of the values in ``x``."""
    lower_quartile = np.percentile(x, q=25)
    return lower_quartile

def mode(x):
    """Return the most frequent value in ``x``.

    Raises IndexError when ``x`` is empty, because there is no entry to pick.
    """
    top_value, _occurrences = Counter(x).most_common(1)[0]
    return top_value

def mode_1(x):
    """Return the second most frequent value in ``x``.

    When ``x`` holds fewer than two distinct values there is no runner-up and
    the sentinel ``999999`` is returned instead.

    Only that "no runner-up" case maps to the sentinel. Other failures (for
    example unhashable elements) propagate, as they do in ``mode``, instead of
    being hidden by a bare ``except`` that would also swallow KeyboardInterrupt.
    """
    ranked = Counter(x).most_common(2)
    if len(ranked) < 2:
        return 999999
    return ranked[1][0]

In [11]:
# Aggregation spec: source column -> list of statistics computed per group.
# Key order is kept as-is because it determines the order of the output columns.
aggs = {
    'item_price': ['median', 'mean', 'sum', 'max', 'min', 'var', q1, q3, mode, mode_1],
    'device_type': ['count', 'nunique', mode, mode_1],
    'user_id': ['count', 'nunique'],
    'session_id': ['count', 'nunique'],
}

# Categorical columns: distinct count, most common and second most common value.
aggs.update({
    name: ['nunique', mode, mode_1]
    for name in (
        'item_id',
        'category_1',
        'category_2',
        'category_3',
        'cat1_cat2',
        'cat1_cat3',
        'cat2_cat3',
        'cat1_cat2_cat3',
        'product_type',
    )
})

# Calendar parts of the server time additionally get their mean ...
aggs.update({
    name: ['nunique', mode, mode_1, 'mean']
    for name in ('server_wk', 'server_mth', 'server_day', 'server_hr')
})
# ... while the 0/1 weekend flag is summed (number of weekend views).
aggs['server_weekend'] = ['nunique', mode, mode_1, 'sum']

In [12]:
# Set the target aside, then stack train and test rows so that features are
# built once for both. Train rows come first; the split back relies on that.
# NOTE: re-running this cell fails, since df_train no longer has "is_click".
is_click = df_train["is_click"]
df_train = df_train.drop(columns=["is_click"])
df_combined = pd.concat([df_train, df_test], axis=0)

In [13]:
# Aggregate the view log per grouping key and join the statistics onto the
# combined train/test rows. Only "user_id" is used as a key; df_final is
# overwritten on every pass, so adding more keys would keep only the last one.
for col in tqdm(["user_id"]):

    # Shallow copy of the spec without the grouping key itself.
    aggs_temp = dict(aggs)
    del aggs_temp[col]

    agg_df = df_log_item.groupby(col).agg(aggs_temp).reset_index()
    # Flatten the (source column, statistic) pairs to "<key>_<column>_<statistic>".
    agg_df.columns = [col] + [
        '_'.join([col, source, stat]) for source, stat in agg_df.columns[1:]
    ]

    df_final = df_combined.merge(agg_df, how='left', on=col)
    

100%|██████████| 1/1 [02:17<00:00, 137.10s/it]


In [15]:
# Inspect the column names produced by the per-user aggregation merge.
df_final.columns

Index(['impression_id', 'impression_time', 'user_id', 'app_code', 'os_version',
       'is_4G', 'user_id_item_price_median', 'user_id_item_price_mean',
       'user_id_item_price_sum', 'user_id_item_price_max',
       'user_id_item_price_min', 'user_id_item_price_var',
       'user_id_item_price_q1', 'user_id_item_price_q3',
       'user_id_item_price_mode', 'user_id_item_price_mode_1',
       'user_id_device_type_count', 'user_id_device_type_nunique',
       'user_id_device_type_mode', 'user_id_device_type_mode_1',
       'user_id_session_id_count', 'user_id_session_id_nunique',
       'user_id_item_id_nunique', 'user_id_item_id_mode',
       'user_id_item_id_mode_1', 'user_id_category_1_nunique',
       'user_id_category_1_mode', 'user_id_category_1_mode_1',
       'user_id_category_2_nunique', 'user_id_category_2_mode',
       'user_id_category_2_mode_1', 'user_id_category_3_nunique',
       'user_id_category_3_mode', 'user_id_category_3_mode_1',
       'user_id_cat1_cat2_nunique', 

In [23]:
# Per-user price statistics that are treated as continuous features.
# The median column is not part of this list.
cont_cols = [
    'user_id_item_price_' + stat
    for stat in ('mean', 'sum', 'max', 'min', 'var', 'q1', 'q3', 'mode', 'mode_1')
]

In [24]:
# Calendar features of the impression timestamp.
df_final['impression_time'] = pd.to_datetime(df_final['impression_time'])

# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the replacement. It returns a nullable UInt32
# column, so cast back to int64 to keep the dtype the old accessor produced.
# Assumes every impression_time parsed to a valid timestamp (no NaT) — TODO confirm.
df_final['impression_wk'] = df_final['impression_time'].dt.isocalendar().week.astype('int64')
df_final['impression_mth'] = df_final['impression_time'].dt.month
df_final['impression_day'] = df_final['impression_time'].dt.day
df_final['impression_hr'] = df_final['impression_time'].dt.hour

# weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday.
df_final['impression_weekend'] = (df_final['impression_time'].dt.weekday >= 5).astype(int)

In [26]:
# Additive interaction features: each base column plus the user's mean
# (log) item price.
price_mean = df_final['user_id_item_price_mean']

for new_col, base_col in [
    ('user_time_int1', 'impression_hr'),
    ('user_time_int2', 'impression_day'),
    ('session_cat_int1', 'user_id_session_id_nunique'),
    ('app_price_int', 'app_code'),
]:
    df_final[new_col] = df_final[base_col] + price_mean


In [30]:
# Read the precomputed user-embedding table; it is joined on user_id below.
# NOTE(review): absolute, machine-specific path; how the file was produced is
# not shown in this notebook.
df_embed = pd.read_csv('/Users/s0c02nj/Desktop/WNS2/golden_feats/user_embed_20.csv')

In [32]:
# Join the user embeddings onto the feature frame; rows without an embedding
# are kept (left join) and get NaN in the embedding columns.
col = 'user_id'
df_final1 = df_final.merge(df_embed, how='left', on=col)

In [33]:
# Remove the identifier and raw-timestamp columns from the feature frame.
df_final = df_final.drop(columns=['impression_id', 'impression_time', 'user_id'])

In [34]:
# Same clean-up for the embedding-augmented frame.
df_final1 = df_final1.drop(columns=['impression_id', 'impression_time', 'user_id'])

In [35]:
# Frequency encoding: for every column of df_final, add "<column>_count" to
# df_final1 holding how often the row's value occurs in that column
# (train and test rows together).
col_counts = []

for col in tqdm(df_final.columns):

    # Occurrences are tallied on df_final and looked up for df_final1's values;
    # a value missing from the tally yields 0.
    counter = Counter(df_final[col])
    col_counts.append('{}_count'.format(col))

    df_final1['{}_count'.format(col)] = df_final1[col].map(lambda value: counter[value])

100%|██████████| 75/75 [00:33<00:00,  5.35it/s]


In [37]:
# Continue with the augmented frame. This is a plain rebinding, not a copy:
# df_final and df_final1 now refer to the same DataFrame object.
df_final = df_final1

In [38]:
# Collect (and print) the columns that are still stored with object dtype.
cat_appended_cols = []
for col, col_dtype in df_final.dtypes.items():
    if col_dtype == "object":
        cat_appended_cols.append(col)
        print(col , col_dtype)

user_id_cat1_cat2_mode object
user_id_cat1_cat2_mode_1 object
user_id_cat1_cat3_mode object
user_id_cat1_cat3_mode_1 object
user_id_cat2_cat3_mode object
user_id_cat2_cat3_mode_1 object
user_id_cat1_cat2_cat3_mode object
user_id_cat1_cat2_cat3_mode_1 object


In [39]:
# Second frequency-encoding pass, with the same logic as the earlier loop.
# df_final was rebound to df_final1 above, so the columns iterated now include
# the embedding columns and the earlier "_count" columns. The progress bar
# reports 170 columns, and each gets its own "<column>_count".
# NOTE(review): confirm that count-of-count features are intended.
col_counts = []

for col in tqdm(df_final.columns):

    # A value missing from the tally yields 0.
    counter = Counter(df_final[col])
    col_counts.append('{}_count'.format(col))

    df_final1['{}_count'.format(col)] = df_final1[col].map(lambda value: counter[value])

100%|██████████| 170/170 [00:43<00:00,  7.33it/s]


In [44]:
# Plain rebinding (no copy): df_final refers to the same object as df_final1.
df_final = df_final1

In [45]:
# Re-collect (and print) the object-dtype columns after the second count pass.
cat_appended_cols = []
for col, col_dtype in df_final.dtypes.items():
    if col_dtype == "object":
        cat_appended_cols.append(col)
        print(col , col_dtype)

user_id_cat1_cat2_mode object
user_id_cat1_cat2_mode_1 object
user_id_cat1_cat3_mode object
user_id_cat1_cat3_mode_1 object
user_id_cat2_cat3_mode object
user_id_cat2_cat3_mode_1 object
user_id_cat1_cat2_cat3_mode object
user_id_cat1_cat2_cat3_mode_1 object


In [46]:
# Integer label encoding for the app/OS columns and the object-dtype mode columns.
cat_cols_train = ['app_code', 'os_version'] + cat_appended_cols

for col in cat_cols_train:
    print(col)
    le = preprocessing.LabelEncoder()
    # Values are cast to int before encoding.
    # NOTE(review): for the concatenated category strings the cast drops leading
    # zeros (e.g. "05" becomes 5), and it raises if a column holds NaN.
    col_as_int = df_final[col].astype(int)
    df_final[col] = le.fit_transform(col_as_int)

app_code
os_version
user_id_cat1_cat2_mode
user_id_cat1_cat2_mode_1
user_id_cat1_cat3_mode
user_id_cat1_cat3_mode_1
user_id_cat2_cat3_mode
user_id_cat2_cat3_mode_1
user_id_cat1_cat2_cat3_mode
user_id_cat1_cat2_cat3_mode_1


In [47]:
# Full list of continuous columns: the price statistics plus cont_cols2.
# NOTE(review): cont_cols2 is not defined in any cell visible in this notebook;
# it presumably came from a cell that was since removed. Define it before this
# cell, or a fresh-kernel run stops here with NameError.
cont_cols_total = cont_cols + list(cont_cols2)

In [48]:
# Every column that is not listed as continuous is treated as categorical.
cat_cols_total = [name for name in df_final.columns if name not in cont_cols_total]

In [84]:
# Label-encode every categorical column on its string representation.
for col in tqdm(cat_cols_total):
    # Reach the encoder through the imported `preprocessing` module: the bare
    # name `LabelEncoder` is not imported in the import cell, so using it
    # raises NameError on a fresh kernel.
    le = preprocessing.LabelEncoder()
    df_final[col] = le.fit_transform(df_final[col].astype(str))

100%|██████████| 236/236 [00:52<00:00,  4.70it/s]


In [85]:
# Split the stacked frame back by position: the first len(df_train) rows are
# the training rows, the remainder are the test rows.
n_train = df_train.shape[0]
x_train = df_final.iloc[:n_train]
x_test = df_final.iloc[n_train:]
y = is_click

In [86]:
# Stratified 80/20 hold-out split with a fixed seed, so both parts keep the
# click rate of the full training set.
x_t, x_v, y_t, y_v = train_test_split(
    x_train,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [87]:
# CatBoost classifier configuration. use_best_model=True is meant to limit
# overfitting by keeping the model at its best evaluation-set iteration
# (per the CatBoost docs it needs an eval_set at fit time).
catboost_params = dict(
    iterations=500,
    learning_rate=0.01,
    l2_leaf_reg=3.5,
    depth=8,
    rsm=0.98,
    loss_function='Logloss',
    eval_metric='AUC',
    use_best_model=True,
    random_seed=42,
)
model = CatBoostClassifier(**catboost_params)

In [88]:
# Train on the 80% part and report the eval metric on the hold-out part after
# each iteration. Every column in cat_cols_total is declared categorical.
model.fit(
    x_t,
    y_t,
    cat_features=cat_cols_total,
    eval_set=(x_v, y_v),
)


0:	test: 0.5000000	best: 0.5000000 (0)	total: 1.81s	remaining: 15m 4s
1:	test: 0.5026345	best: 0.5026345 (1)	total: 4.68s	remaining: 19m 26s
2:	test: 0.5014290	best: 0.5026345 (1)	total: 5.27s	remaining: 14m 33s
3:	test: 0.5025315	best: 0.5026345 (1)	total: 7s	remaining: 14m 28s
4:	test: 0.5024902	best: 0.5026345 (1)	total: 7.61s	remaining: 12m 33s
5:	test: 0.5355707	best: 0.5355707 (5)	total: 12.3s	remaining: 16m 56s
6:	test: 0.5525658	best: 0.5525658 (6)	total: 15.1s	remaining: 17m 44s
7:	test: 0.5526746	best: 0.5526746 (7)	total: 16.1s	remaining: 16m 27s
8:	test: 0.5553484	best: 0.5553484 (8)	total: 17.2s	remaining: 15m 40s
9:	test: 0.5581728	best: 0.5581728 (9)	total: 20.2s	remaining: 16m 31s
10:	test: 0.5589678	best: 0.5589678 (10)	total: 20.9s	remaining: 15m 30s
11:	test: 0.5637173	best: 0.5637173 (11)	total: 25.8s	remaining: 17m 29s
12:	test: 0.5656592	best: 0.5656592 (12)	total: 26.5s	remaining: 16m 32s
13:	test: 0.5656865	best: 0.5656865 (13)	total: 27.1s	remaining: 15m 42s
14

KeyboardInterrupt: 