In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%load_ext autoreload
%autoreload 2

import pandas as pd
pd.options.display.max_columns=1000 
pd.options.display.max_rows=1000
import numpy as np
import seaborn as sns
from sklearn.utils import shuffle
from scipy import stats
import itertools

In [None]:
import sys
sys.path.append('ml_modules/')

In [None]:
def _read_impressions(csv_path):
    """Read an impression log CSV and parse its ``impression_time`` column."""
    frame = pd.read_csv(csv_path)
    frame['impression_time'] = pd.to_datetime(frame.impression_time)
    return frame


# Training impressions; the shape is shown before and after the (currently
# disabled) date cut-off below.
train = _read_impressions('train.csv')
train.shape
# train = train[train.impression_time < pd.to_datetime('2018-12-12')].reset_index(drop=True)
train.shape

# Test impressions, loaded the same way.
test = _read_impressions('test.csv')
test.shape

In [None]:
train.head()

In [None]:
# (pd.Series(test.user_id.unique()).isin(train.user_id)).mean()
# for dt in test.impression_time.dt.date.unique():
#     print (dt, test[test.impression_time.dt.date == dt].shape[0], pd.Series(test[test.impression_time.dt.date == dt].user_id.unique()).isin(train.user_id).mean())

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so that history
# features can be computed across both sets at once.
df = pd.concat([train, test], sort=False, ignore_index=True)
df.shape

In [None]:
# Remember which impression ids belong to which set; used later to split the
# combined frame back into train and test.
train_ids, test_ids = train['impression_id'], test['impression_id']
len(train_ids), len(test_ids)

In [None]:
# Order rows by user and then by impression time (in place; index labels are
# kept), so that a row's predecessor is the same user's previous impression.
df.sort_values(by=['user_id', 'impression_time'], inplace=True)

In [None]:
# is_click of the preceding row, kept only when that row belongs to the same
# user; the first row of every user block gets NaN.
previous_click = df['is_click'].shift(1)
is_same_user = df['user_id'].eq(df['user_id'].shift(1))
df['last_is_click'] = previous_click.where(is_same_user)

In [None]:
# Re-order rows by app and then by impression time (in place; index labels
# are kept).
df.sort_values(by=['app_code', 'impression_time'], inplace=True)

In [None]:
# app_code of the same user's previous impression (NaN for a user's first
# impression).
#
# A plain shift(1) on df only gives per-user history when rows are ordered by
# user and time. Shifting while the frame is ordered by app_code instead pairs
# each row with the previous impression *in the same app*, and the user_id
# mask then blanks most of the column. To be independent of df's current row
# order, the shift is done on a user/time-ordered copy and assigned back,
# which aligns on index labels and leaves df's row order untouched.
# assumes df's index labels are unique (fresh index from the train/test
# concat) — TODO confirm
by_user = df.sort_values(by=['user_id', 'impression_time'])
same_user_as_prev = by_user['user_id'].eq(by_user['user_id'].shift(1))
df['last_app_code'] = by_user['app_code'].shift(1).where(same_user_as_prev)

In [None]:
# 1 when the current app matches last_app_code, else 0 (a missing
# last_app_code compares unequal, so it yields 0).
df['same_last_app_code'] = df['last_app_code'].eq(df['app_code']).astype(int)

In [None]:
# Table of impression ids read from id_order.csv; it is left-merged with df
# on impression_id later on.
# presumably this file holds the canonical row order — verify against its producer
id_order=pd.read_csv('id_order.csv')

In [None]:
# Left-merge from id_order so the result follows id_order's row sequence and
# gets a fresh index, discarding the ordering left behind by the sorts above.
df=id_order.merge(df,on='impression_id',how='left')

In [None]:
# Split the combined frame back into its train and test rows using the ids
# recorded before concatenation.
is_train_row = df['impression_id'].isin(train_ids)
is_test_row = df['impression_id'].isin(test_ids)
train, test = df[is_train_row], df[is_test_row]
train.shape, test.shape

In [None]:
train.head()

In [None]:
# Precomputed feature table read from hist_imp_variables.csv; shape and first
# rows are displayed for a quick sanity check.
hist_imp_variables = pd.read_csv('hist_imp_variables.csv')
hist_imp_variables.shape
hist_imp_variables.head()

In [None]:
# Discard every column whose name starts with one of the windowed prefixes
# and keep the rest, preserving column order.
windowed_prefixes = ('hist_7_days', 'hist_30_days')
columns_to_keep = [
    col for col in hist_imp_variables.columns
    if not col.startswith(windowed_prefixes)
]
hist_imp_variables = hist_imp_variables[columns_to_keep]
hist_imp_variables.shape
hist_imp_variables.head()

In [None]:
# hist_sessions_vars = pd.read_csv('hist_sessions_vars.csv')
# hist_sessions_vars.shape
# hist_sessions_vars.head()
# # hist_sessions_vars.columns

In [None]:
# Precomputed feature table read from hist_sessions_vars_7d.csv; it is
# left-merged into all_variables further down.
hist_sessions_vars_7d = pd.read_csv('hist_sessions_vars_7d.csv')
hist_sessions_vars_7d.shape
hist_sessions_vars_7d.head()
# hist_sessions_vars_7d.columns

In [None]:
# Precomputed feature table read from item_agg_vars.csv; shape, first rows
# and column names are displayed for inspection.
item_agg_vars = pd.read_csv('item_agg_vars.csv')
item_agg_vars.shape
item_agg_vars.head()
item_agg_vars.columns

In [None]:
# item_agg_vars_7d = pd.read_csv('item_agg_vars_7d.csv')
# item_agg_vars_7d.shape
# item_agg_vars_7d.head()
# item_agg_vars_7d.columns

In [None]:
# Precomputed feature table read from last_item_viewed.csv; it is left-merged
# into all_variables further down.
last_item_viewed = pd.read_csv('last_item_viewed.csv')
last_item_viewed.shape
last_item_viewed.head()

In [None]:
last_item_viewed.columns

In [None]:
# last_item_viewed_7d = pd.read_csv('last_item_viewed_7d.csv')
# last_item_viewed_7d.shape
# last_item_viewed_7d.head()
# last_item_viewed_7d.columns

In [None]:
# Precomputed feature table read from more_click_vars.csv. The two disabled
# lines below are alternative column subsets (dropping '*_last_not_nan'
# columns, or keeping only 'hist_all*' columns plus impression_id).
click_vars = pd.read_csv('more_click_vars.csv')
# click_vars = click_vars[[c for c in click_vars.columns if not c.endswith('_last_not_nan') ]]
# click_vars = click_vars[[c for c in click_vars.columns if c.startswith('hist_all') or c == 'impression_id' ]]
click_vars.shape
click_vars.head()

In [None]:
[c for c in click_vars.columns if c.startswith('hist_all') or c == 'impression_id' ]


In [None]:
# Per-user feature table, with every column whose name mentions 'is_click'
# removed.
# presumably to keep target-derived aggregates out of the model — confirm
user_features = pd.read_csv('user_features.csv')
non_click_columns = [col for col in user_features.columns if 'is_click' not in col]
user_features = user_features[non_click_columns]
user_features.shape
user_features.head()
user_features.columns

In [None]:
# Assemble one wide feature table. No `on=` is given, so each merge joins on
# whatever column names the two frames share. The first join is inner (rows
# must exist in both tables); the remaining tables are attached with left
# joins, in this order.
all_variables = hist_imp_variables.merge(click_vars, how='inner')
for extra_vars in (hist_sessions_vars_7d, item_agg_vars, last_item_viewed):
    all_variables = all_variables.merge(extra_vars, how='left')

all_variables.shape
all_variables.head()

In [None]:
# Attach the wide feature table to both sets (left join on shared columns),
# keeping every train/test row.
train, test = (part.merge(all_variables, how='left') for part in (train, test))
train.shape, test.shape

In [None]:
# Attach the per-user features to both sets (left join on shared columns).
train = pd.merge(train, user_features, how='left')
test = pd.merge(test, user_features, how='left')
train.shape, test.shape

In [None]:
train.head()

In [None]:
# train = train.sort_values(by = 'impression_time').reset_index(drop=True)

In [None]:
def create_date_time_variables(df, date_key):
    """Add calendar and clock features derived from ``df[date_key]``.

    The column is parsed with ``pd.to_datetime`` and one new column per
    component is written to ``df`` as ``'<date_key>_<component>'`` for:
    week, month, year, dayofweek, dayofmonth, dayofyear, hour and minute.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    date_key : str
        Name of the column holding the timestamps (datetimes or parseable
        strings). Unparseable/missing entries produce missing feature values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-chaining convenience.

    Notes
    -----
    Earlier revisions wrote the minute into the ``'<date_key>_hour'`` column,
    which lost the hour entirely. ``_hour`` now holds the hour and the minute
    goes into its own ``'<date_key>_minute'`` column.
    """
    dates = pd.to_datetime(df[date_key])
    accessor = dates.dt

    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar().week is the replacement; it returns a nullable UInt32
    # column, so convert it back to the plain int64/float64 dtypes the old
    # attribute produced (float when there are missing timestamps).
    if hasattr(accessor, 'isocalendar'):
        week = accessor.isocalendar().week
        week = week.astype('float64') if week.isna().any() else week.astype('int64')
    else:
        week = accessor.week

    components = [
        ('week', week),
        ('month', accessor.month),
        ('year', accessor.year),
        ('dayofweek', accessor.dayofweek),
        ('dayofmonth', accessor.day),
        ('dayofyear', accessor.dayofyear),
        ('hour', accessor.hour),
        ('minute', accessor.minute),
    ]
    for suffix, values in components:
        df['{}_{}'.format(date_key, suffix)] = values

    return df

In [None]:
# Expand each timestamp column that is present in train into calendar/clock
# features on both sets, printing the name of every column processed.
timestamp_columns = ('impression_time', 'last_server_time', 'last_7d_server_time')
for k in timestamp_columns:
    if k not in train.columns:
        continue
    train = create_date_time_variables(train, k)
    test = create_date_time_variables(test, k)
    print(k)

In [None]:
train.shape, test.shape

In [None]:
train.columns.tolist()

In [None]:
# Names of the row identifier and of the label column.
id_col, target_col = 'impression_id', 'is_click'

# Raw timestamp columns scheduled for removal before modelling, limited to
# the ones that actually exist in train.
raw_timestamp_cols = ['impression_time', 'last_7d_server_time', 'last_server_time']
columns_to_drop = [col for col in raw_timestamp_cols if col in train.columns]

# Columns to treat as categorical, listed group by group; only those present
# in train are kept, in this order.
candidate_cat_cols = (
    # impression-level columns
    ['user_id', 'app_code', 'os_version', 'last_app_code']
    # last_* columns
    + [
        'last_device_type', 'last_item_id',
        'last_category_1', 'last_category_2', 'last_category_3',
        'last_product_type',
    ]
    # last_7d_* columns
    + [
        'last_7d_device_type', 'last_7d_item_id',
        'last_7d_category_1', 'last_7d_category_2', 'last_7d_category_3',
        'last_7d_product_type',
    ]
    # hist_*_mode columns
    + [
        'hist_category_1_mode', 'hist_category_2_mode', 'hist_category_3_mode',
        'hist_product_type_mode',
    ]
    # hist_7d_*_mode columns
    + [
        'hist_7d_category_1_mode', 'hist_7d_category_2_mode', 'hist_7d_category_3_mode',
        'hist_7d_product_type_mode',
    ]
    # user_feats_*_mode columns
    + [
        'user_feats_app_code_mode', 'user_feats_os_version_mode',
        'user_feats_item_id_mode', 'user_feats_device_type_mode',
        'user_feats_category_1_mode', 'user_feats_category_2_mode',
        'user_feats_category_3_mode', 'user_feats_product_type_mode',
    ]
)
cat_cols = [col for col in candidate_cat_cols if col in train.columns]
cat_cols

In [None]:
# imputing categorical columns
train[cat_cols] = train[cat_cols].fillna('Missing')
test[cat_cols] = test[cat_cols].fillna('Missing')

# imputing numerical columns
train = train.fillna(-1)
test = test.fillna(-1)

In [None]:
train.select_dtypes(include=['object']).columns

In [None]:
# train.impression_time.dt.date.value_counts().sort_index()

In [None]:
# folds = []
# for x, y in [('2018-12-05', None),
# #              ('2018-11-28', '2018-12-05')
#             ]:
#     print (x, y)
#     train_index = list(train[train.impression_time <= pd.to_datetime(x)].index)
#     val_index = list(train[train.impression_time > pd.to_datetime(x)].index)
    
#     train.loc[train_index].shape, train.loc[val_index].shape, train.loc[train_index].is_click.mean()    
#     folds.append((train_index, val_index))
    
    

In [None]:
from custom_estimator import Estimator
from encoding import FreqeuncyEncoding
from custom_fold_generator import FoldScheme
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [None]:
cat_cols

In [None]:
# Encoder for the categorical columns, from the project-local `encoding`
# module. return_df=True presumably makes it return DataFrames rather than
# arrays — verify against ml_modules/encoding.py
fE = FreqeuncyEncoding(categorical_columns=cat_cols, return_df=True)

In [None]:
# x = fE.fit_transform(pd.concat([train, test], axis=0))

In [None]:
# Fit the encoder on train only, then apply the fitted encoder to test.
# NOTE(review): how levels that appear only in test are encoded depends on
# FreqeuncyEncoding.transform, which is not visible here — confirm.
train = fE.fit_transform(train)
test = fE.transform(test)

In [None]:
train.head()

In [None]:
# val_index

In [None]:
# Re-capture the ids from the current frames (before impression_id is dropped
# below) so they can be paired with the predictions when writing the CSVs.
test_ids = test.impression_id
train_ids = train.impression_id

In [None]:
# Pull out the label, then strip the id, raw timestamp and label columns from
# both frames (in place), skipping any that a frame does not have.
y = train[target_col]
non_feature_cols = [id_col] + columns_to_drop + [target_col]
for frame in (train, test):
    frame.drop(
        columns=[col for col in non_feature_cols if col in frame.columns],
        inplace=True,
    )

In [None]:
train.shape
train.head()

In [None]:
# y.value_counts()
# Scratch expression: evaluates to [4]; nothing else in the notebook uses it.
[1,2,3,4][-1:]

In [None]:
# Wrap a LightGBM binary classifier in the project-local Estimator (from
# custom_estimator). The classifier is given a large tree budget
# (n_estimators=20000) at a small learning rate.
# NOTE(review): how Estimator uses early_stopping_rounds, eval_metric and
# scoring_metric (fold scheme, stopping, seeding) is defined in
# ml_modules/custom_estimator.py and is not visible here — confirm there.
est = Estimator(LGBMClassifier(**{
    'n_estimators': 20000, 
    'learning_rate': 0.01,
    'boosting_type': 'gbdt', 
    'colsample_bytree': 0.60, 
    'min_child_weight': 9.0, 
    'num_leaves': 39, 
    'objective': 'binary', 
    'subsample': 0.70, 
    'subsample_freq': 5,
    'metric': 'custom'
    
      }), early_stopping_rounds=400,eval_metric='AUC', scoring_metric=roc_auc_score
)

### for state 200

In [None]:
# est.random_state=200
est.random_state

In [None]:
# Train on the raw numpy arrays and score the test set with the fitted
# estimator.
# presumably `oof` holds out-of-fold predictions for the training rows (it is
# saved alongside train_ids below) — verify against Estimator.fit_transform
oof=est.fit_transform(train.values,y.values)
pred=est.transform(test.values)

In [None]:
est.avg_cv_score

In [None]:
# Test-set predictions, keyed by impression id.
test_predictions = pd.DataFrame({'impression_id': test_ids, 'is_click': pred})
test_predictions.to_csv('preds_lgb_7546_pat.csv', index=False)

# Training-set predictions returned by fit_transform, keyed by impression id.
train_predictions = pd.DataFrame({'impression_id': train_ids, 'is_click': oof})
train_predictions.to_csv('oof_lgb_7546_pat.csv', index=False)