In [None]:
import pandas as pd
from datetime import datetime
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from sklearn.metrics import ndcg_score

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file available under the Kaggle input directory.
for input_root, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_root, input_file))

In [None]:
# Raw web-session log (one row per user action).
session = pd.read_csv(
    "/kaggle/input/airbnb-recruiting-new-user-bookings/sessions.csv.zip"
)
print(session.shape)
session.head()

In [None]:
# Training users, including the country_destination target.
train_user = pd.read_csv(
    "/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip"
)
print(train_user.shape)
train_user.head()

In [None]:
# Users to predict for.
test_user = pd.read_csv(
    "/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv.zip"
)
print(test_user.shape)
test_user.head()

In [None]:
class Custom_Proccess(BaseEstimator, TransformerMixin):
    """Stateless feature-engineering step applied to the user table.

    ``transform`` works on a copy of its input, so the caller's DataFrame is
    never modified and the same frame can be transformed repeatedly
    (cross-validation folds, re-running a cell, fit followed by predict).

    The transformation:
      * maps the gender placeholder '-unknown-' to 'OTHER';
      * fills missing ages with the sentinel -1 and casts age to int64;
      * splits ``date_account_created`` and the date part of
        ``timestamp_first_active`` into year / month / day columns
        (``dac_*`` and ``tfa_*``);
      * merges the 'iOS' and 'Android' values of ``signup_app`` into
        'SmartDevice';
      * drops the raw date columns together with ``date_first_booking``,
        ``first_device_type`` and ``first_browser``.
    """

    # Raw columns removed once the date parts have been derived from them.
    _DROPPED_COLUMNS = ['date_first_booking', 'date_account_created', 'timestamp_first_active',
                        'first_device_type', 'first_browser']

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``gender``, ``age``, ``timestamp_first_active``,
            ``date_account_created`` and ``signup_app``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            A new frame; ``X`` itself is left untouched.

        Raises
        ------
        KeyError
            If one of the required columns listed above is missing.
        """
        # Work on a copy: mutating the caller's frame makes a second call fail
        # (the source columns would already be gone), and chained
        # ``X.col.method(inplace=True)`` is a silent no-op under pandas
        # Copy-on-Write.
        X = X.copy()

        # Missing-value handling
        X['gender'] = X['gender'].replace('-unknown-', 'OTHER')
        X['age'] = X['age'].fillna(-1).astype('int64')

        # Only the leading YYYYMMDD digits of timestamp_first_active are used;
        # the time-of-day part is discarded.
        first_active = pd.to_datetime(
            X['timestamp_first_active'].astype(str).str[:8], format='%Y%m%d'
        )
        account_created = pd.to_datetime(X['date_account_created'])

        X['dac_year'] = account_created.dt.year
        X['dac_month'] = account_created.dt.month
        X['dac_day'] = account_created.dt.day

        X['tfa_year'] = first_active.dt.year
        X['tfa_month'] = first_active.dt.month
        X['tfa_day'] = first_active.dt.day

        # Implausible ages are deliberately kept: filtering rows inside a
        # transformer would leave X and y with different lengths.

        X['signup_app'] = X['signup_app'].replace(['iOS', 'Android'], 'SmartDevice')

        # errors='ignore' tolerates inputs that lack one of the purely
        # discarded columns (for example date_first_booking).
        return X.drop(columns=self._DROPPED_COLUMNS, errors='ignore')

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

In [None]:
# Keep only usable session rows: drop the unused columns, discard rows without
# a user or an action, and give the remaining gaps explicit defaults.
session = (
    session
    .drop(columns=['action_detail', 'device_type'])
    .dropna(subset=['user_id', 'action'])
    .assign(
        action_type=lambda frame: frame['action_type'].fillna('Other'),
        secs_elapsed=lambda frame: frame['secs_elapsed'].fillna(0),
    )
)

In [None]:
# Per (user, action_type): number of actions and total seconds elapsed.
session_group = (
    session
    .groupby(['user_id', 'action_type'])
    .agg({'action': 'count', 'secs_elapsed': 'sum'})
    .reset_index()
)

# One row per user: totals over all action types plus, per action type, the
# summed indicator column.
action_type_dummies = pd.get_dummies(session_group, columns=['action_type'])
session_df = action_type_dummies.groupby(['user_id']).sum().reset_index()
session_df.head()

In [None]:
# Attach the per-user session features; users without sessions keep NaN there.
train_user_df = (
    train_user
    .merge(session_df, how='left', left_on=['id'], right_on=['user_id'])
    .drop(columns=['user_id'])
    .reset_index(drop=True)
)
train_user_df.shape

In [None]:
# Same left join for the test users.
test_user_df = (
    test_user
    .merge(session_df, how='left', left_on=['id'], right_on=['user_id'])
    .drop(columns=['user_id'])
    .reset_index(drop=True)
)
test_user_df.shape

In [None]:
# Users with no session rows come out of the left merge with NaN in every
# session-derived column; flag them with the sentinel -1.
# The columns are selected by name (not by position) and filled through a
# frame-level assignment: chained `df.col.fillna(..., inplace=True)` does not
# update the frame under pandas Copy-on-Write, which would make the int64 cast
# below fail on NaN.
train_session_cols = ['secs_elapsed', 'action'] + [
    col for col in train_user_df.columns if col.startswith('action_type_')
]
train_user_df[train_session_cols] = train_user_df[train_session_cols].fillna(-1)

train_user_df['secs_elapsed'] = train_user_df['secs_elapsed'].astype('int64')
train_user_df['action'] = train_user_df['action'].astype('int64')

In [None]:
# Users with no session rows come out of the left merge with NaN in every
# session-derived column; flag them with the sentinel -1.
# The columns are selected by name (not by position) and filled through a
# frame-level assignment: chained `df.col.fillna(..., inplace=True)` does not
# update the frame under pandas Copy-on-Write, which would make the int64 cast
# below fail on NaN.
test_session_cols = ['secs_elapsed', 'action'] + [
    col for col in test_user_df.columns if col.startswith('action_type_')
]
test_user_df[test_session_cols] = test_user_df[test_session_cols].fillna(-1)

test_user_df['secs_elapsed'] = test_user_df['secs_elapsed'].astype('int64')
test_user_df['action'] = test_user_df['action'].astype('int64')

In [None]:
# Percentage of missing values per column.
train_user_df.isna().sum().div(train_user_df.shape[0]).mul(100)

In [None]:
# Columns that must never reach the model: the identifier, the target, and the
# raw columns that Custom_Proccess drops.
excluded_cols = {'id', 'date_account_created', 'date_first_booking', 'first_device_type',
                 'first_browser', 'timestamp_first_active', 'country_destination'}

# Date parts created by Custom_Proccess.transform. They do not exist in
# train_user_df yet, so they have to be listed explicitly; otherwise the
# ColumnTransformer (which drops unlisted columns by default) would silently
# discard them before they reach the model.
engineered_date_cols = ['dac_year', 'dac_month', 'dac_day', 'tfa_year', 'tfa_month', 'tfa_day']

candidate_cols = [cname for cname in train_user_df.columns if cname not in excluded_cols]

# Split on "numeric or not" rather than on dtype == "object", so string columns
# are recognised whatever string dtype pandas uses for them.
categorical_cols = [cname for cname in candidate_cols
                    if not pd.api.types.is_numeric_dtype(train_user_df[cname])]

numerical_cols = [cname for cname in candidate_cols
                  if pd.api.types.is_numeric_dtype(train_user_df[cname])]
# Append the engineered columns once, even if the frame already contains them.
numerical_cols += [cname for cname in engineered_date_cols if cname not in numerical_cols]

print("Categorical_cols - \n",categorical_cols)
print("Numerical_cols - \n",numerical_cols)

In [None]:
# Categorical features: constant imputation, then one-hot encoding that ignores
# categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numerical features: constant imputation only.
numerical_steps = [('imputer', SimpleImputer(strategy='constant'))]
numerical_transformer = Pipeline(steps=numerical_steps)

# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols),
])

In [None]:
# Keep the ids for the submission file; the model input excludes them.
test_id = test_user_df['id']
test_X = test_user_df.drop(columns=['id'])

In [None]:
# Encode the destination country as integer class codes; `le` is kept to map
# predictions back to country labels later.
le = LabelEncoder()
labels = train_user_df['country_destination']
train_y = le.fit_transform(labels)

In [None]:
# Model input: everything except the identifier and the target.
train_X = train_user_df.drop(columns=['id', 'country_destination'])

In [None]:
# Sanity check: feature matrix, merged frame, target and raw frame sizes.
(train_X.shape, train_user_df.shape, len(train_y), train_user.shape)

In [None]:
# Sanity check: merged test frame versus its feature matrix.
(test_user_df.shape, test_X.shape)

In [None]:
def cross_validation_with_ndcg(pipe, X, y, scorer, cv=5):
    """Score ``pipe`` with stratified K-fold cross-validation.

    For every fold the pipeline is refit on the training part, class
    probabilities are predicted for the hold-out part, and
    ``scorer(y_true_one_hot, probabilities)`` is evaluated.

    Parameters
    ----------
    pipe : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is refit in
        place on every fold.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : array-like
        Class label per row of ``X``. Converted to a NumPy array so that fold
        indices are always applied positionally, even for a pandas Series.
    scorer : callable
        Called as ``scorer(y_true, y_score)`` with a one-hot ``y_true`` matrix
        and the predicted probability matrix.
    cv : int, default 5
        Number of folds.

    Returns
    -------
    list
        One score per fold, rounded to 6 decimals.
    """
    y = np.asarray(y)
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=100)

    scores = []
    for train_index, holdout_index in skf.split(X, y):
        X_train, X_holdout = X.iloc[train_index], X.iloc[holdout_index]
        y_train, y_holdout = y[train_index], y[holdout_index]

        pipe.fit(X_train, y_train)
        probabilities = pipe.predict_proba(X_holdout)

        # Build the one-hot target over the classes the model was trained on
        # (sorted, which is the column order classifiers use for
        # predict_proba). Deriving it from the hold-out labels alone would
        # drop the column of any class absent from this fold and misalign the
        # target with the probability matrix.
        fold_classes = np.unique(y_train)
        y_true = (y_holdout[:, None] == fold_classes[None, :]).astype(int)

        score = scorer(y_true, probabilities)
        scores.append(round(score, 6))
        print(f'{len(scores)} / {cv} DONE!', end='\r')
    return scores

In [None]:
# Exhaustive grid search over the XGBoost hyper-parameters, ranked by the mean
# cross-validated score.
# NOTE(review): ndcg_score is passed without `k`, so every class is ranked; if
# the target metric is NDCG@5, confirm whether k=5 should be used here.
n_estimaters_param = [50, 100, 200]
max_depth_param = [3, 4, 5]
learning_rate_param = [0.1, 0.2]

# Learning rate varies slowest, tree depth fastest.
params = [
    (rate, n_trees, depth)
    for rate in learning_rate_param
    for n_trees in n_estimaters_param
    for depth in max_depth_param
]

result_list = []

for rate, n_trees, depth in params:
    print(f'learning_rate: {rate}, n_estimaters: {n_trees}, max_depth: {depth}')
    candidate_model = XGBClassifier(
        max_depth=depth,
        learning_rate=rate,
        n_estimators=n_trees,
        verbosity=0,
        objective='multi:softprob',
        n_jobs=-1,
    )
    search_pipe = Pipeline(steps=[
        ('customproccess', Custom_Proccess()),
        ('preprocessor', preprocessor),
        ("model", candidate_model),
    ])
    scores = cross_validation_with_ndcg(search_pipe, train_X, train_y, ndcg_score)
    result_list.append([rate, n_trees, depth, np.mean(scores)])

In [None]:
# Tabulate the grid-search results and show the five best configurations.
result_columns = ['learning_rate', 'n_estimator', 'max_depth', 'mean_score']
result_df = pd.DataFrame(result_list, columns=result_columns)
result_df.sort_values(by='mean_score', ascending=False).iloc[:5]

In [None]:
# Fit the final pipeline on the full training set with fixed hyper-parameters
# and predict class probabilities for the test users.
final_model_params = dict(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=200,
    verbosity=0,
    objective='multi:softprob',
    n_jobs=-1,
)
xg_model = XGBClassifier(**final_model_params)

pipe = Pipeline(steps=[
    ('customproccess', Custom_Proccess()),
    ('preprocessor', preprocessor),
    ("model", xg_model),
])
pipe.fit(train_X, train_y)
predict = pipe.predict_proba(test_X)

In [None]:
# Rank the classes of every test user by predicted probability (best first)
# and keep at most the five most likely ones. This is done for the whole
# matrix at once instead of one Python iteration and one inverse_transform
# call per user.
top_class_codes = np.argsort(predict, axis=1)[:, ::-1][:, :5]

# One submission row per (user, ranked country): each id is repeated once per
# kept prediction, positionally aligned with the rows of `predict`.
ids = np.repeat(test_id.to_numpy(), top_class_codes.shape[1]).tolist()
cts = le.inverse_transform(top_class_codes.ravel()).tolist()

In [None]:
# Assemble the (id, country) rows and write the submission file.
submission_rows = np.column_stack((ids, cts))
sub_df = pd.DataFrame(submission_rows, columns=['id', 'country'])
sub_df.to_csv('sub-03.csv', index=False)