In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Zipped Kaggle CSVs, listed in the same order as the frames unpacked below.
csv_archives = (
    '../input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip',
    '../input/airbnb-recruiting-new-user-bookings/countries.csv.zip',
    '../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip',
    '../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip',
    '../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip',
)

# read_csv infers the zip compression from the file extension by default.
df_age_gender, df_countries, df_sessions, df_test, df_train = (
    pd.read_csv(archive) for archive in csv_archives
)

In [None]:
# Preview the first rows of the training users table.
df_train.head()

In [None]:
# Preview the first rows of the test users table.
df_test.head()

In [None]:
# Preview the first rows of the age/gender buckets table.
df_age_gender.head()

In [None]:
# Preview the first rows of the sessions log.
df_sessions.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the sessions log.
df_sessions.info()

In [None]:
# Stack the training users (without the target column) on top of the test users
# so that every feature transformation below is applied to both sets at once.
#
# ignore_index=True gives the combined frame a fresh 0..n-1 index. The previous
# version called reset_index(drop=True) without assigning the result, so the
# frame silently kept duplicate index labels from the two source frames.
df_all = pd.concat(
    [df_train.drop(columns='country_destination'), df_test],
    axis=0,
    ignore_index=True,
)

# Show only a preview instead of rendering the whole combined frame.
df_all.head()

In [None]:
# Column dtypes and non-null counts of the combined train+test frame.
df_all.info()

In [None]:
# NaN count per column of the combined frame.
missing = df_all.isna().sum()

# Keep only the columns that actually have gaps, worst first.
missing_sum = missing.loc[missing.gt(0)].sort_values(ascending=False)
missing_ratio = missing_sum.div(len(df_all)).sort_values(ascending=False)

# Side-by-side table: absolute count and share of rows missing.
pd.concat({'na_num': missing_sum, 'na_ratio': missing_ratio}, axis=1)

# Age data cleaning

In [None]:
# Remove date_first_booking from the feature frame
# (presumably because it is not usable at prediction time - TODO confirm).
df_all.drop(columns=['date_first_booking'], inplace=True)

In [None]:
# Summary statistics of the age column before any cleaning.
df_all.age.describe()

In [None]:
# Boolean mask of rows whose "age" is implausibly large (> 1000),
# followed by summary statistics for just those rows.
age_index_1 = df_all['age'].gt(1000)
df_all.loc[age_index_1, 'age'].describe()

In [None]:
# Values above 1000 are treated as birth years and converted into an age
# relative to 2015.
# The mask is computed once, here, and used for both sides of the assignment.
# The previous version mixed a freshly computed mask (left-hand side) with
# `age_index_1` from an earlier cell (right-hand side), which goes stale as
# soon as this cell has run once. `.to_numpy()` makes the assignment purely
# positional so it does not depend on index-label alignment.
birth_year_mask = df_all['age'] > 1000
df_all.loc[birth_year_mask, 'age'] = 2015 - df_all.loc[birth_year_mask, 'age'].to_numpy()

# Anything still outside the 18..100 range is considered invalid and blanked out.
implausible_age = (df_all['age'] > 100) | (df_all['age'] < 18)
df_all.loc[implausible_age, 'age'] = np.nan

In [None]:
# Histogram of the age column in 10 bins (NaN ages are not plotted).
sns.displot(df_all.age,bins=10)

In [None]:
# User attribute columns whose value distributions are printed below.
cols = [
    'gender', 'signup_method', 'signup_flow', 'language',
    'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
    'signup_app', 'first_device_type', 'first_browser',
]

for col in cols:
    value_freq = df_all[col].value_counts()
    print(value_freq)

# add new user action features

In [None]:
# Parse the two time columns.
df_all['date_account_created'] = pd.to_datetime(df_all['date_account_created'])

# NOTE(review): in the Airbnb dataset timestamp_first_active is stored as an
# integer of the form YYYYMMDDHHMMSS (e.g. 20090319043255) - confirm against
# df_all.head(). Without an explicit format, pd.to_datetime interprets integers
# as nanoseconds since the Unix epoch, which maps every row to 1970-01-01 and
# makes all *_first_active features (and time_lag) meaningless.
df_all['timestamp_first_active'] = pd.to_datetime(
    df_all['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S'
)

# User behaviour is strongly related to whether the day is a weekday or a weekend.
# strftime("%w") yields the weekday as a string '0'..'6' where '0' is Sunday
# and '6' is Saturday (unlike dt.dayofweek, which is Monday-based integers).
df_all['weekday_account_created'] = df_all['date_account_created'].dt.strftime("%w")

df_all['day_account_created'] = df_all['date_account_created'].dt.day
df_all['month_account_created'] = df_all['date_account_created'].dt.month
df_all['year_account_created'] = df_all['date_account_created'].dt.year

df_all['weekday_first_active'] = df_all['timestamp_first_active'].dt.strftime("%w")

df_all['day_first_active'] = df_all['timestamp_first_active'].dt.day
df_all['month_first_active'] = df_all['timestamp_first_active'].dt.month
df_all['year_first_active'] = df_all['timestamp_first_active'].dt.year

# Timedelta between account creation and first activity.
df_all['time_lag'] = df_all['date_account_created'] - df_all['timestamp_first_active']

In [None]:
# Reduce the timedelta to its whole-day component.
df_all['time_lag'] = df_all['time_lag'].dt.days

In [None]:
# Drop the two raw datetime columns from the feature frame.
df_all.drop(columns=['date_account_created', 'timestamp_first_active'], inplace=True)

# df_sessions - missing data

In [None]:
# NaN count per column of the sessions log.
missing = df_sessions.isna().sum()

In [None]:
# Columns with at least one gap, worst first, plus their share of all session rows.
missing_sum = missing.loc[missing.gt(0)].sort_values(ascending=False)
missing_ratio = missing_sum.div(len(df_sessions)).sort_values(ascending=False)

pd.concat({'missing_sum': missing_sum, 'missing_ratio': missing_ratio}, axis=1)

In [None]:
# Full value distribution for the two low-cardinality session columns.
cols = ['action_type', 'device_type']
for col in cols:
    full_counts = df_sessions[col].value_counts()
    print('---------', col, '---------\n', full_counts)

# Only the ten most frequent values for the remaining session columns.
cols = ['action', 'action_detail', 'secs_elapsed']
for col in cols:
    top_counts = df_sessions[col].value_counts().head(10)
    print('------', col, '------\n', top_counts)

In [None]:
# Use the same key name as the users tables so the frames can be merged on 'id'.
df_sessions.rename(columns=dict(user_id='id'), inplace=True)

In [None]:
# Per-user pivot tables: one row per user id, one column per distinct value.
# .size() counts session rows per (id, value) pair, including rows whose
# secs_elapsed is NaN - the same numbers agg(len) produced.
action_count = df_sessions.groupby(['id', 'action']).size().unstack()
action_type_count = df_sessions.groupby(['id', 'action_type']).size().unstack()
action_detail_count = df_sessions.groupby(['id', 'action_detail']).size().unstack()

# Total seconds elapsed per user on each device type (a sum, not a count).
device_type_sum = df_sessions.groupby(['id', 'device_type'])['secs_elapsed'].sum().unstack()

# Label every feature with the column it was derived from. A blanket '_count'
# suffix let identical labels coming from different source columns collide
# (such duplicates are dropped further down, silently losing features) and it
# mislabelled the device-time sums as counts.
df_sessions_action = pd.concat(
    [
        action_count.add_prefix('action_').add_suffix('_count'),
        action_type_count.add_prefix('action_type_').add_suffix('_count'),
        action_detail_count.add_prefix('action_detail_').add_suffix('_count'),
        device_type_sum.add_prefix('device_type_').add_suffix('_secs'),
    ],
    axis=1,
)

# Most used device = the device_type with the most session rows for the user.
# The previous .max() on the string column returned the alphabetically last
# device name, not the most frequent one. On ties idxmax keeps the first
# column, i.e. the alphabetically first device type.
device_row_counts = df_sessions.groupby(['id', 'device_type']).size().unstack(fill_value=0)
df_sessions_action['most_used_device'] = device_row_counts.idxmax(axis=1)

In [None]:
# Move the user id out of the index into a regular column; if the index had
# lost its name the new column is called 'index', so rename it to 'id'.
df_sessions_action = df_sessions_action.reset_index().rename(columns={'index': 'id'})

In [None]:
# Per-user statistics of secs_elapsed.
# Built-in reductions are given by name: passing NumPy callables (np.sum,
# np.std, ...) to GroupBy.agg is a deprecated alias for the pandas method of
# the same name. The string form keeps today's results (e.g. sample std/var,
# ddof=1) and avoids the FutureWarning.
secs_by_user = df_sessions.groupby('id')['secs_elapsed']
secs_elapsed = secs_by_user.agg(
    [('secs_elapsed_sum', 'sum'),
     ('secs_elapsed_mean', 'mean'),
     ('secs_elapsed_min', 'min'),
     ('secs_elapsed_max', 'max'),
     ('secs_elapsed_median', 'median'),
     ('secs_elapsed_std', 'std'),
     ('secs_elapsed_var', 'var'),
     # number of gaps longer than one day (86400 s)
     ('day_pauses', lambda x: (x > 86400).sum()),
     # number of gaps longer than 300000 s (roughly 3.5 days)
     ('long_pauses', lambda x: (x > 300000).sum()),
     # number of gaps shorter than one hour (3600 s)
     ('short_pauses', lambda x: (x < 3600).sum()),
     # number of entries that are not exactly zero
     # NOTE(review): NumPy treats NaN as non-zero, so missing values are
     # presumably counted here too - confirm this is intended.
     ('session_length', np.count_nonzero)]
)

In [None]:
# Turn the 'id' index into a regular column so the frame can be merged on it.
secs_elapsed = secs_elapsed.reset_index()

In [None]:
# Show the column names of the per-user secs_elapsed statistics.
secs_elapsed.columns

In [None]:
# Combine both per-user session tables, keyed by user id.
sessions_secs_elapsed = df_sessions_action.merge(secs_elapsed, how='left', on='id')

# Attach the session features to every user; users without session rows get
# NaN in those columns. The id is dropped afterwards so it is not a feature.
df_all = df_all.merge(sessions_secs_elapsed, how='left', on='id').drop(columns=['id'])

In [None]:
# Column labels that repeat an earlier label (the first occurrence is not flagged).
is_repeated_label = df_all.columns.duplicated()
duplicate_columns = df_all.columns[is_repeated_label]
duplicate_columns

In [None]:
# Keep only the first occurrence of every column label.
first_occurrence = ~df_all.columns.duplicated()
df_all = df_all.loc[:, first_occurrence]

In [None]:
# Columns that are expanded into one indicator column per distinct value.
categorical_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'most_used_device',
    'weekday_account_created',
    'weekday_first_active',
]

df_all = pd.get_dummies(data=df_all, columns=categorical_features)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score,make_scorer
from lightgbm import LGBMClassifier

In [None]:
# Encode the destination country as integer class labels.
le = LabelEncoder()
y = le.fit_transform(df_train['country_destination'])

id_test = df_test['id']

# df_all was built as train rows followed by test rows, so the first
# len(df_train) rows are the training features and the rest are the rows
# to predict for the submission.
train_num = len(df_train)
X = df_all.iloc[:train_num]
X_sub = df_all.iloc[train_num:]

# X_trainval,X_test, y_trainval,y_test = train_test_split(X,y,test_size=.2)

In [None]:
from xgboost.sklearn import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Scorer and CV splitter; neither is used by the fit below.
accuracy = make_scorer(accuracy_score)
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=67)

# Multi-class model that outputs one probability per class, trained on GPU 0.
xgb_params = dict(
    gpu_id=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    n_estimators=50,
    objective='multi:softprob',
    eval_metric='mlogloss',  # set explicitly instead of relying on the library default
    colsample_bytree=0.5,    # each tree sees half of the columns
    max_depth=6,
    learning_rate=.1,
    subsample=.5,            # each tree sees half of the rows
)
xgb = XGBClassifier(**xgb_params)

# Fit on the full training set (no hold-out split).
xgb.fit(X, y)

In [None]:
# Class probabilities for every row of X_sub: one row per user, one column per class.
y_pred = xgb.predict_proba(X_sub)

In [None]:
id_test = df_test['id']

# Build the submission: for every test user, the five most probable
# destination countries, most likely first, one (id, country) row each.
# This is done in one vectorised pass instead of a Python loop that called
# inverse_transform once per user and looked ids up by index label.
top_k_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]
n_per_user = top_k_idx.shape[1]  # 5, or fewer if the model has fewer classes

# Row-major flattening keeps each user's predictions together, matching the
# repeated ids.
ids = np.repeat(id_test.to_numpy(), n_per_user)  # ids, repeated once per prediction
cts = le.inverse_transform(top_k_idx.ravel())    # predicted countries

sub = pd.DataFrame({'id': ids, 'country': cts})
sub.to_csv('sub.csv', index=False)