# Airbnb New User Bookings

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

## Read in file

In [None]:
# Load the training users straight from the zipped CSV (pandas infers the
# compression from the file extension) and preview the first rows.
train = pd.read_csv("../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip")
train.head()

In [None]:
# Load the test users from the zipped CSV and preview the first rows.
test = pd.read_csv("../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip")
test.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Print the column dtypes and non-null counts of the test frame.
test.info()

## Concat files

In [None]:
# Separate the target from the training features: keep the labels as a
# plain array and remove the column from the frame.
labels = train['country_destination'].values
train = train.drop('country_destination', axis=1)

# Keep the test ids for the submission file.
test_id = test['id']

# Number of training rows; used later to split the combined frame back
# into its train and test parts.
train_row = len(train)

# Stack train on top of test with a fresh 0..n-1 index so both parts get
# exactly the same feature engineering.
df = pd.concat([train, test], axis=0, ignore_index=True)

# Drop the columns that are not used as features (id, date_first_booking
# and signup_app), then replace every remaining missing value with -1.
df = df.drop(['id', 'date_first_booking', 'signup_app'], axis=1).fillna(-1)


## Data Preprocessing

### Date Account Created

In [None]:
# Parse the account-creation date once and expand it into numeric
# year / month / day features.
dac = pd.DatetimeIndex(
    pd.to_datetime(df['date_account_created'], format='%Y-%m-%d')
)
for part in ('year', 'month', 'day'):
    df['dac_' + part] = getattr(dac, part)

# The raw date column is no longer needed once the parts are extracted.
df = df.drop('date_account_created', axis=1)

df.head()

### Timestamp first active

In [None]:
# timestamp_first_active is stored as a YYYYMMDDHHMMSS number. Parse it in
# one vectorized call instead of slicing the string row by row in Python;
# the explicit format also makes malformed timestamps raise instead of
# silently producing nonsense date parts.
tfa_parsed = pd.to_datetime(
    df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S'
)

# Keep `tfa` as an (n_rows, 6) integer array laid out as
# [year, month, day, hour, minute, second], the same layout as before.
tfa = np.column_stack([
    tfa_parsed.dt.year,
    tfa_parsed.dt.month,
    tfa_parsed.dt.day,
    tfa_parsed.dt.hour,
    tfa_parsed.dt.minute,
    tfa_parsed.dt.second,
])

# Only the date parts are used as features.
df['tfa_year'] = tfa[:, 0]
df['tfa_month'] = tfa[:, 1]
df['tfa_day'] = tfa[:, 2]

df = df.drop(['timestamp_first_active'], axis=1)

In [None]:
# Preview the frame after the timestamp column was replaced by tfa_* parts.
df.head()

### Age

In [None]:
def user_age(age):
    """Map a raw age to the upper bound of its ten-year bucket.

    Returns 15 for ages in [0, 15), then 25, 35, ..., 85 for ages up to and
    including that bound. Negative ages (the -1 missing-value marker),
    ages above 85 and NaN all map to the string 'NA'.
    """
    if age < 0:
        return 'NA'
    if age < 15:
        return 15
    # First bound that the age does not exceed wins.
    for upper_bound in (25, 35, 45, 55, 65, 75, 85):
        if age <= upper_bound:
            return upper_bound
    # Older than 85, or NaN (every comparison above is False for NaN).
    return 'NA'

# Replace each raw age with its bucket label. user_age returns both ints
# and the string 'NA', so building one numpy array from them yields
# string labels.
df['age'] = np.array(list(map(user_age, df['age'])))

# One indicator column per bucket (age_15 ... age_85, age_NA), appended
# to the frame in place of the bucketed age column.
df_age = pd.get_dummies(df['age'], prefix='age')
df = pd.concat([df, df_age], axis=1).drop('age', axis=1)

In [None]:
# Preview the frame after age was replaced by its bucket indicator columns.
df.head()

### One Hot Encoding

In [None]:
# Categorical columns to expand into indicator (one-hot) columns.
OHE_feat = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'first_device_type',
    'first_browser',
]

# Build the indicator frame for every feature first, then swap all source
# columns for their indicators in a single concat. The resulting column
# order is: remaining columns, then each feature's dummies in list order.
# NOTE(review): missing values were already replaced with -1 when the
# frames were combined, so the `<feature>_nan` columns produced by
# dummy_na=True are presumably all zero - confirm they are wanted.
encoded_frames = [
    pd.get_dummies(df[feature], prefix=feature, dummy_na=True)
    for feature in OHE_feat
]
df = pd.concat([df.drop(OHE_feat, axis=1)] + encoded_frames, axis=1)

## Label Encoding

In [None]:
# Undo the earlier stacking: the first train_row rows are the training
# users, everything after them is the test set.
X, X_test = df.iloc[:train_row], df.iloc[train_row:]

# Encode the destination labels as integer class ids; `le` is kept so the
# predictions can be mapped back to country codes.
le = LabelEncoder()
y = le.fit_transform(labels)

## XGBoost

In [None]:
# Hyper-parameters for the multi-class booster; 'multi:softprob' makes the
# model output one probability per class.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.2,
    n_estimators=43,
    objective='multi:softprob',
    subsample=1,
    colsample_bytree=0.5,
    seed=1,
)
xgb = XGBClassifier(**xgb_params)

# Train on the labelled users, then score every test user.
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

In [None]:
# Inspect the class probabilities returned by predict_proba for X_test.
print(y_pred)

In [None]:
# Build the submission: for every test user, the five most probable
# destinations, most likely first, written as five (id, country) rows.
n_top = 5

# Sort each row's class indices by descending probability and keep the
# leading n_top (fewer if the model has fewer classes).
top_classes = np.argsort(y_pred, axis=1)[:, ::-1][:, :n_top]

# Repeat each id once per selected class. Using the positional values
# avoids relying on test_id having a default 0..n-1 index, which the
# label lookup `test_id[i]` silently required.
ids = np.repeat(test_id.values, top_classes.shape[1]).tolist()  # list of ids

# Decode all selected class ids in one call instead of once per row.
cts = le.inverse_transform(top_classes.ravel()).tolist()  # list of countries

# Generate submission
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('submission.csv', index=False)