In [None]:
import pandas as pd
import re
import numpy as np
from datetime import datetime, date
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

from scipy.sparse import coo_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy import sparse
from scipy.sparse import csr_matrix

from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from scipy.stats import randint as sp_randint
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import pickle
import joblib
from IPython.display import Image


In [None]:
# Load the training users table straight from the zipped Kaggle input file.
train_df = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip')
# Number of rows in the training table.
piv_train = len(train_df)

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
train_df.id.value_counts().sum()

In [None]:
train_df.info()

In [None]:
sessions_df  = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip')

In [None]:
sessions_df.head()

In [None]:
sessions_df.info()

In [None]:
sessions_df.shape

In [None]:
sessions_df.isnull().sum()

In [None]:
sessions_df["action_type"].value_counts()

In [None]:
sessions_df["action"].value_counts()

## Feature Engineering

In [None]:
# Collapse the session log to one row per user_id: every other column becomes
# a Python list holding all of that user's values for the column. The
# convert_* helpers defined further down turn those lists into model inputs.
sessions_df = sessions_df.groupby("user_id", as_index= False).agg(lambda x:x.tolist())

In [None]:
sessions_df.head()

In [None]:
def convert_to_string(action):
    """Join one user's session values into a single comma-separated string.

    Every element is converted with ``str``. A missing value stringifies to
    exactly ``"nan"`` and is replaced by an empty token, so the number of
    comma-separated positions still equals the number of events.

    Only whole ``"nan"`` tokens are blanked. The earlier implementation used
    ``re.sub("nan", "", ...)``, which also removed the substring from
    legitimate values (``"maintenance"`` became ``"maintece"``).

    Parameters
    ----------
    action : iterable
        Values collected for one user (strings and/or NaN).

    Returns
    -------
    str
        Comma-joined tokens; missing values appear as empty strings.
    """
    as_text = (str(value) for value in action)
    return ",".join("" if text == "nan" else text for text in as_text)

In [None]:
# Flatten the three list-valued action columns into comma-separated strings.
for action_col in ("action", "action_type", "action_detail"):
    sessions_df[action_col] = sessions_df[action_col].apply(convert_to_string)


In [None]:
sessions_df.isnull().sum()

In [None]:
def convert_to_set(device):
    """Join the distinct values of one user's list into a comma-separated string.

    Every element is converted with ``str``; a missing value (which
    stringifies to exactly ``"nan"``) becomes an empty token. Duplicates are
    removed and the remaining tokens are emitted in sorted order, so the
    result is reproducible across runs (joining a raw ``set`` gives an
    arbitrary order). Only whole ``"nan"`` tokens are blanked; the substring
    is no longer stripped from inside other values.

    Parameters
    ----------
    device : iterable
        Values collected for one user (strings and/or NaN).

    Returns
    -------
    str
        Sorted, de-duplicated tokens joined with ``","``. If a missing value
        was present the string contains one empty token, which sorts first.
    """
    as_text = (str(value) for value in device)
    distinct = {"" if text == "nan" else text for text in as_text}
    return ",".join(sorted(distinct))

In [None]:
sessions_df['device_type'] =sessions_df['device_type'].apply(convert_to_set)

In [None]:
def convert_the_time(time):
    """Add up the numeric entries of one user's ``secs_elapsed`` list.

    Each entry is stringified, any ``"nan"`` text is removed, and the
    remainder is parsed with ``float``. Entries that do not parse (including
    the empty string left behind by a missing value) are skipped.

    Parameters
    ----------
    time : iterable
        Values collected for one user.

    Returns
    -------
    float or int
        Sum of the parsed values; ``0`` when nothing could be parsed.
    """

    def _parse(raw):
        # None marks "not a number" so that the caller can filter it out.
        try:
            return float(re.sub("nan", "", str(raw)))
        except ValueError:
            return None

    parsed = (_parse(raw) for raw in time)
    return sum(seconds for seconds in parsed if seconds is not None)

In [None]:
sessions_df['secs_elapsed'] = sessions_df['secs_elapsed'].apply(convert_the_time)

In [None]:
sessions_df.head()

In [None]:
sessions_df.shape

## Merge the two tables

In [None]:
# Join train and session df (left join: every train user is kept, with NaN in
# the session columns when the user has no session rows).

train_merge = train_df.merge(sessions_df, left_on='id', right_on='user_id', how='left')

print("Train  :",train_df.shape)

print("Session:",sessions_df.shape)

print("Merge  :",train_merge.shape)

# The merged row count equals the train row count after a left join, so count
# the rows that actually matched a session record (non-null user_id) instead.
n_train_with_sessions = int(train_merge["user_id"].notna().sum())

print("No of users in Train Data with session info:", n_train_with_sessions)

print("{} / {} = {}".format(n_train_with_sessions, train_df.shape[0],
                            np.round(n_train_with_sessions / train_df.shape[0], 2)))

In [None]:
train_merge.isnull().sum()

In [None]:
train_merge.tail()

### Read Test data

In [None]:
test_df  = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip')
id_test = test_df['id']


In [None]:
test_df.age

In [None]:
test_df.shape

In [None]:
print("Train data columns : ", train_df.columns)
print("Test data columns : ", test_df.columns)

In [None]:
test_df.isnull().sum()

In [None]:
# Inner join used as a coverage check: only test users that have session rows
# survive, so the printed ratio is the share of test users with session data.
test_merge = test_df.merge(sessions_df, left_on= "id", right_on="user_id", how = "inner")
print("test:", test_df.shape)
print("sessions_df:", sessions_df.shape)
print("test_merge:", test_merge.shape)
print("{} / {} = {}".format(test_merge.shape[0],test_df.shape[0],np.round((test_merge.shape[0]/test_df.shape[0]),2)))

    About 1% of the test users have no session details.
    
    We will therefore use a left join, so that those users are kept.

In [None]:
# Left join: keep every test user, with NaN session columns where none exist.
test_merge = test_df.merge(sessions_df, left_on= "id", right_on="user_id", how = "left")
print("test:", test_df.shape)
print("sessions_df:", sessions_df.shape)
print("test_merge:", test_merge.shape)
# The merged row count equals the test row count after a left join, so report
# the share of rows that actually matched a session record (non-null user_id).
n_test_with_sessions = int(test_merge["user_id"].notna().sum())
print("{} / {} = {}".format(n_test_with_sessions, test_df.shape[0],
                            np.round(n_test_with_sessions / test_df.shape[0], 2)))

In [None]:
#Checking null values
test_merge.isnull().sum()

    With the left join, 428 test rows have no session information, i.e. 428 null values in the session columns (about 1%).

In [None]:
# Placeholder values for users that came out of the left join without session
# data, plus the constant 34.0 used for a missing age.
#
# The fill is done on the whole frame and assigned back. Calling
# `frame[col].fillna(..., inplace=True)` operates on an intermediate Series:
# it emits a chained-assignment warning on pandas 2.x and leaves the frame
# unchanged when Copy-on-Write is enabled.
MISSING_VALUE_FILL = {
    "user_id": "na",
    "action": "na",
    "action_type": "na",
    "action_detail": "na",
    "device_type": "na",
    "secs_elapsed": 0,
    "age": 34.0,
}

test_merge = test_merge.fillna(value=MISSING_VALUE_FILL)
train_merge = train_merge.fillna(value=MISSING_VALUE_FILL)


In [None]:
test_merge.isnull().sum()

In [None]:
test_merge.head()

In [None]:
# Parse date_account_created and derive weekday / month / year columns,
# applying the same steps to the train and the test frame.
for users in (train_merge, test_merge):
    created = pd.to_datetime(users["date_account_created"])
    users["date_account_created"] = created
    users["date_account_created_day"] = created.dt.weekday
    users["date_account_created_month"] = created.dt.month
    users["date_account_created_year"] = created.dt.year

In [None]:
def convert_timestamp_first_active(timestamp):
    """Turn a ``YYYYMMDDHHMMSS`` value into a ``datetime``.

    The value is stringified and cut at fixed offsets: four characters for the
    year, two each for month, day, hour and minute, and everything from
    offset 12 onwards for the seconds.

    Parameters
    ----------
    timestamp : int or str
        Timestamp whose string form follows the layout above.

    Returns
    -------
    datetime.datetime
    """
    digits = str(timestamp)
    # Slice boundaries for year, month, day, hour, minute; seconds are open-ended.
    starts = (0, 4, 6, 8, 10, 12)
    stops = (4, 6, 8, 10, 12, None)
    fields = [int(digits[lo:hi]) for lo, hi in zip(starts, stops)]
    return datetime(*fields)

In [None]:
# Parse timestamp_first_active and derive weekday / month / year / hour
# columns, applying the same steps to the train and the test frame.
for users in (train_merge, test_merge):
    first_active = pd.to_datetime(users["timestamp_first_active"].apply(convert_timestamp_first_active))
    users["timestamp_first_active"] = first_active
    users["timestamp_first_active_day"] = first_active.dt.weekday
    users["timestamp_first_active_month"] = first_active.dt.month
    users["timestamp_first_active_year"] = first_active.dt.year
    users["timestamp_first_active_hour"] = first_active.dt.hour

In [None]:
plt.hist(test_merge["age"])
plt.show()

In [None]:
def median_age(age):
    """Replace an implausible age with the constant 34.0.

    Ages below 15 or above 100 are swapped for 34.0; every other value
    (including one for which both comparisons are false) is returned as is.
    """
    implausible = age < 15.0 or age > 100.0
    return 34.0 if implausible else age
    

In [None]:
train_merge["age"] = train_merge["age"].apply(median_age)

test_merge["age"] = test_merge["age"].apply(median_age)

In [None]:
test_merge['age']

In [None]:
# Upper edges of the age buckets: 15, 20, ..., 105.
bins = list(range(15, 106, 5))


def make_age_buckets(age):
    """Return the index of the first edge in ``bins`` that is greater than ``age``.

    Returns ``None`` when no edge is greater than ``age`` (that is, when
    ``age`` is at least the last edge).
    """
    return next((idx for idx, upper in enumerate(bins) if age < upper), None)

In [None]:
# Replace the raw age with its bucket index in both frames.
train_merge['age'] = train_merge['age'].apply(make_age_buckets)
test_merge['age'] = test_merge['age'].apply(make_age_buckets)

In [None]:
train_merge.isnull().sum()

In [None]:
# Replace a missing first_affiliate_tracked with 'untracked' (the original
# note describes this as mode replacement).
# The result is assigned back to the column: an inplace fillna on the selected
# Series is chained assignment and does not update the frame under pandas
# Copy-on-Write.
train_merge["first_affiliate_tracked"] = train_merge["first_affiliate_tracked"].fillna('untracked')

test_merge["first_affiliate_tracked"] = test_merge["first_affiliate_tracked"].fillna('untracked')

In [None]:
train_merge.shape

In [None]:
test_merge.shape

In [None]:
# One-hot encode the categorical user columns. The train categories define the
# column set and order for both frames.

lst_ohe_train =[]
ohe = ['gender', 'signup_method', 'language', 'affiliate_channel',\
            'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']

for col in ohe:
    # dtype is pinned to uint8. Recent pandas versions return bool dummies,
    # and a frame mixing bool with numeric columns converts to an object
    # array, which scipy.sparse.hstack cannot consume in the stacking step.
    train_ohe = pd.get_dummies(train_merge[col], prefix=col, dtype=np.uint8)
    test_ohe = pd.get_dummies(test_merge[col], prefix=col, dtype=np.uint8)

    lst_ohe_train.append(train_ohe.columns)

    # Align test to train in one step: categories absent from test become
    # all-zero columns, test-only categories are dropped, and the column
    # order follows train.
    test_ohe = test_ohe.reindex(columns=train_ohe.columns, fill_value=0)

    # Swap the raw categorical column for its indicator columns.
    train_merge = pd.concat((train_merge.drop(columns=[col]), train_ohe), axis=1)
    test_merge = pd.concat((test_merge.drop(columns=[col]), test_ohe), axis=1)

In [None]:
# Multi-hot encode device_type: each cell is a comma-separated list of
# devices, so str.get_dummies yields one indicator column per device.

# pop() removes the raw column from the frame and hands it back for encoding.
train_ohe = train_merge.pop('device_type').str.get_dummies(sep=",")
test_ohe = test_merge.pop('device_type').str.get_dummies(sep=",")

lst_ohe_train.append(train_ohe.columns)

# Give test exactly the train columns, in train order; devices that never
# occur in test become all-zero columns.
test_ohe = test_ohe.reindex(columns=train_ohe.columns, fill_value=0)

train_merge = pd.concat((train_merge, train_ohe), axis=1)
test_merge = pd.concat((test_merge, test_ohe), axis=1)

In [None]:
y = train_merge['country_destination']

In [None]:
# Drop the columns that are not used as model inputs. The target column
# exists only in the train frame.
non_feature_cols = ['id', 'date_account_created', 'timestamp_first_active',
                    'date_first_booking', 'user_id']

train_merge = train_merge.drop(columns=non_feature_cols + ['country_destination'])

test_merge = test_merge.drop(columns=non_feature_cols)

In [None]:
print(train_merge.shape)
print(test_merge.shape)

In [None]:
print(train_merge.columns)

train_merge.head()

In [None]:
train_merge.isnull().sum()

In [None]:
test_merge.isnull().sum()

In [None]:
train_merge.age

## Tokenization and TF-IDF

In [None]:
def tokens(x):
    """Split a comma-separated string into its tokens (empty tokens are kept)."""
    separator = ','
    return x.split(separator)

In [None]:
# TF-IDF features for the comma-separated "action" strings.
# The vocabulary is learned on the train users and reused unchanged for test.

vectorizer_action = TfidfVectorizer(tokenizer=tokens, max_features=5000, min_df=10)

train_action_tfidf = vectorizer_action.fit_transform(train_merge["action"].values)
test_action_tfidf = vectorizer_action.transform(test_merge["action"].values)

for tfidf_matrix in (train_action_tfidf, test_action_tfidf):
    print(tfidf_matrix.shape)

In [None]:
# TF-IDF features for the comma-separated "action_type" strings.
# The vocabulary is learned on the train users and reused unchanged for test.

vectorizer_action_type = TfidfVectorizer(tokenizer=tokens, max_features=5000, min_df=10)

train_action_type_tfidf = vectorizer_action_type.fit_transform(train_merge["action_type"].values)
test_action_type_tfidf = vectorizer_action_type.transform(test_merge["action_type"].values)

for tfidf_matrix in (train_action_type_tfidf, test_action_type_tfidf):
    print(tfidf_matrix.shape)


In [None]:
# TF-IDF features for the comma-separated "action_detail" strings.
# The vocabulary is learned on the train users and reused unchanged for test.

vectorizer_action_detail = TfidfVectorizer(tokenizer=tokens, max_features=5000, min_df=10)

train_action_detail_tfidf = vectorizer_action_detail.fit_transform(train_merge["action_detail"].values)
test_action_detail_tfidf = vectorizer_action_detail.transform(test_merge["action_detail"].values)

for tfidf_matrix in (train_action_detail_tfidf, test_action_detail_tfidf):
    print(tfidf_matrix.shape)


In [None]:
# The raw action strings are now represented by their TF-IDF matrices, so
# remove them from both frames.
vectorized_text_cols = ['action', 'action_type', 'action_detail']

train_merge = train_merge.drop(columns=vectorized_text_cols)

test_merge = test_merge.drop(columns=vectorized_text_cols)

In [None]:
# Names of the remaining tabular feature columns, in frame order.
col_lst = list(train_merge.columns)

col_lst

### Data Stacking

In [None]:
# Combine the tabular features with the three TF-IDF matrices, column-wise.
# The frames are converted to float64 sparse matrices explicitly rather than
# handed to hstack as DataFrames: a leftover non-numeric column then fails
# here with a clear conversion error instead of inside scipy.
train_tabular = csr_matrix(train_merge.values.astype(np.float64))
test_tabular = csr_matrix(test_merge.values.astype(np.float64))

train_merge_tfidf = hstack((train_tabular, train_action_tfidf,train_action_type_tfidf, train_action_detail_tfidf)).tocsr()
test_merge_tfidf = hstack((test_tabular, test_action_tfidf,test_action_type_tfidf, test_action_detail_tfidf)).tocsr()


In [None]:
test_merge_tfidf.get_shape()

# Training

In [None]:
le = LabelEncoder()

y = le.fit_transform(y)

In [None]:
# https://www.kaggle.com/davidgasquez/ndcg-scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank ``k``.

    The items are ranked by descending ``y_score``; the relevance values of
    the first ``k`` ranked items are read from ``y_true``, turned into gains
    ``2**rel - 1`` and discounted by ``log2(position + 2)`` (positions start
    at 0).

    Parameters
    ----------
    y_true : array-like
        Relevance value of each item.
    y_score : array-like
        Predicted score of each item.
    k : int
        Number of top-ranked items to include.

    Returns
    -------
    score : float
    """
    top_ranked = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_ranked)

    positions = np.arange(len(relevance))
    return np.sum((2 ** relevance - 1) / np.log2(positions + 2))


def ndcg_score(ground_truth, predictions, k=5):

    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array-like, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array-like, shape = [n_samples, n_classes]
        Predicted probabilities. Nested lists are accepted and converted to
        an array.
    k : int
        Rank.

    Returns
    -------
    score : float
        Mean of the per-sample NDCG values. A sample whose ideal DCG is 0
        (its label was not encoded into any column) contributes 0.0.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """

    # Accept plain lists (as in the example above); .shape needs an array.
    predictions = np.asarray(predictions)

    lb = LabelBinarizer()
    # One class more than the number of prediction columns is registered.
    # NOTE(review): presumably this keeps LabelBinarizer from collapsing a
    # two-class problem into a single column - confirm.
    lb.fit(range(predictions.shape[1] + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        # Guard the division: an all-zero relevance row gives best == 0.
        score = float(actual) / float(best) if best else 0.0
        scores.append(score)

    return np.mean(scores)


# NDCG scorer object for model selection: wraps ndcg_score with k=5 and asks
# for probability outputs from the estimator.
# NOTE(review): `needs_proba` is reportedly deprecated in recent scikit-learn
# releases in favour of `response_method` - confirm against the installed version.
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

In [None]:
# param_grid = {
# 'max_depth': sp_randint(3, 20),
# 'learning_rate': [0.001, 0.01, 0.1, 0.2],
# 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
# 'min_child_weight': [0.25,0.5, 1.0, 3.0, 5.0, 7.0],
# 'gamma': [0, 0.25, 0.3,0.35,0.45,0.5,0.6,0.8,1.0],
# 'reg_lambda': [0.1,0.2,0.4,0.5,0.6,0.8,1.0,10.0],
# 'n_estimators':[100,200,500,1000,2000],
# 'colsample_bytree':[0.1,0.3,0.5,1],
# 'colsample_bylevel':[0.1,0.3,0.5,1]
# }


# gb = xgb.XGBClassifier(objective='multi:softmax',eval_metric= 'mlogloss')

# clf = RandomizedSearchCV(gb, param_grid,n_jobs=-1,verbose=10,scoring=ndcg_scorer, random_state=42)

In [None]:
# clf.fit(train_merge_tfidf,y)

In [None]:
# Gradient-boosted tree classifier for the multi-class destination target.
# The objective is 'multi:softprob' because the notebook ranks classes from
# predict_proba output; 'multi:softmax' is the hard-label objective, and
# whether predict_proba accepts it depends on the installed xgboost version.
gb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='mlogloss', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, n_estimators=100, n_jobs=-1, objective='multi:softprob', random_state=0,
                       reg_lambda=1, subsample=1)

In [None]:
# X_train, X_test, y_train,y_test = train_test_split(train_merge_tfidf,y, test_size =0.25 , random_state = 42)

In [None]:
gb.fit(train_merge_tfidf,y)

In [None]:
pred_y = gb.predict_proba(test_merge_tfidf)


In [None]:
pred_y


In [None]:
# Take the 5 classes with the highest predicted probability for every test user.
# Done as array operations: one argsort over all rows and one
# inverse_transform call, instead of a Python loop that decoded labels row by
# row and looked ids up by label (`id_test[i]`), which is only correct while
# id_test carries a default 0..n-1 index.
N_TOP = 5

# Column indices sorted by descending probability, truncated to the top N_TOP.
top_classes = np.argsort(pred_y, axis=1)[:, ::-1][:, :N_TOP]

# Each id is repeated once per predicted country, in test order.
ids = np.repeat(id_test.to_numpy(), top_classes.shape[1]).tolist()  #list of ids
cts = le.inverse_transform(top_classes.ravel()).tolist()  #list of countries

In [None]:
# Build the submission table (one row per id / predicted country pair) and
# write it to sub.csv without the index column.
submission_rows = np.column_stack((ids, cts))
sub = pd.DataFrame(submission_rows, columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)

In [None]:
sub.head(20)