In [1]:
import re
import gc
from datetime import datetime

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from IPython.core.display import Image 
from IPython.display import display
from scipy.stats import mode
import scipy.stats as stats

from sklearn.tree import export_graphviz
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder
import joblib
from sklearn import metrics
from sklearn.metrics import make_scorer

import lightgbm as lgb
from lightgbm import plot_importance
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn import linear_model

In [2]:
# Load the data into DataFrames
df_train = pd.read_csv('/content/drive/MyDrive/台新/train_users_2.csv')
df_test = pd.read_csv('/content/drive/MyDrive/台新/test_users.csv')
df_sessions = pd.read_csv('/content/drive/MyDrive/台新/sessions.csv')
df_age_bkts = pd.read_csv('/content/drive/MyDrive/台新/age_gender_bkts.csv')
df_countries = pd.read_csv('/content/drive/MyDrive/台新/countries.csv')

In [3]:
df_all = pd.concat([df_train, df_test], axis=0)
train_id = df_train.id
test_id = df_test.id
all_id = df_all.id

In [4]:
#date account create
df_all["date_account_created"] = pd.to_datetime(df_all["date_account_created"], format = "%Y-%m-%d")

#timestamp first active
df_all["timestamp_first_active"] = pd.to_datetime(df_all["timestamp_first_active"], format="%Y%m%d%H%M%S")

df_all['create_year'] = df_all["date_account_created"].apply(lambda x : x.year)
df_all['create_month'] = df_all["date_account_created"].apply(lambda x : x.month)
df_all['create_day'] = df_all["date_account_created"].apply(lambda x : x.day)

df_all['active_year'] = df_all["timestamp_first_active"].apply(lambda x : x.year)
df_all['active_month'] = df_all["timestamp_first_active"].apply(lambda x : x.month)
df_all['active_day'] = df_all["timestamp_first_active"].apply(lambda x : x.day)

In [5]:
lagging = df_all["timestamp_first_active"] - df_all["date_account_created"]

#lagging time days, log seconds
df_all["lag_days"] = lagging.apply(lambda x : -1 * x.days)
df_all["lag_seconds"] = np.log(lagging.apply(lambda x : x.seconds))

In [6]:
def get_holidays(year):
    response = requests.get("https://www.timeanddate.com/calendar/custom.html?year="+str(year)+"                                &country=1&cols=3&df=1&hol=25")
    dom = BeautifulSoup(response.content, "html.parser")

    trs = dom.select("table.cht.lpad tr")

    df = pd.DataFrame(columns=["date", "holiday"])
    for tr in trs:
        datestr = tr.select_one("td:nth-of-type(1)").text
        date = datetime.strptime("{} {}".format(year, datestr), '%Y %b %d')
        holiday = tr.select_one("td:nth-of-type(2)").text
        df.loc[len(df)] = {"date" : date, "holiday": 1}
    return df

holiday_ls = []
for year in range(2009, 2015):
    df = get_holidays(year)
    holiday_ls.append(df)
    holiday_df = pd.concat(holiday_ls)

In [7]:
select_date = list(holiday_df["date"].astype("str"))
holiday = df_all.timestamp_first_active.apply(lambda x : str(x.date())).isin(select_date)

df_all["holiday"] = holiday
df_all['holiday'] = 1 * (df_all.holiday == True)

In [8]:
weekday = df_all.filter(items=['id','timestamp_first_active'])
weekday = pd.to_datetime(weekday["timestamp_first_active"], format="%Y-%m-%d")
weekday = weekday.dt.dayofweek

df_all["weekend"] = weekday.apply(lambda x : 1 if x>=5 else 0)

In [9]:
df_all = df_all.drop("date_account_created" , axis=1)
df_all = df_all.drop("timestamp_first_active" , axis=1)

In [10]:
checklist = (df_all['age'] < 120) & (df_all['gender'] != '-unknown-')

df_all['faithless_sign'] = checklist.apply(lambda x : 0 if x == True else 1)

In [11]:
df_all = df_all.drop("date_first_booking", axis=1)

In [12]:
df_all.first_affiliate_tracked.mode()
df_all["first_affiliate_tracked"] = df_all["first_affiliate_tracked"].replace(np.nan, "untracked")

In [13]:
df_age = df_all.filter(items = ['age', 'country_destination','id', 'gender'])
df_dummy = df_all.filter(items = ['affiliate_channel', 'affiliate_provider',
                  'first_affiliate_tracked', 'first_browser', 'first_device_type',
                  'language', 'signup_app', 'signup_flow', 'signup_method', 
                  'create_year', 'create_month', 'create_day', 
                  'active_year', 'active_month', 'active_day', 'lag_days', 'lag_seconds', 
                  'holiday', 'weekend'])
    
df_dummy = pd.get_dummies(df_dummy)
df_all = pd.concat([df_age, df_dummy], axis=1)

In [14]:
#divide train / test by null age data
age_train = df_all[df_all["age"].notnull()].reset_index(drop=True)
age_test = df_all[df_all["age"].isnull()].reset_index(drop=True)

#divide 5 cluster age data
bins = [0, 15, 25, 35, 60, 9999]
labels = ["underage", "tweenty", "thirty", "mid_old", "old"]
cats = pd.cut(age_train['age'], bins, labels=labels)
cats = pd.DataFrame(cats)

age_train_id = age_train.id
age_test_id = age_test.id

age_train = age_train.drop(['id', 'age', 'country_destination', 'gender'], axis=1)
age_test = age_test.drop(['id', 'age', 'country_destination', 'gender'], axis=1)

In [15]:
X = age_train
y = cats

#model recall rate is so low, but it gives better cross validation score for final prediction model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
    
model_age = lgb.LGBMClassifier(boosting_type='gbdt', n_jobs=-1, reg_alpha=0.5, reg_lambda=0.5).fit(X_train, y_train)
pred_age = model_age.predict(X_test)

print(classification_report(y_test, pred_age))

              precision    recall  f1-score   support

     mid_old       0.48      0.38      0.42     11150
         old       0.46      0.00      0.01      2044
      thirty       0.49      0.78      0.60     14036
     tweenty       0.47      0.04      0.07      4490
    underage       0.56      0.29      0.38        17

    accuracy                           0.49     31737
   macro avg       0.49      0.30      0.30     31737
weighted avg       0.48      0.49      0.43     31737



In [16]:
#prediction age
pred_age = model_age.predict(age_test)
pred_age = pd.DataFrame(pred_age, columns=['age'])
pred_age = pd.concat([pred_age, age_test_id], axis=1)
pred_age["age"] = pred_age["age"].replace({'underage':15, "tweenty" : 25, "thirty" : 35, 'mid_old' : 45, 'old' : 60})

#original age
origin_age = y
origin_age = pd.DataFrame(origin_age, columns=['age'])
origin_age = pd.concat([origin_age, age_train_id], axis=1)
origin_age["age"] = origin_age["age"].replace({'underage':15, "tweenty" : 25, "thirty" : 35, 'mid_old' : 45, 'old' : 60})

#concat original age and prediction age
age = pd.concat([origin_age, pred_age], axis=0)
print('age lenght check :', len(age))
age.head()

age lenght check : 275547


Unnamed: 0,age,id
0,45,820tgsjxq7
1,45,4ft3gnwmtx
2,45,bjjt8pjhuk
3,45,87mebub9p4
4,45,lsw9q7uk0j


In [17]:
df_all = df_all.drop("age" , axis=1)

df_all = pd.merge(df_all, age, on="id", how="left")

In [18]:
df_all["gender"] = df_all["gender"].replace(['-unknown-', 'OTHER'], np.nan)

gender_train = df_all[df_all["gender"].notnull()].reset_index()
gender_test = df_all[df_all["gender"].isnull()].reset_index()

y = gender_train.gender

gender_train_id = gender_train.id
gender_test_id = gender_test.id

gender_train = gender_train.drop(['id', 'age', 'country_destination', 'gender'], axis=1)
gender_test = gender_test.drop(['id', 'age', 'country_destination', 'gender'], axis=1)

X = gender_train

In [19]:
#model recall rate is so low, but it gives better cross validation score for final prediction model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# X_train, X_test, y_train = gender_train, gender_test, y
    
model_age = lgb.LGBMClassifier(n_estimators=500, n_jobs=-1, reg_alpha=1).fit(X_train, y_train)
pred_age = model_age.predict(X_test)

print(classification_report(y_test, pred_age))

              precision    recall  f1-score   support

      FEMALE       0.57      0.67      0.62     15550
        MALE       0.53      0.43      0.48     13597

    accuracy                           0.56     29147
   macro avg       0.55      0.55      0.55     29147
weighted avg       0.56      0.56      0.55     29147



In [20]:
pred_gender = model_age.predict(gender_test)
pred_gender = pd.DataFrame(pred_gender)

#prediction age
pred_gender = model_age.predict(gender_test)
pred_gender = pd.DataFrame(pred_gender, columns=['gender'])
pred_gender = pd.concat([pred_gender, gender_test_id], axis=1)

#original age
origin_gender = y
origin_gender = pd.DataFrame(origin_gender, columns=['gender'])
origin_gender = pd.concat([origin_gender, gender_train_id], axis=1)

#concat original age and prediction age
gender = pd.concat([origin_gender, pred_gender], axis=0)
print('gender lenght check :', len(gender))
gender.head()

gender lenght check : 275547


Unnamed: 0,gender,id
0,MALE,820tgsjxq7
1,FEMALE,4ft3gnwmtx
2,FEMALE,bjjt8pjhuk
3,FEMALE,lsw9q7uk0j
4,FEMALE,0d01nltbrs


In [21]:
df_all = df_all.drop("gender" , axis=1)
df_all = pd.merge(df_all, gender, on="id", how="left")

In [22]:
#split
new_df_train = df_all[:213451]
new_df_test = df_all[213451:]

**session data preprocessing**

In [23]:
df_sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [24]:
#Checking null values
df_sessions.isnull().sum()

user_id            34496
action             79626
action_type      1126204
action_detail    1126204
device_type            0
secs_elapsed      136031
dtype: int64

In [25]:
session_df_concat = df_sessions.groupby('user_id', as_index=False).agg(lambda x: x.tolist())

print(session_df_concat.shape)

session_df_concat.head()

(135483, 6)


Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,00023iyk9l,"[index, dashboard, header_userpic, dashboard, ...","[view, view, data, view, partner_callback, mes...","[view_search_results, dashboard, header_userpi...","[Mac Desktop, Mac Desktop, Mac Desktop, Mac De...","[20438.0, 787.0, 850.0, 934.0, nan, 129817.0, ..."
1,0010k6l0om,"[search_results, show, personalize, show, sear...","[click, view, data, nan, click, click, nan, da...","[view_search_results, p3, wishlist_content_upd...","[Mac Desktop, Mac Desktop, Mac Desktop, Mac De...","[1708.0, 21260.0, 1223.0, 26.0, 847.0, 1230.0,..."
2,001wyh0pz8,"[search, search, search, show, social_connecti...","[click, click, click, view, data, -unknown-, v...","[view_search_results, view_search_results, vie...","[Android App Unknown Phone/Tablet, Android App...","[622.0, 1813.0, 1507.0, 6327.0, 927.0, 142.0, ..."
3,0028jgx1x1,"[show, reviews, show, search, show, search, re...","[view, data, view, click, view, click, data, s...","[user_profile, listing_reviews, p3, view_searc...","[-unknown-, -unknown-, -unknown-, -unknown-, -...","[6162.0, 75.0, 86.0, 13710.0, 25217.0, 10989.0..."
4,002qnbzfs5,"[social_connections, payment_methods, create, ...","[data, -unknown-, -unknown-, view, data, data,...","[user_social_connections, -unknown-, -unknown-...","[iPhone, iPhone, iPhone, iPhone, iPhone, iPhon...","[17135.0, 711.0, 274.0, 179.0, 483.0, 1.0, 782..."


In [26]:
def list_to_string(lst):
    lst = [re.sub('nan', '', str(x)) for x in lst]
    return ','.join(lst)

def secs_sum(lst):
    lst = [re.sub('nan', '', str(x)) for x in lst]
    return sum([float(x) for x in lst if x != ''])

In [27]:
session_df_concat['action'] = session_df_concat['action'].apply(list_to_string)
session_df_concat['action_type'] = session_df_concat['action_type'].apply(list_to_string)
session_df_concat['action_detail'] = session_df_concat['action_detail'].apply(list_to_string)
session_df_concat['device_type'] = session_df_concat['device_type'].apply(list_to_string)
session_df_concat['secs_elapsed'] = session_df_concat['secs_elapsed'].apply(secs_sum)

In [28]:
train_merge = new_df_train.merge(session_df_concat, left_on='id', right_on='user_id', how='inner')
print("Train  :",new_df_train.shape)
print("Session:",session_df_concat.shape)
print("Merge  :",train_merge.shape)
print("No of users in Train Data with session info:",train_merge.shape[0])
print("{} / {} = {}".format(train_merge.shape[0],new_df_train.shape[0],np.round((train_merge.shape[0]/new_df_train.shape[0]),2)))

Train  : (213451, 146)
Session: (135483, 6)
Merge  : (73815, 152)
No of users in Train Data with session info: 73815
73815 / 213451 = 0.35


In [29]:
test_merge = new_df_test.merge(session_df_concat, left_on='id', right_on='user_id', how='left')
print("Test  :",new_df_test.shape)
print("Session:",session_df_concat.shape)
print("Merge  :",test_merge.shape)
print("No of users in Test Data with session info:",test_merge.shape[0])
print("{} / {} = {}".format(test_merge.shape[0],new_df_test.shape[0],np.round((test_merge.shape[0]/new_df_test.shape[0]),2)))

Test  : (62096, 146)
Session: (135483, 6)
Merge  : (62096, 152)
No of users in Test Data with session info: 62096
62096 / 62096 = 1.0


In [30]:
test_merge = test_merge.drop(['country_destination'],axis=1)
test_merge['user_id'].fillna('na' , inplace=True)
test_merge['action'].fillna('na' , inplace=True)
test_merge['action_type'].fillna('na' , inplace=True)
test_merge['action_detail'].fillna('na' , inplace=True)
test_merge['device_type'].fillna('na' , inplace=True)
test_merge['secs_elapsed'].fillna(0, inplace=True)

In [31]:
#labels
y = train_merge['country_destination']

train_merge.drop(['id','country_destination','user_id'],axis=1,inplace = True)
test_merge.drop(['id','user_id'],axis=1,inplace = True)

In [32]:
#OHE gender
one_hot_df = pd.get_dummies(test_merge['gender'], prefix='gender')
test_merge = test_merge.drop('gender', axis=1)
test_merge = pd.concat([test_merge, one_hot_df], axis=1)

# 將 gender 欄位 one-hot encoding
one_hot_df = pd.get_dummies(train_merge['gender'], prefix='gender')
train_merge = train_merge.drop('gender', axis=1)
train_merge = pd.concat([train_merge, one_hot_df], axis=1)


**TF-IDF for session data**

In [33]:
def tokens(x):        
    return x.split(',')

def TF_IDF(train, test, col):
  vectorizer = TfidfVectorizer(min_df=10,max_features=5000,tokenizer=tokens)
  vectorizer.fit(train[col].values)
  train_merge_tfidf = vectorizer.transform(train[col].values)
  test_merge_tfidf = vectorizer.transform(test[col].values)
  print("After vectorizations")
  print(train_merge_tfidf.shape)
  print(test_merge_tfidf.shape)
  print("="*100)

  return train_merge_tfidf, test_merge_tfidf

In [35]:
action_tfidf_train, action_tfidf_test = TF_IDF(train_merge, test_merge, 'action')
action_ty_tfidf_train, action_ty_tfidf_test = TF_IDF(train_merge, test_merge, 'action_type')
action_de_tfidf_train, action_de_tfidf_test = TF_IDF(train_merge, test_merge, 'action_detail')
device_tfidf_train, device_tfidf_test = TF_IDF(train_merge, test_merge, 'device_type')

After vectorizations
(73815, 256)
(62096, 256)
After vectorizations
(73815, 9)
(62096, 9)
After vectorizations
(73815, 122)
(62096, 122)
After vectorizations
(73815, 13)
(62096, 13)


In [36]:
# drop colmns
train_merge.drop(['action','action_type','action_detail','device_type'],axis=1,inplace = True)
test_merge.drop(['action','action_type','action_detail','device_type'],axis=1,inplace = True)

In [37]:
# data stacking
train_merge_tfidf = hstack((train_merge.astype(float), action_tfidf_train, action_ty_tfidf_train, action_de_tfidf_train, device_tfidf_train)).tocsr()
test_merge_tfidf = hstack((test_merge.astype(float), action_tfidf_test, action_ty_tfidf_test, action_de_tfidf_test, device_tfidf_test)).tocsr()

print("Final Data matrix")
print(train_merge_tfidf.shape)
print(test_merge_tfidf.shape)
print("="*100)

Final Data matrix
(73815, 546)
(62096, 546)


In [38]:
le = LabelEncoder()
y = le.fit_transform(y)
y

array([11,  7,  7, ...,  7,  7,  7])

In [39]:
def dcg_score(y_true, y_score, k=5):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

def ndcg_score(ground_truth, predictions, k=5):
    lb = LabelBinarizer()
    lb.fit(range(predictions.shape[1] + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

**modeling**

* 5-fold
* 比較時間與 ndcg

In [152]:
%%time
model = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
print(np.mean(cross_val_score(model, train_merge_tfidf, y, n_jobs=-1,scoring=ndcg_scorer)))

0.8135307249111973
CPU times: user 180 ms, sys: 2.19 s, total: 2.37 s
Wall time: 15.1 s


In [153]:
%%time
model = RandomForestClassifier()
print(np.mean(cross_val_score(model, train_merge_tfidf, y, n_jobs=-1,scoring=ndcg_scorer)))

0.8370888837330324
CPU times: user 1.36 s, sys: 200 ms, total: 1.56 s
Wall time: 4min 13s


In [154]:
%%time
model = xgb.XGBClassifier(objective='multi:softmax',eval_metric= 'mlogloss')
print(np.mean(cross_val_score(model, train_merge_tfidf, y, n_jobs=-1,scoring=ndcg_scorer)))

0.7945144561750981
CPU times: user 4.02 s, sys: 2.21 s, total: 6.22 s
Wall time: 13min 45s


In [155]:
%%time
model = lgb.LGBMClassifier(boosting_type= 'gbdt',nthread=3, n_jobs=-1, reg_alpha=1, reg_lambda=0, max_depth=-1, learning_rate=0.05, n_estimators=400)
print(np.mean(cross_val_score(model, train_merge_tfidf, y, n_jobs=-1,scoring=ndcg_scorer)))

0.7686071396726507
CPU times: user 871 ms, sys: 1.49 s, total: 2.36 s
Wall time: 2min 32s


**summit**

In [None]:
test_id = df_test['id'].values

In [41]:
def submit(train, test, target):    
    
    model = lgb.LGBMClassifier(boosting_type= 'gbdt',nthread=3, n_jobs=-1, reg_alpha=1, reg_lambda=0, max_depth=-1, learning_rate=0.05, n_estimators=400)
    print("model fitting starting ...")
    model.fit(train, target)
       
    print("model fitting completed ...")
    print()
    
    predic_proba = model.predict_proba(test)
    
    ids = []
    countries = []

    # Taking the 5 classes with highest probabilities

    for i in range(len(test_id)):
        idx = test_id[i]
        ids += [idx] * 5
        countries += le.inverse_transform(np.argsort(predic_proba[i])[::-1][:5]).tolist()
        
    # Generate submission

    sub = pd.DataFrame({"id" : ids,"country" : countries})

    sub.to_csv('/content/drive/MyDrive/台新/submission.csv', index = False)
    print("kaggle submission in process ...")
    
    gc.collect()

In [42]:
submit(train_merge_tfidf, test_merge_tfidf, y)

model fitting starting ...
model fitting completed ...

kaggle submission in process ...
