In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
test_users = pd.read_csv("test_users.csv")
train_users = pd.read_csv("train_users_2.csv")
all_users = pd.concat((train_users, test_users), axis = 0, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


### 填補缺失值修改類別特徵

In [3]:
all_users.isnull().sum()

affiliate_channel               0
affiliate_provider              0
age                        116866
country_destination         62096
date_account_created            0
date_first_booking         186639
first_affiliate_tracked      6085
first_browser                   0
first_device_type               0
gender                          0
id                              0
language                        0
signup_app                      0
signup_flow                     0
signup_method                   0
timestamp_first_active          0
dtype: int64

In [4]:
#刪除目標變數country_destination和過多缺失值的date_first_booking和Age
all_users = all_users.drop(['date_first_booking','country_destination','age'],axis=1)
#first_affiliate_tracked的untracked有143181

In [5]:
#查看每個類別當中的unknown
for i in all_users.columns:
    print(all_users[i].value_counts(dropna=False))
    print('======================================')
#unknown:gender有129480、first_browser有44394、lanuage:1

direct           181571
sem-brand         36439
sem-non-brand     20075
seo               14362
other              9547
api                8167
content            4118
remarketing        1268
Name: affiliate_channel, dtype: int64
direct                 181270
google                  65956
other                   13036
facebook                 3996
bing                     3719
craigslist               3475
padmapper                 836
vast                      830
yahoo                     653
facebook-open-graph       566
gsp                       455
meetup                    358
email-marketing           270
naver                      66
baidu                      32
yandex                     18
wayn                        8
daum                        3
Name: affiliate_provider, dtype: int64
2014-07-23    1105
2014-07-22    1052
2014-07-17     978
2014-07-24     923
2014-07-18     892
2014-07-21     888
2014-08-27     872
2014-08-26     841
2014-07-29     816
2014-08-06     790
2

In [6]:
#刪除在EDA已修改過的column
all_users['date_account_created'] = pd.to_datetime(all_users['date_account_created'])
all_users['dac_year'] = all_users.date_account_created.dt.year
all_users['dac_month'] = all_users.date_account_created.dt.month
all_users['dac_day'] = all_users.date_account_created.dt.day
all_users['date_first_active'] = pd.to_datetime(all_users.timestamp_first_active // 1000000, format = '%Y%m%d')
all_users['dfa_year'] = all_users.date_first_active.dt.year
all_users['dfa_month'] = all_users.date_first_active.dt.month
all_users['dfa_day'] = all_users.date_first_active.dt.day
all_users = all_users.drop(['timestamp_first_active','date_account_created','id','date_first_active'],axis=1)

In [7]:
#all_users.head()

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [9]:
all_users.affiliate_channel = le.fit_transform(all_users.affiliate_channel)
all_users.affiliate_provider = le.fit_transform(all_users.affiliate_provider)
all_users.first_device_type = le.fit_transform(all_users.first_device_type)
all_users.first_browser = le.fit_transform(all_users.first_browser)
all_users.signup_app = le.fit_transform(all_users.signup_app)
all_users.signup_method = le.fit_transform(all_users.signup_method)
all_users.gender = le.fit_transform(all_users.gender)

In [10]:
#all_users.head()

In [11]:
all_users.loc[all_users.language == '-unknown-', 'language'] = all_users.language.mode()[0]
all_users.language = le.fit_transform(all_users.language)

In [12]:
all_users.loc[all_users.first_affiliate_tracked.isnull(), 'first_affiliate_tracked'] = 'untracked'
all_users.first_affiliate_tracked = le.fit_transform(all_users.first_affiliate_tracked)

In [13]:
#all_users.head()

In [14]:
X_train = all_users[:train_users.shape[0]]#train的feature
X_test = all_users[train_users.shape[0]:]#上傳kaggle的
y_train = train_users.country_destination#train的target
y_train = le.fit_transform(y_train)
X_train.shape,X_test.shape,y_train.shape

((213451, 16), (62096, 16), (213451,))

In [15]:
from sklearn.model_selection import train_test_split

[train_data, test_data, train_target, test_target] = train_test_split(X_train, y_train, test_size = 0.3, random_state = 87)

In [16]:
#用PCA取出較重要的類別
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
train_data_pca = pca.fit_transform(train_data)
test_data_pca = pca.fit_transform(test_data)

In [17]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=100.0, random_state=87)
lr.fit(train_data_pca, train_target)
print('train_score:%.3f'%(lr.score(train_data_pca, train_target)))
print('test_score:%.3f'%(lr.score(test_data_pca, test_target)))



train_score:0.583
test_score:0.585


In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_data_pca, train_target)
print('train_score:%.3f'%(gnb.score(train_data_pca, train_target)))
print('test_score:%.3f'%(gnb.score(test_data_pca, test_target)))

train_score:0.583
test_score:0.585


In [19]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(max_depth = 4, n_estimators = 100, random_state = 87)
#gb.fit(train_data, train_target)
#print('train_score:%.5f'%(gb.score(train_data, train_target)))
#print('test_score:%.5f'%(gb.score(test_data, test_target)))
gb.fit(train_data_pca, train_target)
print('train_score:%.3f'%(gb.score(train_data_pca, train_target)))
print('test_score:%.3f'%(gb.score(test_data_pca, test_target)))

train_score:0.594
test_score:0.580


In [21]:
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=22,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=87)               
xgb.fit(train_data_pca, train_target)
print('train_score:%.3f'%(xgb.score(train_data_pca, train_target)))
print('test_score:%.3f'%(xgb.score(test_data_pca, test_target)))

train_score:0.586
test_score:0.586


發現xgboost的test score分數叫好解沒有overfitting現象，決定使用

In [30]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_test = pca.fit_transform(X_test)
X_train = pca.fit_transform(X_train)

In [31]:
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.6, gamma=0,
       learning_rate=0.3, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=22, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=87, silent=None,
       subsample=0.6, verbosity=1)

In [32]:
XBG_pred_proba = xgb.predict_proba(X_test)

In [35]:
test_ids = test_users['id']
ids = []  
cts = []  
for i in range(len(test_ids)):
    idx = test_ids[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(XBG_pred_proba[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('pca_submission.csv',index=False)