In [46]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
import matplotlib as plt
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, roc_auc_score, log_loss, brier_score_loss


In [18]:
df = pd.read_csv('oneday_df.csv')

In [19]:
sparse = ['hour', 'imp_content_position', 'imp_page_id', 'imp_ref_page_id', 
          'imp_ref_source', 'user_sex', 'user_age', 'content_category_id']
dense = ['user_following_cnt', 'user_bunpay_count', 'owner_grade', 'owner_item_count',
         'owner_interest', 'owner_follower_cnt', 'owner_bunpay_count', 'content_price',
        'content_emergency_cnt', 'content_comment_cnt', 'content_interest', 'content_pfavcnt']
features = sparse + dense

In [20]:
for s in sparse:
    df[s] = df[s].fillna('-1')
    df[s] = preprocessing.LabelEncoder().fit_transform(df[s])
for d in dense:
    df[d] = df[d].fillna(0)

df[dense] = preprocessing.MinMaxScaler(feature_range = (0, 1)).fit_transform(df[dense])

In [21]:
df = df[features + ['label']]

In [25]:
train, test = train_test_split(df, test_size = 0.2)
train, valid = train_test_split(train, test_size = 0.2)
train_data = lgb.Dataset(train[features], label=train['label'], feature_name = features, categorical_feature=sparse)
validation_data = lgb.Dataset(valid[features], label=valid['label'], feature_name = features, categorical_feature=sparse)
test_data = lgb.Dataset(test[features], label=test['label'], feature_name = features, categorical_feature=sparse)

In [26]:
param = {
    'objective': 'binary', 
    'metric': ['auc', 'binary_logloss', 'cross_entropy']
}

In [27]:
num_round = 10
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])



[1]	valid_0's auc: 0.64885	valid_0's binary_logloss: 0.137312
[2]	valid_0's auc: 0.655093	valid_0's binary_logloss: 0.136474
[3]	valid_0's auc: 0.659406	valid_0's binary_logloss: 0.135845
[4]	valid_0's auc: 0.66185	valid_0's binary_logloss: 0.135341
[5]	valid_0's auc: 0.664085	valid_0's binary_logloss: 0.134906
[6]	valid_0's auc: 0.665591	valid_0's binary_logloss: 0.134557
[7]	valid_0's auc: 0.666216	valid_0's binary_logloss: 0.134262
[8]	valid_0's auc: 0.66741	valid_0's binary_logloss: 0.134013
[9]	valid_0's auc: 0.669117	valid_0's binary_logloss: 0.13379
[10]	valid_0's auc: 0.670158	valid_0's binary_logloss: 0.133607


In [36]:
pred = bst.predict(np.array(test[features]))

In [40]:
log_loss(test['label'], pred.tolist())

0.13502898991333825

In [45]:
accuracy_score(test['label'], pred.round())

0.9685331955533458

In [47]:
brier_score_loss(test['label'], pred.round())

0.031466804446654124

In [48]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

print(classification_report(test['label'], pred.round()))
print(confusion_matrix(test['label'], pred.round()))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98    596107
           1       0.00      0.00      0.00     19367

    accuracy                           0.97    615474
   macro avg       0.48      0.50      0.49    615474
weighted avg       0.94      0.97      0.95    615474

[[596107      0]
 [ 19367      0]]


## XGBoost

In [41]:
xgb = XGBClassifier(objective='binary:logistic',
                   n_jobs=-1,
                   random_state=42)
xgb.fit(train[features], train['label'])
xgb_pred = xgb.predict(test[features])
xgb_proba = xgb.predict_proba(test[features])
proba = []
for p in xgb_proba:
    proba.append(p[1])

In [43]:
print(log_loss(test['label'], proba))
print(accuracy_score(test['label'], xgb_pred))

0.13009098224921267
0.9685266964973338
