In [61]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
import matplotlib as plt
import matplotlib.pylab as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, roc_auc_score, log_loss, brier_score_loss


In [2]:
# 2020년 8월 20일 하루 데이터
df = pd.read_csv('oneday_df.csv')

In [3]:
df.shape

(3077366, 26)

In [4]:
df.columns

Index(['label', 'day', 'hour', 'imp_content_id', 'imp_content_position',
       'imp_page_id', 'imp_ref_page_id', 'imp_ref_term', 'imp_ref_source',
       'imp_user_id', 'imp_content_owner', 'user_sex', 'user_age',
       'user_following_cnt', 'user_bunpay_count', 'owner_grade',
       'owner_item_count', 'owner_interest', 'owner_follower_cnt',
       'owner_bunpay_count', 'content_price', 'content_category_id',
       'content_emergency_cnt', 'content_comment_cnt', 'content_interest',
       'content_pfavcnt'],
      dtype='object')

In [9]:
print('percentage of label 1 in total data:', round(df[df['label'] == 1].shape[0]/df.shape[0]*100, 2), '%')
# print('percentage of label 1 in train set:', )
# print('percentage of label 1 in test set:', )

percentage of label 1 in total data: 3.14 %


In [10]:
sparse = ['hour', 'imp_content_position', 'imp_page_id', 'imp_ref_page_id', 
          'imp_ref_source', 'user_sex', 'user_age', 'content_category_id']
dense = ['user_following_cnt', 'user_bunpay_count', 'owner_grade', 'owner_item_count',
         'owner_interest', 'owner_follower_cnt', 'owner_bunpay_count', 'content_price',
        'content_emergency_cnt', 'content_comment_cnt', 'content_interest', 'content_pfavcnt']
features = sparse + dense

In [13]:
data = df[features + ['label']]
for s in sparse:
    data[s] = data[s].fillna('-1')
    data[s] = preprocessing.LabelEncoder().fit_transform(data[s])
for d in dense:
    data[d] = data[d].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
print('total data size:', len(data))
print('size of data with label 1:', len(data[data['label'] == 1]))
print('size of data with label 0:', len(data[data['label'] == 0]))

total data size: 3077366
size of data with label 1: 96594
size of data with label 0: 2980772


In [26]:
click = data[data['label'] == 1]
non_click = data[data['label'] == 0]

In [58]:
norm_click = norm_data[norm_data['label'] == 1]
norm_non_click = norm_data[norm_data['label'] == 0]

### label 1 : label 0 = 1 : 1 로 샘플링

original data

In [27]:
train = pd.concat([click.sample(n=76800), non_click.sample(n=76800)])
test = pd.concat([click.sample(n=19200), non_click.sample(n=19200)])
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train.iloc[:, -1]
y_test = test.iloc[:, -1]

print('proportion of label 1 in train set:', len([i for i in y_train.tolist() if i == 1])/len(y_train))
print('proportion of label 0 in test set:', len([i for i in y_test.tolist() if i == 1])/len(y_test))

proportion of label 1 in train set: 0.5
proportion of label 0 in test set: 0.5


In [28]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 0.5058072916666667
log loss: 14.077414993790377
log loss between label and click probability: 0.6802879074783329
accuracy: 0.592421875


normalized data

In [59]:
train = pd.concat([norm_click.sample(n=76800), norm_non_click.sample(n=76800)])
test = pd.concat([norm_click.sample(n=19200), norm_non_click.sample(n=19200)])
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train.iloc[:, -1]
y_test = test.iloc[:, -1]

print('proportion of label 1 in train set:', len([i for i in y_train.tolist() if i == 1])/len(y_train))
print('proportion of label 0 in test set:', len([i for i in y_test.tolist() if i == 1])/len(y_test))

proportion of label 1 in train set: 0.5
proportion of label 0 in test set: 0.5


In [60]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 0.5646354166666666
log loss: 14.689969216007997
log loss between label and click probability: 0.677226462114652
accuracy: 0.5746875


### label 1 : label 0 = 1 : 2 로 샘플링

In [32]:
train = pd.concat([click.sample(n=76800), non_click.sample(n=153600)])
test = pd.concat([click.sample(n=19200), non_click.sample(n=38400)])
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train.iloc[:, -1]
y_test = test.iloc[:, -1]

print('proportion of label 1 in train set:', len([i for i in y_train.tolist() if i == 1])/len(y_train))
print('proportion of label 0 in test set:', len([i for i in y_test.tolist() if i == 1])/len(y_test))

proportion of label 1 in train set: 0.3333333333333333
proportion of label 0 in test set: 0.3333333333333333


In [33]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 0.020885416666666667
log loss: 11.596283563039117
log loss between label and click probability: 0.6413575179559375
accuracy: 0.6642534722222222


### label 1 : label 0 = 1 : 4 로 샘플링

In [34]:
train = pd.concat([click.sample(n=76800), non_click.sample(n=307200)])
test = pd.concat([click.sample(n=19200), non_click.sample(n=76800)])
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train.iloc[:, -1]
y_test = test.iloc[:, -1]

print('proportion of label 1 in train set:', len([i for i in y_train.tolist() if i == 1])/len(y_train))
print('proportion of label 0 in test set:', len([i for i in y_test.tolist() if i == 1])/len(y_test))

proportion of label 1 in train set: 0.2
proportion of label 0 in test set: 0.2


In [35]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 0.00109375
log loss: 6.925385087444144
log loss between label and click probability: 0.5386990550663094
accuracy: 0.7994895833333333


### label 1 : label 0 = 1 : 9 로 샘플링

In [36]:
train = pd.concat([click.sample(n=76800), non_click.sample(n=691200)])
test = pd.concat([click.sample(n=19200), non_click.sample(n=172800)])
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train.iloc[:, -1]
y_test = test.iloc[:, -1]

print('proportion of label 1 in train set:', len([i for i in y_train.tolist() if i == 1])/len(y_train))
print('proportion of label 0 in test set:', len([i for i in y_test.tolist() if i == 1])/len(y_test))

proportion of label 1 in train set: 0.1
proportion of label 0 in test set: 0.1


In [37]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 0.00025
log loss: 3.4614331842331025
log loss between label and click probability: 0.38651071487477195
accuracy: 0.89978125


### label 1 : label 0 = 1 : 32 로 샘플링 -> CTR 약 3%

In [38]:
train = pd.concat([click.sample(n=76800), non_click.sample(n=2457600)])
test = pd.concat([click.sample(n=19200), non_click.sample(n=614400)])
x_train = train.iloc[:, :-1]
x_test = test.iloc[:, :-1]
y_train = train.iloc[:, -1]
y_test = test.iloc[:, -1]

print('proportion of label 1 in train set:', len([i for i in y_train.tolist() if i == 1])/len(y_train))
print('proportion of label 0 in test set:', len([i for i in y_test.tolist() if i == 1])/len(y_test))

proportion of label 1 in train set: 0.030303030303030304
proportion of label 0 in test set: 0.030303030303030304


In [39]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 6.470959595959596e-05
log loss: 1.0484285290221316
log loss between label and click probability: 0.18825616940269835
accuracy: 0.9696448863636363


In [40]:
coef = lr.coef_[0].tolist()
coef_dict = {}
for i in range(len(coef)):    
    coef_dict[x_train.columns[i]] = [coef[i]]
coeff = pd.DataFrame.from_dict(coef_dict).T
coeff.columns = ['coefficient']
absol = coeff['coefficient'].abs()
coeff['abs'] = absol
coeff.sort_values('abs', ascending=False)

Unnamed: 0,coefficient,abs
imp_content_position,-0.02050855,0.02050855
content_category_id,-0.01382919,0.01382919
user_age,-0.004274175,0.004274175
user_following_cnt,-0.003751461,0.003751461
hour,-0.002764533,0.002764533
content_pfavcnt,-0.002413813,0.002413813
owner_bunpay_count,-0.001559058,0.001559058
content_comment_cnt,0.0006977406,0.0006977406
owner_item_count,-0.000340734,0.000340734
user_sex,-0.0003314859,0.0003314859


### Simply split train:test = 8:2

In [46]:
x = data.iloc[:, :-1]
y = data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [47]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 6.174103211508528e-05
log loss: 1.0904164817922686
log loss between label and click probability: 0.22365249442976656
accuracy: 0.968429210657152


In [48]:
coef = lr.coef_[0].tolist()
coef_dict = {}
for i in range(len(coef)):    
    coef_dict[x_train.columns[i]] = [coef[i]]
coeff = pd.DataFrame.from_dict(coef_dict).T
coeff.columns = ['coefficient']
absol = coeff['coefficient'].abs()
coeff['abs'] = absol
coeff.sort_values('abs', ascending=False)

Unnamed: 0,coefficient,abs
imp_content_position,-0.010143,0.010143
content_category_id,-0.006765,0.006765
user_age,-0.002084,0.002084
user_following_cnt,-0.00184,0.00184
hour,-0.001345,0.001345
content_pfavcnt,-0.001315,0.001315
owner_bunpay_count,-0.00079,0.00079
content_comment_cnt,0.000541,0.000541
owner_grade,-0.000346,0.000346
user_sex,-0.000161,0.000161


### Small dataset

In [49]:
x = data.iloc[:, :-1].sample(n=10000)
y = data.iloc[:, -1].sample(n=10000)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [50]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 0.002
log loss: 1.2779363258065564
log loss between label and click probability: 0.2728452929771814
accuracy: 0.963


In [51]:
coef = lr.coef_[0].tolist()
coef_dict = {}
for i in range(len(coef)):    
    coef_dict[x_train.columns[i]] = [coef[i]]
coeff = pd.DataFrame.from_dict(coef_dict).T
coeff.columns = ['coefficient']
absol = coeff['coefficient'].abs()
coeff['abs'] = absol
coeff.sort_values('abs', ascending=False)

Unnamed: 0,coefficient,abs
imp_content_position,-0.01409138,0.01409138
content_category_id,-0.01248629,0.01248629
user_age,-0.003686961,0.003686961
user_following_cnt,-0.003146883,0.003146883
hour,-0.002380446,0.002380446
owner_bunpay_count,-0.001700928,0.001700928
content_pfavcnt,-0.001588471,0.001588471
owner_item_count,-0.0007876637,0.0007876637
owner_grade,-0.0003151154,0.0003151154
user_sex,-0.0002918481,0.0002918481


In [52]:
print('label 1 proportion of train set:', len([i for i in y_train if i == 1])/len(y_train))
print('label 1 proportion of test set:', len([i for i in y_test if i == 1])/len(y_test))

label 1 proportion of train set: 0.03125
label 1 proportion of test set: 0.035


### Test with normalized data

In [63]:
from sklearn.preprocessing import MinMaxScaler

norm_data = data.copy()
norm_data[dense] = MinMaxScaler(feature_range = (0, 1)).fit_transform(norm_data[dense])


In [64]:
x = norm_data.iloc[:, :-1]
y = norm_data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [65]:
lr = LogisticRegression(class_weight=None, n_jobs=-1, random_state=42)
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
proba = lr.predict_proba(x_test)
click_proba = []
for i in proba:
    click_proba.append(i[1])
print('label 1 proportion of prediction:', len([i for i in pred if i == 1])/len(pred))
print('log loss:', log_loss(y_test.astype(int), pred.astype(int)))
print('log loss between label and click probability:', log_loss(y_test, click_proba))
print('accuracy:', accuracy_score(y_test.astype(int), pred.astype(int)))

label 1 proportion of prediction: 0.0
log loss: 1.0837384678126372
log loss between label and click probability: 0.13893841774917534
accuracy: 0.9686225575735125


In [56]:
coef = lr.coef_[0].tolist()
coef_dict = {}
for i in range(len(coef)):    
    coef_dict[x_train.columns[i]] = [coef[i]]
coeff = pd.DataFrame.from_dict(coef_dict).T
coeff.columns = ['coefficient']
absol = coeff['coefficient'].abs()
coeff['abs'] = absol
coeff.sort_values('abs', ascending=False)

Unnamed: 0,coefficient,abs
imp_ref_page_id,-0.557074,0.557074
content_price,0.275182,0.275182
imp_ref_source,-0.240636,0.240636
owner_item_count,-0.161769,0.161769
user_sex,0.131454,0.131454
content_comment_cnt,-0.085208,0.085208
content_interest,-0.054906,0.054906
content_emergency_cnt,-0.052654,0.052654
imp_page_id,-0.04517,0.04517
owner_interest,-0.0387,0.0387


In [67]:
print('auc:', roc_auc_score(y_test.astype(int), pred.astype(int)))

auc: 0.5
