In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from IPython.display import display, HTML, clear_output

from sklearn import preprocessing, cross_validation
from sklearn import linear_model, ensemble, naive_bayes
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score as f1

pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
df_1 = pd.read_csv('input/training_dataset.csv')
df_2 = pd.read_csv('input/test_dataset.csv')

df_1_copy = df_1.copy()

Remove irrelevant fields, merge training and testing set into one dataset (for easier conversion of categorical variables).

In [3]:
Y = df_1['opened']
df_1.drop(['click_time', 'clicked', 'open_time', 'unsubscribe_time', 'unsubscribed', 'opened'], axis=1, inplace=True)

df = pd.concat([df_1.copy(), df_2.copy()]).reset_index(drop=True)

`mail_id` and `user_id` are encoded as strings and anonimized. To save space and in order to navigate through them easier I convert them to integers.

In [4]:
user_ids = df['user_id'].unique().tolist()
mail_ids = df['mail_id'].unique().tolist()

user_map = {v: k for k, v in enumerate(user_ids)}
mail_map = {v: k for k, v in enumerate(mail_ids)}

df['user_id'] = df['user_id'].apply(lambda x: user_map[x])
df['mail_id'] = df['mail_id'].apply(lambda x: mail_map[x])

There are a couple of users which appears frequently in the dataset. They received many emails and thus either opened or ignored them. I extracted open rate for a user.

In [5]:
gb = df_1_copy[['user_id', 'opened']].groupby('user_id')
user_info = (pd.concat([gb.sum() / gb.count(), gb.count()], axis=1)).reset_index()
user_info.columns = ['user_id', 'opened_rate', 'all_num']
user_info['user_id'] = user_info['user_id'].apply(lambda x: user_map[x])

A few fields had timestamp information. Convert them to timestamp (should have done it while reading). These timestamps are not really helpful, so I extracted information about user's age in the system at the time of sending and from the last time he has been seen. Both information is in days.

Also converted mail_type category to integer

In [6]:
timestamp_fields = ['sent_time', 'last_online', 'hacker_created_at']
for el in timestamp_fields:
    df[el] = df[el].astype('datetime64[s]')

df['age_sent'] = (df['sent_time'] - df['hacker_created_at']) / np.timedelta64(1, 'D')
df['last_seen'] = (df['sent_time'] - df['last_online']) / np.timedelta64(1, 'D')

df.drop(['mail_type'] + timestamp_fields, axis=1, inplace=True)

df['mail_category'] = pd.to_numeric(df['mail_category'].str.split('y_').str.get(-1))

Renamed a couple of fields and reordered them.

In [7]:
df.rename(columns={
    'contest_login_count'          : 'clc_all',
    'contest_login_count_1_days'   : 'clc_1',
    'contest_login_count_7_days'   : 'clc_7',
    'contest_login_count_30_days'  : 'clc_30',
    'contest_login_count_365_days' : 'clc_365',
    
    'contest_participation_count'           : 'cpc_all',
    'contest_participation_count_1_days'    : 'cpc_1',
    'contest_participation_count_7_days'    : 'cpc_7',
    'contest_participation_count_30_days'   : 'cpc_30',
    'contest_participation_count_365_days'  : 'cpc_365',
    
    'submissions_count'         : 'subm_all',
    'submissions_count_1_days'  : 'subm_1',
    'submissions_count_7_days'  : 'subm_7',
    'submissions_count_30_days' : 'subm_30',
    'submissions_count_365_days': 'subm_365',
    
    'submissions_count_contest'         : 'subm_c_all',
    'submissions_count_contest_1_days'  : 'subm_c_1',
    'submissions_count_contest_7_days'  : 'subm_c_7',
    'submissions_count_contest_30_days' : 'subm_c_30',
    'submissions_count_contest_365_days': 'subm_c_365',
    
    'submissions_count_master'         : 'subm_m_all',
    'submissions_count_master_1_days'  : 'subm_m_1',
    'submissions_count_master_7_days'  : 'subm_m_7',
    'submissions_count_master_30_days' : 'subm_m_30',
    'submissions_count_master_365_days': 'subm_m_365',
    
    'ipn_count'         : 'ipn_all',
    'ipn_count_1_days'  : 'ipn_1',
    'ipn_count_7_days'  : 'ipn_7',
    'ipn_count_30_days' : 'ipn_30',
    'ipn_count_365_days': 'ipn_365',
    
    'ipn_read'         : 'ipnr_all',
    'ipn_read_1_days'  : 'ipnr_1',
    'ipn_read_7_days'  : 'ipnr_7',
    'ipn_read_30_days' : 'ipnr_30',
    'ipn_read_365_days': 'ipnr_365',
    
    'forum_comments_count' : 'forum_reply',
    'forum_count'          : 'forum_cnt',
    'forum_expert_count'   : 'forum_exp',
    'forum_questions_count': 'forum_quest',
    'hacker_confirmation'  : 'confirmed',
    'hacker_timezone'      : 'timezone',
    'mail_category'        : 'mail_cat'
}, inplace=True)

df = df[[
    'user_id', 'age_sent', 'last_seen', 'mail_id', 'mail_cat', 'timezone', 'confirmed',
    
    'forum_reply', 'forum_cnt', 'forum_exp', 'forum_quest',
    
    'clc_all', 'clc_1', 'clc_7', 'clc_30', 'clc_365',
    'cpc_all', 'cpc_1', 'cpc_7', 'cpc_30', 'cpc_365',
    'subm_all', 'subm_1', 'subm_7', 'subm_30', 'subm_365',
    'subm_c_all', 'subm_c_1', 'subm_c_7', 'subm_c_30', 'subm_c_365',
    'subm_m_all', 'subm_m_1', 'subm_m_7', 'subm_m_30', 'subm_m_365',
    'ipn_all', 'ipn_1', 'ipn_7', 'ipn_30', 'ipn_365',
    'ipnr_all', 'ipnr_1', 'ipnr_7', 'ipnr_30', 'ipnr_365',
]]

The cardinality of the categorical variables was too high, so I tried to trim it. I also imputed NaN values

In [8]:
def truncate_categorical(field, num):
    tmp = df[field].value_counts()
    vals = set(tmp[tmp < num].index.values)

    df[field][df[field].isin(vals)] = -1
    return vals

In [9]:
truncate_categorical('mail_id', 200);
truncate_categorical('timezone', 400);

In [10]:
df['mail_cat'].fillna(18, inplace=True) # the least popular category
df['timezone'].fillna(-1, inplace=True) # group of least popular timezones
df['last_seen'].fillna(df['last_seen'].mean(), inplace=True) # mean value

Finally adding open rate which I calculated almost in the beginning of the notebook. Also calculate percentage of notification rates over some period.

In [11]:
df = df.reset_index().merge(user_info, on='user_id', how = 'left').sort_values('index').drop(['index'], 1)
df['opened_rate'].fillna(0.5, inplace=True)
df['has_opened_rate'] = ~df['all_num'].isnull()
df['all_num'].fillna(0, inplace=True)

In [12]:
df['ipn_all_percent'] = (df['ipnr_all'] / df['ipn_all']).fillna(0.5)
df['ipn_1_percent']   = (df['ipnr_1']   / df['ipn_1']).fillna(0.5)
df['ipn_7_percent']   = (df['ipnr_7']   / df['ipn_7']).fillna(0.5)
df['ipn_30_percent']  = (df['ipnr_30']  / df['ipn_30']).fillna(0.5)
df['ipn_365_percent'] = (df['ipnr_365'] / df['ipn_365']).fillna(0.5)

df.drop(['ipnr_all', 'ipn_all', 'ipnr_1', 'ipn_1', 'ipnr_7', 'ipn_7', 'ipnr_30', 'ipn_30', 'ipnr_365', 'ipn_365'], axis=1, inplace=True)

A lot of fields that follow powerlaw distribution and thus there is very small number of people with a huge numbers. I trim this data and sometimes apply log transofmation to some of the field. I tried various variants of these trimmers with no significant change.

In [13]:
df[df['age_sent'] >= df['age_sent'].quantile(0.9)] = df['age_sent'].quantile(0.98)
data_arr = [
    ('last_seen', 400, 500),
    ('forum_reply', 80, 100),
    ('forum_cnt', 40, 60),
    ('forum_exp', 5, 7),
    ('forum_quest', 5, 7),
    ('clc_all', 40, 60),
    ('clc_365', 40, 60),
    ('cpc_all', 100, 120),
    ('cpc_7', 20, 30),
    ('cpc_30', 50, 60),
    ('cpc_365', 100, 110),
    ('subm_1', 40, 60),
    ('subm_c_1', 40, 50),
    ('subm_m_1', 40, 50)
]
for name, max_val, cap_val in data_arr:
    df[df[name] >= max_val] = cap_val

def log_transform(x):
    return 0 if x == 0 else np.log2(x) + 1
    
for name in [
    'subm_all', 'subm_30', 'subm_365', 'subm_c_all', 'subm_c_7', 'subm_7', 'subm_c_365', 'subm_c_30',
    'subm_m_all', 'subm_m_7', 'subm_m_30', 'subm_m_365'
]:
    df[name] = df[name].apply(log_transform)

Finally creating dummies from categorical variables, creating X, Y matrices and scaling the features.

In [14]:
df_x = pd.get_dummies(df, columns=['mail_id', 'mail_cat', 'timezone'])

X_train = df_x.values[:len(Y), 1:]
y_train = Y.values

X_test  = df_x.values[len(Y):, 1:]

In [15]:
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test  = min_max_scaler.transform(X_test)

Knowing that F1 score does not take into account True Negatives, I played with class_weight scores. It actually gave reasonable imporovement. The number 730 was selected after I tried various combinations from 500 to 1000.

In [16]:
%%time
weight = 730
clf = linear_model.LogisticRegression(class_weight={1: weight, 0: 1000 - weight}, C=0.3)
clf.fit(X_train, y_train);

CPU times: user 2min 58s, sys: 3.46 s, total: 3min 1s
Wall time: 3min 6s


In [17]:
f1(y_train, clf.predict(X_train))

0.6930146493485192

Additionally changing some of the scores for poeple who almost never opened an email and people who almost always opened.

In [22]:
num_open = 15
epsilon = 0.2
set_user_open_0 = set(user_info[(user_info['opened_rate'] <= 0 + epsilon) & (user_info['all_num'] > num_open)]['user_id'].tolist())
set_user_open_1 = set(user_info[(user_info['opened_rate'] >= 1 - epsilon) & (user_info['all_num'] > num_open)]['user_id'].tolist())

In [23]:
def change_res(row):
    if row['user_id'] in set_user_open_0:
        return 0
    if row['user_id'] in set_user_open_1:
        return 1
    return row['result']

In [24]:
res_df = pd.concat([
    pd.DataFrame(clf.predict(X_test), columns=['result']),
    pd.DataFrame(df_x.loc[len(Y):]['user_id'].reset_index(drop=True))
], axis=1)

res_df['result']  = res_df['result'].astype(int)
res_df['user_id'] = res_df['user_id'].astype(int)
res_df['result'] = res_df.apply(change_res, axis=1)

In [25]:
res_df[['result']].to_csv("output_03.csv", index=False, header=False)