In [1]:
# Set up
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack

In [2]:
# Load data

datadir = "../data/final-data-files/"

ga_train = pd.read_csv(os.path.join(datadir, 'ga_train_final.csv'), index_col='device_id_enc')
ga_test = pd.read_csv(os.path.join(datadir, 'ga_test_final.csv'), index_col='device_id_enc')

pbdm = pd.read_csv(os.path.join(datadir, 'pbdm_final.csv'), index_col='device_id_enc')

app_id_sparse = pd.read_csv(os.path.join(datadir, 'app-data-sparse', 'app_id_sparse.csv'))

app_id_active_sparse = pd.read_csv(os.path.join(datadir, 'app-data-sparse', 'app_id_active_sparse.csv'))

category_sparse = pd.read_csv(os.path.join(datadir, 'app-data-sparse', 'category_sparse.csv'))

category_active_sparse = pd.read_csv(os.path.join(datadir, 'app-data-sparse', 'category_active_sparse.csv'))

app_id_encoder = pd.read_csv(os.path.join('../data/processed/app_id_encoder.csv'))
category_encoder = pd.read_csv(os.path.join('../data/processed/category_encoder.csv'))

In [3]:
# Add row numbers to ga_train
ga_train['train_row'] = np.arange(ga_train.shape[0])
ga_train['test_row'] = -1

# Add row numbers to ga_test
ga_test['train_row'] = -1
ga_test['test_row'] = np.arange(ga_test.shape[0])

# Create all_devices table that combines ga_train and ga_test so we can do preprocessing for both at the same time
all_devices = pd.concat([(ga_train[['train_row', 'test_row']]), ga_test])

In [4]:
# Merge all devices into each sparse table to add row numbers to sparse table
# Also separate into train and test
app_id_sparse = app_id_sparse.merge(all_devices, how='left', left_on='device_id_enc', right_index=True)
app_id_sparse_train = app_id_sparse[app_id_sparse['train_row']>=0]
app_id_sparse_test = app_id_sparse[app_id_sparse['test_row']>=0]

app_id_active_sparse = app_id_active_sparse.merge(all_devices, how='left', left_on='device_id_enc', right_index=True)
app_id_active_sparse_train = app_id_active_sparse[app_id_active_sparse['train_row']>=0]
app_id_active_sparse_test = app_id_active_sparse[app_id_active_sparse['test_row']>=0]

category_sparse = category_sparse.merge(all_devices, how='left', left_on='device_id_enc', right_index=True)
category_sparse_train = category_sparse[category_sparse['train_row']>=0]
category_sparse_test = category_sparse[category_sparse['test_row']>=0]

category_active_sparse = category_active_sparse.merge(all_devices, how='left', left_on='device_id_enc', right_index=True)
category_active_sparse_train = category_active_sparse[category_active_sparse['train_row']>=0]
category_active_sparse_test = category_active_sparse[category_active_sparse['test_row']>=0]

#### Sparse Matrix for PBDM

In [5]:
# Create sparse matrix for PBDM

# First merge in pbdm
all_devices_pbdm = all_devices.merge(pbdm, how='left', left_index=True, right_index=True)[['train_row', 'test_row', 'phone_brand_enc', 'device_model_enc']]

# Separate into train and test
pbdm_train = all_devices_pbdm[all_devices_pbdm['train_row']>=0]
pbdm_test = all_devices_pbdm[all_devices_pbdm['test_row']>=0]

# Sparse matrix on phone brand for train and test
Xtrain_pb = csr_matrix((np.ones(pbdm_train.shape[0]), (pbdm_train.train_row, pbdm_train.phone_brand_enc)))
Xtest_pb = csr_matrix((np.ones(pbdm_test.shape[0]), (pbdm_test.test_row, pbdm_test.phone_brand_enc)))

# Sparse matrix on device model for train and test
Xtrain_dm = csr_matrix((np.ones(pbdm_train.shape[0]), (pbdm_train.train_row, pbdm_train.device_model_enc)))
Xtest_dm = csr_matrix((np.ones(pbdm_test.shape[0]), (pbdm_test.test_row, pbdm_test.device_model_enc)))

#### Sparse Matrix for App ID and Category

In [6]:
# Sparse matrix on app ID for train and test
Xtrain_appid = csr_matrix((np.ones(app_id_sparse_train.shape[0]),
                           (app_id_sparse_train.train_row, app_id_sparse_train.app_id_enc)),
                          shape=(ga_train.shape[0],app_id_encoder.shape[0]+1))

Xtest_appid = csr_matrix((np.ones(app_id_sparse_test.shape[0]),
                         (app_id_sparse_test.test_row, app_id_sparse_test.app_id_enc)),
                        shape=(ga_test.shape[0],app_id_encoder.shape[0]+1))

# Sparse matrix on app ID active for train and test
Xtrain_appida = csr_matrix((np.ones(app_id_active_sparse_train.shape[0]),
                               (app_id_active_sparse_train.train_row, app_id_active_sparse_train.app_id_enc)),
                              shape=(ga_train.shape[0],app_id_encoder.shape[0]+1))

Xtest_appida = csr_matrix((np.ones(app_id_active_sparse_test.shape[0]),
                               (app_id_active_sparse_test.test_row, app_id_active_sparse_test.app_id_enc)),
                              shape=(ga_test.shape[0],app_id_encoder.shape[0]+1))

# Sparse matrix on category for train and test
Xtrain_category = csr_matrix((np.ones(category_sparse_train.shape[0]),
                               (category_sparse_train.train_row, category_sparse_train.category_enc)),
                              shape=(ga_train.shape[0],category_encoder.shape[0]+1))

Xtest_category = csr_matrix((np.ones(category_sparse_test.shape[0]),
                               (category_sparse_test.test_row, category_sparse_test.category_enc)),
                              shape=(ga_test.shape[0],category_encoder.shape[0]+1))

# Sparse matrix on category active for train and test
Xtrain_categorya = csr_matrix((np.ones(category_active_sparse_train.shape[0]),
                               (category_active_sparse_train.train_row, category_active_sparse_train.category_enc)),
                              shape=(ga_train.shape[0],category_encoder.shape[0]+1))

Xtest_categorya = csr_matrix((np.ones(category_active_sparse_test.shape[0]),
                               (category_active_sparse_test.test_row, category_active_sparse_test.category_enc)),
                              shape=(ga_test.shape[0],category_encoder.shape[0]+1))

#### Implementation

In [7]:
# Combine all sparse matrices
Xtrain = hstack((Xtrain_pb, Xtrain_dm, Xtrain_appid, Xtrain_appida, Xtrain_category, Xtrain_categorya), format='csr')
Xtest = hstack((Xtest_pb, Xtest_dm, Xtest_appid, Xtest_appida, Xtest_category, Xtest_categorya), format='csr')

In [8]:
# Get labels
y = ga_train['group_enc']

In [9]:
# Run through Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

features = Xtrain
labels = y

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=1)

clf_device = LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf_device.fit(X_train, y_train)

pred_device = clf_device.predict_proba(X_test)

In [10]:
log_loss(np.array(y_test), pred_device)

2.4899718663128869

#### Refinement

In [26]:
# Tune C value
c_tuning = [2.0, 1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]
log_loss_scores = []

for C in c_tuning:
    clf_tuning = LogisticRegression(C=C, multi_class='multinomial', solver='lbfgs')
    clf_tuning.fit(X_train, y_train)
    pred = clf_tuning.predict_proba(X_test)
    ll = log_loss(np.array(y_test), pred)
    log_loss_scores.append((C, ll))
print log_loss_scores

[(2.0, 2.5246550834140598), (1.0, 2.4899718663128869), (0.5, 2.4164753734715827), (0.1, 2.3378787435612702), (0.05, 2.3030158056680095), (0.01, 2.2825095875909676), (0.005, 2.2897505389247894), (0.001, 2.328692771053285)]


In [27]:
# Tune C further, focusing on 0.05 to 0.005
c_tuning = [0.05,0.0455,0.041,0.0365,0.032,0.0275,0.023,0.0185,0.014,0.0095,0.005]
log_loss_scores = []

for C in c_tuning:
    clf_tuning = LogisticRegression(C=C, multi_class='multinomial', solver='lbfgs')
    clf_tuning.fit(X_train, y_train)
    pred = clf_tuning.predict_proba(X_test)
    ll = log_loss(np.array(y_test), pred)
    log_loss_scores.append((C, ll))
print log_loss_scores
# best score: c=0.014, logloss=2.2815639930697595

[(0.05, 2.3030158056680095), (0.0455, 2.2994875644892621), (0.041, 2.2968234409285855), (0.0365, 2.2929992064314773), (0.032, 2.2897877637158279), (0.0275, 2.2866937547402197), (0.023, 2.2841777107477941), (0.0185, 2.2822195464368908), (0.014, 2.2815639930697595), (0.0095, 2.2827292396446244), (0.005, 2.2897505389247894)]


In [28]:
# Final C tuning
c_tuning = [0.0167,0.0149,0.0131,0.0113]
log_loss_scores = []

for C in c_tuning:
    clf_tuning = LogisticRegression(C=C, multi_class='multinomial', solver='lbfgs')
    clf_tuning.fit(X_train, y_train)
    pred = clf_tuning.predict_proba(X_test)
    ll = log_loss(np.array(y_test), pred)
    log_loss_scores.append((C, ll))
print log_loss_scores
# best score: c=0.0149, logloss=2.2815105672255389

[(0.0167, 2.2819129653591976), (0.0149, 2.2815105672255389), (0.0131, 2.2815849324156479), (0.0113, 2.2819902982359586)]


In [29]:
# Use C = 0.0149, output encoded submission file
clf = LogisticRegression(C=0.0149, multi_class='multinomial', solver='lbfgs')
clf.fit(Xtrain, y)
pred_test = clf.predict_proba(Xtest)
pred_test = pd.DataFrame(pred_test, index=ga_test.index)

# Write to csv
pred_test.to_csv('../submissions/encoded/prediction-0149.csv')

### Model Evaluation and Validation

#### Model Parameters

In [41]:
# Coefficients
pd.DataFrame(clf.coef_).transpose().describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
count,41124.0,41124.0,41124.0,41124.0,41124.0,41124.0,41124.0,41124.0,41124.0,41124.0,41124.0,41124.0
mean,-0.000223,-0.000178,-0.000256,-0.000174,-0.000182,3.5e-05,6.6e-05,-4e-05,-0.000103,0.000222,0.000322,0.000512
std,0.015302,0.012878,0.01197,0.013568,0.015274,0.014069,0.017283,0.016889,0.014073,0.015388,0.017246,0.018429
min,-0.2797,-0.239281,-0.168041,-0.231304,-0.29656,-0.323304,-0.440936,-0.284944,-0.228971,-0.336297,-0.351654,-0.439251
25%,-0.000717,-0.000874,-0.000853,-0.001072,-0.001317,-0.001026,-0.000956,-0.001644,-0.001373,-0.00165,-0.002014,-0.001494
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.578263,0.409697,0.333826,0.311585,0.436341,0.307559,0.484377,0.437992,0.308617,0.213233,0.298616,0.326533


In [57]:
for i in range(12):
    print i
    print sum(np.absolute(clf.coef_[i])>=0.05) / 41124.0

0
0.0168028401906
1
0.0146872872289
2
0.0124015173621
3
0.0155870051551
4
0.0196478941737
5
0.017362124307
6
0.0192588269624
7
0.0246085011186
8
0.0178970917226
9
0.0222740978504
10
0.0259216029569
11
0.0270401711896


In [59]:
clf.intercept_

array([-0.04212761, -0.32258107, -0.68226428, -0.30332332, -0.15669442,
       -0.37625077,  0.35199577,  0.55810838, -0.08935162,  0.25299597,
        0.45191995,  0.35757302])

In [68]:
pred_train = pd.DataFrame(clf.predict_proba(Xtrain))
actual_train = pd.DataFrame(y)

In [69]:
pred_train.to_csv('pred_train.csv')
actual_train.to_csv('actual_train.csv')

In [66]:
pd.DataFrame(y).shape

(74645, 1)

# =========== Scratch, Delete Later ==========

In [33]:
max(clf.coef_

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [17]:
# Try only using active for developing a model
Xtrain = hstack((Xtrain_pb, Xtrain_dm, Xtrain_appid, Xtrain_appida, Xtrain_category, Xtrain_categorya), format='csr')
Xtest = hstack((Xtest_pb, Xtest_dm, Xtest_appid, Xtest_appida, Xtest_category, Xtest_categorya), format='csr')
y = ga_train['group_enc']

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

features = Xtrain
labels = y

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=1)

clf_device = LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf_device.fit(X_train, y_train)

pred_device = clf_device.predict_proba(X_test)

log_loss(np.array(y_test), pred_device)

# 2.47280834
# 2.447608
# 2.489972

2.4899718663128869

In [18]:
c_tuning = [1.0, 0.5, 0.1, 0.05, 0.01, 0.005]
log_loss_scores = []

for C in c_tuning:
    clf_tuning = LogisticRegression(C=C, multi_class='multinomial', solver='lbfgs')
    clf_tuning.fit(X_train, y_train)
    pred = clf_tuning.predict_proba(X_test)
    ll = log_loss(np.array(y_test), pred)
    log_loss_scores.append((C, ll))
print log_loss_scores

# [(1.0, 2.4728083456221461), (0.5, 2.4086033178319126), (0.1, 2.309619176820513), (0.05, 2.2876211025012108), (0.01, 2.2835842878600849), (0.005, 2.2948924676044133)]
# 2.28358
# 2.28175
# 2.282509

[(1.0, 2.4899718663128869), (0.5, 2.4164753734715827), (0.1, 2.3378787435612702), (0.05, 2.3030158056680095), (0.01, 2.2825095875909676), (0.005, 2.2897505389247894)]


In [8]:
#Xtrain_pb, Xtrain_dm, Xtrain_appid, Xtrain_appida, Xtrain_category, Xtrain_categorya

print Xtrain_pb.shape
print Xtrain_dm.shape
print ''
print Xtrain_appid.shape
print Xtrain_appida.shape
print ''
print Xtrain_category.shape
print Xtrain_categorya.shape

(74645, 131)
(74645, 1599)

(74645, 19238)
(74645, 19238)

(74645, 459)
(74645, 459)


In [9]:
print Xtest_pb.shape
print Xtest_dm.shape
print ''
print Xtest_appid.shape
print Xtest_appida.shape
print ''
print Xtest_category.shape
print Xtest_categorya.shape

(112071, 131)
(112071, 1599)

(112071, 19238)
(112071, 19238)

(112071, 459)
(112071, 459)
