In [1]:
import os
import sys
import pandas
import time
import os
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

In [2]:
# Column layouts of the two label files; both are read with header=None, so
# these names are assigned positionally.
customer_label_columns = ['cid', 'gender', 'cltv', 'churn']
cltv_label_columns = ['cid', 'cltv']

# Per-customer labels. No `sep` is passed, so read_table's default delimiter applies.
customer_labels = pandas.read_table(
    '~/Downloads/customer_labels.csv',
    header=None,
    names=customer_label_columns,
)

# CLTV labels come from their own file rather than from customer_labels.
cltv_labels = pandas.read_table(
    '~/Documents/labels_cltv.txt',
    header=None,
    names=cltv_label_columns,
)

In [3]:
# One two-column frame per prediction target, each keyed by the customer id.
labels_gender = customer_labels.loc[:, ['cid', 'gender']]
labels_churn = customer_labels.loc[:, ['cid', 'churn']]
# CLTV is taken from the dedicated cltv_labels table, not from customer_labels.
labels_cltv = cltv_labels.loc[:, ['cid', 'cltv']]

##### Read W2V embeddings

In [4]:
# Number of embedding values read per customer.
w2v_size = 64
str_fname = '~/Documents/w2v_hour.txt'

# Space-separated file. The first row is skipped (presumably the word2vec
# "<count> <dim>" header line -- confirm against the file) and only the first
# w2v_size + 1 columns are read: column 0 plus w2v_size embedding values.
# usecols is passed as an explicit list so it is a real list on Python 2 and 3.
embeddings = pandas.read_table(
    str_fname,
    sep=' ',
    header=None,
    usecols=list(range(w2v_size + 1)),
    skiprows=1,
)

# Column 0 is renamed to 'cid' so the frame can be joined with the label tables.
embeddings = embeddings.rename(columns={0: 'cid'})

# Call form of print: same output on Python 2, valid syntax on Python 3, and
# consistent with the print(...) calls used elsewhere in this notebook.
print(embeddings.shape)

(194574, 65)


In [5]:
# Attach the gender label to every embedding row. No `how` is given, so the
# merge uses its default join type on the shared 'cid' column.
customers_final_join = embeddings.merge(labels_gender, on='cid')
# Alternative target (churn):
#customers_final_join = pandas.merge(embeddings, labels_churn, on='cid')

In [6]:
# Feature matrix: the w2v_size columns that follow the leading 'cid' column.
embedding_columns = customers_final_join.columns[1:w2v_size+1]
X = customers_final_join[embedding_columns].values

# Customer ids, in the same row order as X.
CID = customers_final_join['cid'].tolist()

# Binary target: 1 where gender is 'F', 0 for every other value.
# (For a truthy/falsy label such as churn: Y = [1 if yy else 0 for yy in Y])
gender_values = customers_final_join['gender'].tolist()
Y = [int(value == 'F') for value in gender_values]

In [7]:
# Load the customer list. header=0 together with `names` means the file's own
# header row is consumed and the columns are named 'idx' and 'val' instead.
c_list = pandas.read_csv(
    "list_c.csv",
    names=['idx', 'val'],
    header=0,
)

In [8]:
# Replace the DataFrame with just its 'val' column as a NumPy array.
# NOTE: this rebinding means the cell cannot be re-run without reloading c_list.
val_column = c_list["val"]
c_list = val_column.values

In [9]:
# Make features, labels and customer ids NumPy arrays; the fold loops below
# index all three with integer index arrays.
X, Y, CID = (np.asarray(values) for values in (X, Y, CID))

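Validation AUC per fold: train on each fold's training split and score its validation split. These numbers are used for choosing the W2V parameters.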

In [10]:
# Validation AUC: for each of the 10 stored folds, fit a logistic regression on
# the fold's training customers and score the fold's validation customers.
aucs = []

# The sort order of CID does not depend on the fold, so compute it once
# instead of re-sorting inside every iteration.
xsorted = np.argsort(CID)
cid_sorted = CID[xsorted]
last_pos = len(cid_sorted) - 1

for i in range(0, 10):

    # The "val" column of each fold file is used to index c_list, giving the
    # customer ids that belong to that split.
    tr_folds_val = pandas.read_csv('tr_folds_val' + str(i) + ".csv", header=0, names=["idx", "val"])
    tr_folds_val = tr_folds_val["val"].values
    tr_folds_tr = pandas.read_csv('tr_folds_tr' + str(i) + ".csv", header=0, names=["idx", "val"])
    tr_folds_tr = tr_folds_tr["val"].values

    c_ids_val = c_list[tr_folds_val]
    c_ids_tr = c_list[tr_folds_tr]

    # Translate customer ids into row positions of X / Y by binary search.
    # searchsorted returns an insertion point even for ids that are absent from
    # CID (e.g. customers dropped by the label merge), which would silently
    # select a different customer's row. Clip to a valid position and confirm
    # that every id was really found before using the positions.
    ypos_val = np.clip(np.searchsorted(cid_sorted, c_ids_val), 0, last_pos)
    ypos_tr = np.clip(np.searchsorted(cid_sorted, c_ids_tr), 0, last_pos)
    if not (np.array_equal(cid_sorted[ypos_val], c_ids_val)
            and np.array_equal(cid_sorted[ypos_tr], c_ids_tr)):
        raise ValueError("fold " + str(i) + ": some customer ids are missing from CID")
    idx_val = xsorted[ypos_val]
    idx_tr = xsorted[ypos_tr]

    LR = LogisticRegression()
    LR.fit(X[idx_tr], Y[idx_tr])

    # Score the validation rows once (column 1 of predict_proba) and reuse the
    # result for both the AUC and the saved CSV.
    val_scores = LR.predict_proba(X[idx_val])[:, 1]
    auc_val = roc_auc_score(Y[idx_val], val_scores)
    pandas.DataFrame(val_scores).to_csv('val_scores' + str(i) + ".csv")
    pandas.DataFrame(Y[idx_val]).to_csv('val_scores_true' + str(i) + ".csv")
    print("AUC on VAL: " + str(auc_val))
    aucs.append(auc_val)

# Mean validation AUC over all folds.
print(np.mean(aucs))

AUC on VAL: 0.883184465196
AUC on VAL: 0.88227436991
AUC on VAL: 0.882197016389
AUC on VAL: 0.882228169863
AUC on VAL: 0.882231165714
AUC on VAL: 0.882276911334
AUC on VAL: 0.882420644813
AUC on VAL: 0.881833508697
AUC on VAL: 0.882474519622
AUC on VAL: 0.882417110383
0.882353788192


Test AUC per fold: train on each fold's training split and report the AUC on its test split.

In [11]:
# Test AUC: for each of the 10 stored folds, fit a logistic regression on the
# fold's training customers and score the fold's test customers.
aucs = []

# The sort order of CID does not depend on the fold, so compute it once
# instead of re-sorting inside every iteration.
xsorted = np.argsort(CID)
cid_sorted = CID[xsorted]
last_pos = len(cid_sorted) - 1

for i in range(0, 10):

    # The "val" column of each fold file is used to index c_list, giving the
    # customer ids that belong to that split.
    tr_folds_te = pandas.read_csv('tr_folds_te' + str(i) + ".csv", header=0, names=["idx", "val"])
    tr_folds_te = tr_folds_te["val"].values
    tr_folds_tr = pandas.read_csv('tr_folds_tr' + str(i) + ".csv", header=0, names=["idx", "val"])
    tr_folds_tr = tr_folds_tr["val"].values

    c_ids_te = c_list[tr_folds_te]
    c_ids_tr = c_list[tr_folds_tr]

    # Translate customer ids into row positions of X / Y by binary search.
    # searchsorted returns an insertion point even for ids that are absent from
    # CID (e.g. customers dropped by the label merge), which would silently
    # select a different customer's row. Clip to a valid position and confirm
    # that every id was really found before using the positions.
    ypos_te = np.clip(np.searchsorted(cid_sorted, c_ids_te), 0, last_pos)
    ypos_tr = np.clip(np.searchsorted(cid_sorted, c_ids_tr), 0, last_pos)
    if not (np.array_equal(cid_sorted[ypos_te], c_ids_te)
            and np.array_equal(cid_sorted[ypos_tr], c_ids_tr)):
        raise ValueError("fold " + str(i) + ": some customer ids are missing from CID")
    idx_te = xsorted[ypos_te]
    idx_tr = xsorted[ypos_tr]

    LR = LogisticRegression()
    LR.fit(X[idx_tr], Y[idx_tr])

    # Score the test rows once (column 1 of predict_proba) and reuse the result
    # for both the AUC and the saved CSV.
    te_scores = LR.predict_proba(X[idx_te])[:, 1]
    auc_te = roc_auc_score(Y[idx_te], te_scores)
    pandas.DataFrame(te_scores).to_csv('te_scores' + str(i) + ".csv")
    pandas.DataFrame(Y[idx_te]).to_csv('te_scores_true' + str(i) + ".csv")
    print("AUC on TE: " + str(auc_te))
    aucs.append(auc_te)

# Mean test AUC over all folds.
print(np.mean(aucs))

AUC on TE: 0.882459944028
AUC on TE: 0.8825281647
AUC on TE: 0.87850889274
AUC on TE: 0.879587582682
AUC on TE: 0.881570595922
AUC on TE: 0.880144767462
AUC on TE: 0.879672338652
AUC on TE: 0.877881443311
AUC on TE: 0.884977857317
AUC on TE: 0.884523924329
0.881185551114
