In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.stats import fisher_exact

### product clusters

In [2]:
# Load the per-product vectors. The clustering cell below skips the first
# column (iloc[:, 1:]) and later reads the ids back from `product_id`.
# NOTE(review): presumably the first column is `product_id` and the rest are
# gensim embedding dimensions -- confirm against the CSV header.
data = pd.read_csv(filepath_or_buffer='../data/gensim/prodvecs.csv')

In [3]:
# Cluster the product vectors into 30 groups and persist the assignment.
#
# * The feature matrix is every column except the first; `prd_label` is also
#   dropped if present so that re-running this cell does not feed the previous
#   run's labels back in as a feature.
# * `precompute_distances` and `n_jobs` are no longer passed: current
#   scikit-learn releases reject them with a TypeError.
# * `n_init` stays explicit and `random_state` is pinned so the labels (and the
#   CSV written below) are reproducible across runs.
features = data.iloc[:, 1:].drop(columns=['prd_label'], errors='ignore')
model = KMeans(n_clusters=30, n_init=10, random_state=0)
# fit_predict fits the model and returns the cluster index of each training row.
data['prd_label'] = model.fit_predict(features)
data[['product_id','prd_label']].to_csv('../data/gensim/cluster_product.csv', index=False)

In [6]:
# Attach each product's cluster label to the dependent table, expand the label
# into one indicator column per cluster (prd_cls_<k>), and keep only the rows
# of the 'valid' evaluation set.
labels = data.loc[:, ['product_id', 'prd_label']]
dep = (
    pd.read_csv('../data/model/dependent/dependent_n.csv')
    .merge(labels, on='product_id', how='inner')
)
# Indicators are built before filtering, so every cluster that occurs anywhere
# in the merged frame gets a column even if it is absent from the 'valid' rows.
cluster = pd.get_dummies(dep['prd_label'], prefix='prd_cls')
dep = dep.join(cluster).loc[lambda frame: frame['eval_set'] == 'valid']

In [7]:
# For each product-cluster indicator (highest cluster index first), build the
# 2x2 contingency table of indicator vs. `reordered` and print the clusters
# whose odds ratio departs noticeably from 1.
#
# Printed fields: indicator name, number of rows in the cluster, count in table
# cell [1, 1], odds ratio rounded to 4 decimals.
for feat in reversed(['prd_cls_' + str(x) for x in range(30)]):
    if feat not in dep.columns:
        # get_dummies creates no column for a cluster with no merged rows.
        continue
    # Plain ndarray instead of np.mat, which NumPy 2.0 no longer provides.
    table = pd.crosstab(dep[feat], dep['reordered']).to_numpy()
    if table.shape != (2, 2):
        # Indicator or outcome is constant on these rows: no odds ratio exists.
        continue
    # Rows/columns are sorted by crosstab, so [1, 1] is "in cluster" combined
    # with the larger `reordered` value (presumably reordered == 1 -- confirm).
    cross = table[1, 1]
    oddsratio = (table[0, 0] * table[1, 1]) / (table[1, 0] * table[0, 1])
    # Report only well-populated cells (> 2000 rows) with an odds ratio
    # strictly outside the [0.80, 1.2] band.
    if cross > 2000 and (oddsratio < 0.80 or oddsratio > 1.2):
        print(feat, dep[feat].sum(), cross, round(oddsratio,4))

prd_cls_28 124148 14992 1.3093
prd_cls_23 44449 2356 0.5154
prd_cls_10 60128 4476 0.7436
prd_cls_3 281430 37296 1.5477
prd_cls_0 379096 29065 0.7274


### user clusters

In [8]:
# Load the per-user vectors (this rebinds `data`, replacing the product
# vectors). The clustering cell below skips the first column (iloc[:, 1:]) and
# later reads the ids back from `user_id`.
# NOTE(review): presumably the first column is `user_id` and the rest are
# gensim embedding dimensions -- confirm against the CSV header.
data = pd.read_csv(filepath_or_buffer='../data/gensim/uservecs.csv')

In [9]:
# Cluster the user vectors into 50 groups and persist the assignment.
#
# * The feature matrix is every column except the first; `usr_label` is also
#   dropped if present so that re-running this cell does not feed the previous
#   run's labels back in as a feature.
# * `precompute_distances` and `n_jobs` are no longer passed: current
#   scikit-learn releases reject them with a TypeError.
# * `n_init` stays explicit and `random_state` is pinned so the labels (and the
#   CSV written below) are reproducible across runs.
features = data.iloc[:, 1:].drop(columns=['usr_label'], errors='ignore')
model = KMeans(n_clusters=50, n_init=10, random_state=0)
# fit_predict fits the model and returns the cluster index of each training row.
data['usr_label'] = model.fit_predict(features)
data[['user_id','usr_label']].to_csv('../data/gensim/cluster_user.csv', index=False)

In [10]:
# Attach each user's cluster label to the dependent table, expand the label
# into one indicator column per cluster, and keep only the rows of the 'valid'
# evaluation set.
labels = data.loc[:, ['user_id', 'usr_label']]
dep = (
    pd.read_csv('../data/model/dependent/dependent_n.csv')
    .merge(labels, on='user_id', how='inner')
)
# NOTE(review): the indicator columns are named prd_cls_<k> although they
# encode *user* clusters. The odds-ratio cell that follows looks them up by
# this exact prefix, so any rename has to be made in both cells together.
cluster = pd.get_dummies(dep['usr_label'], prefix='prd_cls')
dep = dep.join(cluster).loc[lambda frame: frame['eval_set'] == 'valid']

In [11]:
# For each user-cluster indicator (highest cluster index first), build the 2x2
# contingency table of indicator vs. `reordered` and print the clusters whose
# odds ratio departs noticeably from 1. The columns carry the 'prd_cls_' prefix
# because that is the prefix the preceding cell passes to get_dummies.
#
# Printed fields: indicator name, number of rows in the cluster, count in table
# cell [1, 1], odds ratio rounded to 4 decimals.
for feat in reversed(['prd_cls_' + str(x) for x in range(50)]):
    if feat not in dep.columns:
        # get_dummies creates no column for a cluster with no merged rows.
        continue
    # Plain ndarray instead of np.mat, which NumPy 2.0 no longer provides.
    table = pd.crosstab(dep[feat], dep['reordered']).to_numpy()
    if table.shape != (2, 2):
        # Indicator or outcome is constant on these rows: no odds ratio exists.
        continue
    # Rows/columns are sorted by crosstab, so [1, 1] is "in cluster" combined
    # with the larger `reordered` value (presumably reordered == 1 -- confirm).
    cross = table[1, 1]
    oddsratio = (table[0, 0] * table[1, 1]) / (table[1, 0] * table[0, 1])
    # Report only well-populated cells (> 1000 rows) with an odds ratio on or
    # outside the 0.80 / 1.2 bounds (inclusive here, unlike the product cell).
    if cross > 1000 and (oddsratio <= 0.80 or oddsratio >= 1.2):
        print(feat, dep[feat].sum(), cross, round(oddsratio,4))

prd_cls_47 14152 1624 1.2116
prd_cls_41 41439 3206 0.7782
prd_cls_33 56822 6911 1.3046
prd_cls_28 40862 2886 0.704
prd_cls_27 25667 1959 0.7683
prd_cls_23 33446 2408 0.7199
prd_cls_17 25119 2898 1.2207
prd_cls_10 43501 5089 1.2436
prd_cls_9 7230 1500 2.4557
prd_cls_5 30236 4483 1.6415
prd_cls_4 44400 5512 1.3336
