* lowcolで特徴ベクトルを算出する

In [None]:
!pip install git+https://github.com/darecophoenixx/wordroid.sblo.jp

In [None]:
from feature_eng import lowcols

In [None]:
%matplotlib inline
from IPython.display import SVG, Image
from keras.utils.vis_utils import model_to_dot

import os.path
import sys
import re
import itertools
import csv
import datetime
import pickle
import random
from collections import defaultdict, Counter
import gc

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import gensim
from sklearn.metrics import f1_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.sequence import skipgrams
import tensorflow as tf

In [None]:
def hexbin(x, y, color, **kwargs):
    """Draw a hexagonal-bin density plot of ``y`` against ``x``.

    Intended as a plotting callback (it is passed to ``PairGrid.map_upper``
    further down).  ``color`` seeds a light sequential colormap built with
    ``sns.light_palette``; all remaining keyword arguments are forwarded
    unchanged to ``plt.hexbin``.
    """
    plt.hexbin(
        x,
        y,
        cmap=sns.light_palette(color, as_cmap=True),
        **kwargs
    )
def scatter(x, y, color, **kwargs):
    """Plot ``y`` against ``x`` as small dot markers via ``plt.scatter``.

    ``color`` and ``**kwargs`` are accepted so the function can be used as a
    plotting callback (it is passed to ``PairGrid.map_lower`` further down),
    but they are not forwarded: only the marker style is set explicitly.
    """
    dot_marker = '.'
    plt.scatter(x, y, marker=dot_marker)

In [None]:
# Number of figures written so far by save_img(); used to number the files.
img_cnt = 0


def save_img():
    """Save the current matplotlib figure under the next numbered file name.

    Bumps the module-level ``img_cnt``, prints the generated name
    (``img_001.jpeg``, ``img_002.jpeg``, ...) and writes the figure to that
    relative path with JPEG quality set to 100.
    """
    global img_cnt
    img_cnt = img_cnt + 1
    target = 'img_{0:03}.jpeg'.format(img_cnt)
    print(target)
    jpeg_options = {'quality': 100}
    plt.savefig(target, pil_kwargs=jpeg_options)

# load sample data

In [None]:
# Read the sample matrix; the first CSV column becomes the row index.
sample_csv = '../input/sample-data-wordanddoc2vec/sample011.csv'
X_df = pd.read_csv(sample_csv, index_col=0)
X_df

In [None]:
# Render the transposed data matrix as an image, so that each column of
# X_df becomes one image row, then save the figure.
transposed = X_df.values.T
plt.figure(figsize=(15, 15))
plt.imshow(transposed)
save_img()

# WordAndDoc2vec

In [None]:
# Wrap the sample matrix in the WD2vec helper from feature_eng.lowcols; the
# resulting object is used below to build, train and query the model.
wd2v = lowcols.WD2vec(X_df)
wd2v  # last expression of the cell: shows the object's repr

In [None]:
# num_features = 2

# models = wd2v.make_model(num_features=num_features, seed=101, embeddings_val=0.2)

# print('\n\n##################### model_gk1 >>>')
# models['model_gk1'].summary()
# print('\n\n##################### model_user >>>')
# models['model_user'].summary()
# print('\n\n##################### model >>>')
# model = models['model']
# model.summary()

In [None]:
num_features = 2

from sklearn import decomposition

# Initial row embeddings: PCA scores of the raw matrix, scaled down by 5.
pca = decomposition.PCA(n_components=num_features)
pca.fit(X_df)
rscore = pca.transform(X_df) / 5
print(rscore.shape)

# Initial column embeddings: for every column, the centroid of the row
# scores of those rows whose value in that column equals 1.
# Alternative initialisation kept for reference:
#cscore = pca.components_.T / 3
centroids = []
for icol in X_df:
    members = rscore[(X_df[icol] == 1).values]
    if len(members):
        centroids.append(members.mean(axis=0))
    else:
        # No row has a 1 in this column.  The mean of an empty selection is
        # NaN, which would seed this column's embedding with NaN, so fall
        # back to the origin (PCA scores are centred, so this is the mean
        # row score).
        centroids.append(np.zeros(num_features))
# Stack once at the end instead of re-copying the array with np.r_ on every
# iteration; the reshape keeps the (0, num_features) shape when X_df has no
# columns.
cscore = np.asarray(centroids, dtype=float).reshape(-1, num_features)
print(cscore.shape)

# Build the models, seeding the row and column embeddings computed above.
models = wd2v.make_model(num_features=num_features, rscore=rscore, cscore=cscore)

print('\n\n##################### model_gk1 >>>')
models['model_gk1'].summary()
print('\n\n##################### model_user >>>')
models['model_user'].summary()
print('\n\n##################### model >>>')
model = models['model']
model.summary()

In [None]:
# Column-side weights as they stand at this point (train() has not been
# called yet in this notebook); pair-plot up to the first five dimensions.
wgt_lm = wd2v.get_wgt_bycol()
print(wgt_lm.shape)
sns.set_context('paper')
df = pd.DataFrame(wgt_lm[:,:5])
g = sns.PairGrid(df, height=3.5)
# Histogram on the diagonal, dot scatter below it, hexbin density above it.
g.map_diag(plt.hist, edgecolor="w")
g.map_lower(scatter)
g.map_upper(hexbin)
save_img()

In [None]:
# Row-side weights as they stand at this point (train() has not been called
# yet in this notebook); pair-plot up to the first five dimensions.
# Alternative way to read the same weights, kept for reference:
# wgt_user = model.get_layer('user_embedding').get_weights()[0]
wgt_user = wd2v.get_wgt_byrow()
print(wgt_user.shape)
sns.set_context('paper')
df = pd.DataFrame(wgt_user[:,:5])
g = sns.PairGrid(df, height=3.5)
# Histogram on the diagonal, dot scatter below it, hexbin density above it.
g.map_diag(plt.hist, edgecolor="w")
g.map_lower(scatter)
g.map_upper(hexbin)
save_img()

### train

In [None]:
%%time
# Run training.  train() returns two objects; the `.history` dict of each is
# plotted in the following cells (keys "loss", "binary_accuracy", "lr").
hst, hst2 = wd2v.train(epochs=500, batch_size=32, verbose=0, lr0=0.001)
# Keep the first history dict under its own name for the plotting cell below.
hst_history = hst.history

In [None]:
# Training curves from the first history returned by train(): one panel each
# for loss, accuracy and learning rate, all plotted against the epoch index.
fig, ax = plt.subplots(1, 3, figsize=(20,5))
epoch_idx = list(range(len(hst_history["loss"])))
curve_specs = [
    ('loss', "loss", "Train loss"),
    ('acc', "binary_accuracy", "accuracy"),
    ('learning rate', "lr", "learning rate"),
]
for panel, (panel_title, hist_key, curve_label) in zip(ax, curve_specs):
    panel.set_title(panel_title)
    panel.plot(epoch_idx, hst_history[hist_key], label=curve_label)
for panel in ax:
    panel.legend()
save_img()

In [None]:
# Training curves from the second history returned by train(): one panel
# each for loss, accuracy and learning rate, plotted against the epoch index.
fig, ax = plt.subplots(1, 3, figsize=(20,5))
history2 = hst2.history
epoch_idx = list(range(len(history2["loss"])))
curve_specs = [
    ('loss', "loss", "Train loss"),
    ('acc', "binary_accuracy", "accuracy"),
    ('learning rate', "lr", "learning rate"),
]
for panel, (panel_title, hist_key, curve_label) in zip(ax, curve_specs):
    panel.set_title(panel_title)
    panel.plot(epoch_idx, history2[hist_key], label=curve_label)
for panel in ax:
    panel.legend()
save_img()

In [None]:
# Column-side weights after training; pair-plot up to the first five
# dimensions with histograms on the diagonal.
wgt_prod = wd2v.get_wgt_bycol()
print(wgt_prod.shape)
df = pd.DataFrame(wgt_prod[:,:5])
pairplot_opts = dict(markers='o', height=3.5, diag_kind='hist')
sns.pairplot(df, **pairplot_opts)
save_img()

In [None]:
# Row-side weights after training; pair-plot up to the first five dimensions
# with histograms on the diagonal.
wgt_user = wd2v.get_wgt_byrow()
print(wgt_user.shape)
df = pd.DataFrame(wgt_user[:,:5])
pairplot_opts = dict(markers='o', height=3.5, diag_kind='hist')
sns.pairplot(df, **pairplot_opts)
save_img()

In [None]:
# Tag both weight tables with a group label and stack them into one frame.
# NOTE(review): the column-side table (wgt_prod) is tagged 'U' and the
# row-side table (wgt_user) 'B' — confirm the labels are not swapped.
df1 = pd.DataFrame(wgt_prod)
df1['brand_group'] = 'U'
df2 = pd.DataFrame(wgt_user)
df2['brand_group'] = 'B'
df = pd.concat([df2, df1])

# Density contours of the row-side weights in the first two dimensions ...
plt.figure(figsize=(15,15))
ax = sns.kdeplot(data=df2, x=0, y=1,
                 levels=5, thresh=.1, hue='brand_group')
# ... with every column-side weight overlaid as a large 'X' marker.  One
# vectorised scatter call replaces the per-row iterrows() loop.
ax.scatter(df1[0], df1[1], s=200, marker='X', c='darkblue')
save_img()

In [None]:
# Row-side weights as a DataFrame labelled with the row index of X_df.
df_out_wgt_user = pd.DataFrame(wgt_user, index=X_df.index)
df_out_wgt_user.head()  # preview the first rows

In [None]:
# Column-side weights as a DataFrame labelled with the column names of X_df.
df_out_wgt_col = pd.DataFrame(wgt_prod, index=X_df.columns)
df_out_wgt_col.head()  # preview the first rows

In [None]:
# Write both weight tables to CSV files in the working directory.
csv_outputs = (
    (df_out_wgt_user, 'out_user.csv'),
    (df_out_wgt_col, 'out_col.csv'),
)
for weight_frame, csv_name in csv_outputs:
    weight_frame.to_csv(csv_name)

In [None]:
# Fetch gamma from the model and display it next to sqrt(1 / (2 * gamma)).
# NOTE(review): this looks like converting an RBF-style gamma into a width
# sigma (gamma = 1 / (2 * sigma**2)) — confirm against the lowcols code.
gamma = wd2v.get_gamma()
gamma, np.sqrt((1/gamma) / 2)

In [None]:
# Persist gamma to disk; np.save appends ".npy" when the name has no such
# extension.
np.save('out_gamma', gamma)