In [9]:
# Print the kernel's current working directory; relative paths such as './data/...' used below resolve against it.
!pwd

/Users/yong8/Workspace/jupyter/r-exp-gen/aspect/data/electronics


In [1]:
import pandas as pd
import numpy as np
import json, pickle

In [65]:
def get_aspects(dataset):
    '''
        :Load the aspects extracted for every review in ./data/<dataset>/train.json
        :param dataset: name of the dataset folder under ./data
        :return df_aspects
            :columns (u_id, i_id, rating, aspects) -- `aspects` holds one list per review
    '''
    import ast  # local import: only this loader needs it

    pairs = []
    with open('./data/{}/train.json'.format(dataset), 'r') as f:
        for line in f:  # stream the file instead of materialising it with readlines()
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing lines

            # The file was previously parsed with eval(), i.e. each line is expected to be a
            # Python literal. literal_eval accepts the same literals without executing code;
            # strict-JSON lines (true/false/null) fall back to json.loads.
            try:
                record = ast.literal_eval(line)
            except (ValueError, SyntaxError):
                record = json.loads(line)

            u_id = record['user']
            i_id = record['item']
            rating = record['rating'] - 1  # shifted down by one (presumably 1-based -> 0-based; confirm)

            # record['sentence']: [(ASPECT, SENTIMENT-WORD, RELATED-SENTENCE, SENTIMENT-SCORE), ...]
            # if no topic extracted, then assign an empty list
            aspects = [s[0] for s in record.get('sentence', [])]
            pairs.append([u_id, i_id, rating, aspects])

    df_aspects = pd.DataFrame(pairs, columns=['u_id', 'i_id', 'rating', 'aspects'])
    return df_aspects

def get_pmis(df_per_aspect):
    '''
        :Build the PMI table of every (item, user) pair against all aspects
        :param df_per_aspect: frame exposing the columns u_id, i_id and aspect
        :return DataFrame indexed by (i_id, u_id) with one column per aspect
    '''
    aspect_col = df_per_aspect.aspect

    # Aspect counts per user, per item, and per (item, user) pair
    user_counts = pd.crosstab(df_per_aspect.u_id, aspect_col)
    item_counts = pd.crosstab(df_per_aspect.i_id, aspect_col)
    pair_counts = pd.crosstab([df_per_aspect.i_id, df_per_aspect.u_id], aspect_col)

    def _as_float(row):
        # count row -> float64 vector over all aspects
        return row.values.astype('float64')

    rows = [
        list(pmi(_as_float(pair_row),
                 _as_float(user_counts.loc[u_id]),
                 _as_float(item_counts.loc[i_id])))
        for (i_id, u_id), pair_row in pair_counts.iterrows()
    ]

    return pd.DataFrame(rows, index=pair_counts.index, columns=pair_counts.columns)

def normalize(X):
    '''
        :Min-max scale X to [0, 1] along axis 0
        :A zero range (constant input) maps to 0 instead of producing NaN from 0/0
        :param X: array-like exposing .min(axis=0) / .max(axis=0)
        :return scaled values, same shape as X
    '''
    lo = X.min(axis=0)
    span = X.max(axis=0) - lo
    # Where max == min the numerator is all zeros, so dividing by 1 yields 0 and avoids the 0/0 warning
    safe_span = np.where(span == 0, 1, span)
    return (X - lo) / safe_span

def pmi(freq_u_i, freq_u, freq_i):
    '''
        :Calculate per-aspect PMI for each user x item
        :param freq_u_i: aspect counts of the (user, item) pair
        :param freq_u: aspect counts of the user
        :param freq_i: aspect counts of the item
        :return the per-aspect ratio freq_u_i / (freq_u * freq_i), rescaled by normalize()
        :Note no logarithm is applied to the ratio
    '''
    freq_u_i = np.asarray(freq_u_i, dtype='float64')
    denom = np.asarray(freq_u, dtype='float64') * np.asarray(freq_i, dtype='float64')

    # Divide only where the whole denominator is non-zero. Masking on freq_u alone still let
    # a zero freq_i through, giving 0/0 (NaN + RuntimeWarning) or x/0 (inf, which nan_to_num
    # would turn into a huge finite number). Masked-out entries keep the 0 from `out`.
    score = np.divide(freq_u_i, denom, out=np.zeros_like(freq_u_i), where=(denom != 0))
    score = np.nan_to_num(score, nan=0)  # NaN counts in the inputs still collapse to 0

    # The 1e-16 offset is kept for numerical parity with earlier results; normalize() is a
    # min-max scaling and therefore shift-invariant, so the offset does not guard anything.
    return normalize(score + 1e-16)

In [67]:
# Load extracted aspects
dataset = 'electronics'
df_aspects = get_aspects(dataset)

# Get the dataframe that has per-aspect rows (one row per aspect mention);
# rename() returns a new frame instead of overwriting .columns in place
df_per_aspect = df_aspects.explode('aspects').rename(columns={'aspects': 'aspect'})

# coherence: PMI of every (item, user) pair against all aspects
coherence = get_pmis(df_per_aspect)

# generality: per-item mean of the coherence scores.
# This used to read `df_pmi`, which is only created by a later cell, so a fresh
# Restart & Run All raised NameError here; `coherence` is the same table.
generality = coherence.groupby('i_id').aggregate('mean') # generality


[0. 0. 0. ... 0. 0. 0.]


  return np.log(np.divide(freq_u_i, (freq_u * freq_i), out=np.zeros_like(freq_u_i), where=((freq_u!=0))))
  return np.log(np.divide(freq_u_i, (freq_u * freq_i), out=np.zeros_like(freq_u_i), where=((freq_u!=0))))


Unnamed: 0_level_0,aspect,10x,120gb,120mm,12v,12x,140mm,1920x1080,1mm,1tb,20x,...,xbmc,youre,z77,zip,zip ties,zipper,zippers,zoom,zoom lens,zoom range
i_id,u_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0972683275,A1FGK3N8GZHGZJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0972683275,A1JI3XL5ME8CWM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0972683275,A1ZPEZMQBCTZQN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0972683275,A2JDDUN6YNRYKU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0972683275,A3ATF868IEP191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B00KOHKRXE,A18FUBX0QLXMZS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00KOHQU58,A14RPY0DV96EG4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00KOHQU58,A18FUBX0QLXMZS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00KWHMR6G,A3U41ZL33SS92P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Aspect-count tables keyed by user, by item, and by (item, user) pair
df_feature_freq_U = pd.crosstab(index=df_per_aspect['u_id'], columns=df_per_aspect['aspect'])
df_feature_freq_I = pd.crosstab(index=df_per_aspect['i_id'], columns=df_per_aspect['aspect'])
df_feature_freq_U_I = pd.crosstab(
    index=[df_per_aspect['i_id'], df_per_aspect['u_id']],
    columns=df_per_aspect['aspect'],
)

In [None]:


# Per-pair PMI vectors, computed from the module-level frequency tables.
# This repeats the loop inside get_pmis(); the unused enumerate() counter was dropped and the
# loop variable is no longer rebound from a Series to an ndarray under the same name.
pmis = []
for (i_id, u_id), pair_counts in df_feature_freq_U_I.iterrows():
    freq_u = df_feature_freq_U.loc[u_id].values.astype('float64') # a freq vector for the user against all aspects
    freq_i = df_feature_freq_I.loc[i_id].values.astype('float64') # a freq vector for the item against all aspects
    freq_u_i = pair_counts.values.astype('float64') # a freq vector for the (user x item) against all aspects

    pmi_u_i = pmi(freq_u_i, freq_u, freq_i)
    pmis.append(list(pmi_u_i))


df_pmi = pd.DataFrame(pmis, index=df_feature_freq_U_I.index, columns=df_feature_freq_U_I.columns)
df_pmi


In [84]:
# Number of non-zero cells in the user-by-aspect count table
np.count_nonzero(df_feature_freq_U)

71403

In [76]:
# Entries of the 'price' column whose value is not 0
df_pmi.loc[df_pmi['price'] != 0, 'price']

Series([], Name: price, dtype: float64)

In [140]:
# Scale a toy vector to unit Euclidean length
x = np.array([0, 0, 0, 1, 1])
norm1 = np.divide(x, np.linalg.norm(x))
norm1

array([0.        , 0.        , 0.        , 0.70710678, 0.70710678])

In [138]:
# Norm of x with np.linalg.norm's default settings
np.linalg.norm(x)

1.4142135623730951

In [135]:
# Display x.
# NOTE(review): the saved output of this cell (execution count 135) predates the `x` assigned
# in the cell with execution count 140 and does not match it; re-run the notebook in order.
x

array([nan, inf,  1.])