## Test code for score.py
- Start with test.json (with pairs of extracted aspects)
- Compute PMI scores
- Convert the PMI scores to coherence and generality

In [1]:
import pandas as pd
import numpy as np
import json, pickle

In [2]:
def get_aspects(dataset):
    '''
        :Load the aspects extracted for every review in ./data/<dataset>/test.json
        :param dataset: name of the sub-directory of ./data that holds test.json
        :return df_aspects
            :columns (u_id, i_id, rating, aspects)
            :rating is the stored rating minus one
            :aspects is the list of ASPECT terms of the review (empty list if none)
    '''
    pairs = []
    with open('./data/{}/test.json'.format(dataset), 'r') as f:
        # Stream the file instead of materialising every line with readlines()
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file) holds no
            # record and would make eval() raise SyntaxError
            if not line.strip():
                continue

            # NOTE(review): eval() executes whatever the file contains. Presumably each
            # line is a Python-literal dict; if so, ast.literal_eval (or json.loads for
            # real JSON) is the safe choice -- confirm the file format before switching.
            record = eval(line)

            # record['sentence']: [(ASPECT, SENTIMENT-WORD, RELATED-SENTENCE, SENTIMENT-SCORE), ...]
            # if no topic was extracted the key is absent, so fall back to an empty list
            aspects = [s[0] for s in record.get('sentence', [])]
            pairs.append([record['user'], record['item'], record['rating'] - 1, aspects])

    df_aspects = pd.DataFrame(pairs, columns=['u_id', 'i_id', 'rating', 'aspects'])
    return df_aspects

def get_pmis(df_per_aspect):
    '''
        :Score every (item, user) pair against every aspect with pmi()
        :param df_per_aspect: frame exposing the columns u_id, i_id and aspect
        :return df_pmi
            :index (i_id, u_id), columns aspect, values returned by pmi()
    '''
    aspect_col = df_per_aspect.aspect

    # Aspect counts per user, per item, and per (item, user) pair
    user_counts = pd.crosstab(df_per_aspect.u_id, aspect_col)
    item_counts = pd.crosstab(df_per_aspect.i_id, aspect_col)
    pair_counts = pd.crosstab([df_per_aspect.i_id, df_per_aspect.u_id], aspect_col)

    def as_float_vector(row):
        # Count vector over all aspects, cast to float64 for the division in pmi()
        return row.values.astype('float64')

    scored_rows = [
        list(
            pmi(
                as_float_vector(pair_row),
                as_float_vector(user_counts.loc[u_id]),
                as_float_vector(item_counts.loc[i_id]),
            )
        )
        for (i_id, u_id), pair_row in pair_counts.iterrows()
    ]

    return pd.DataFrame(scored_rows, index=pair_counts.index, columns=pair_counts.columns)

def normalize(X):
    '''
        :Min-max scale X along axis 0
        :param X: object exposing .min(axis=0) / .max(axis=0) (e.g. a numpy array)
        :return (X - min) / (max - min); entries of a constant column become 0
    '''
    lowest = X.min(axis=0)
    spread = X.max(axis=0) - lowest
    # A constant column has zero spread, and 0 / 0 would yield NaN plus a RuntimeWarning.
    # Its numerator is already all zeros, so dividing by 1 gives a clean 0.
    safe_spread = np.where(spread == 0, 1, spread)
    return (X - lowest) / safe_spread

def pmi(freq_u_i, freq_u, freq_i):
    '''
        :Calculate a per-aspect score for one user x item pair
        :score = freq_u_i / (freq_u * freq_i), then rescaled with normalize()
        :NOTE(review): despite the name, no logarithm or probability estimate is applied
        :param freq_u_i: aspect counts of the (user, item) pair
        :param freq_u: aspect counts of the user
        :param freq_i: aspect counts of the item
        :return normalized score vector, one entry per aspect
    '''
    # Work in float64 so integer count vectors are accepted as well
    freq_u_i = np.asarray(freq_u_i, dtype='float64')
    denominator = np.asarray(freq_u, dtype='float64') * np.asarray(freq_i, dtype='float64')

    # Divide only where the whole denominator is non-zero; masking on freq_u alone
    # still divides by zero wherever freq_i is 0. Masked entries keep the 0 from `out`.
    score = np.divide(freq_u_i, denominator, out=np.zeros_like(freq_u_i), where=(denominator != 0))
    score = np.nan_to_num(score, nan=0)

    return normalize(score + 1e-16)

In [6]:
# Read the aspects extracted for the chosen dataset
dataset = 'electronics'
df_aspects = get_aspects(dataset)

# Unpack each aspect list into one row per aspect and name that column in the singular
df_per_aspect = df_aspects.explode('aspects').rename(columns={'aspects': 'aspect'})

df_pmi = get_pmis(df_per_aspect)

# Average the pair scores per user (coherence) and per item (generality)
coherence = df_pmi.groupby('u_id').mean()
generality = df_pmi.groupby('i_id').mean()




In [7]:
# Display coherence: the per-u_id mean of df_pmi, one column per aspect
coherence

aspect,10x,120hz,120mm,120mm fans,12x,15x,200mm,20mm,20x,24pin,...,x64,youre,z77,zip,zipper,zippers,zoom,zoom lens,zoom range,zune
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A000715434M800HLCENK9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00101847G3FJTWYGNQA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A06278713EPAAUNFOJ25C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A08879353UXFSIU924O9D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100GMI0IGM050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZY7XD4EQAUUV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZYJE40XW6MFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZZ77XYX8O2WE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
AZZLZUR5X9GYE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Rebuild the aspect-count tables at notebook level so they can be inspected:
# one row per user, per item, and per (item, user) pair; one column per aspect
df_feature_freq_U = pd.crosstab(index=df_per_aspect['u_id'], columns=df_per_aspect['aspect'])
df_feature_freq_I = pd.crosstab(index=df_per_aspect['i_id'], columns=df_per_aspect['aspect'])
df_feature_freq_U_I = pd.crosstab(
    index=[df_per_aspect['i_id'], df_per_aspect['u_id']],
    columns=df_per_aspect['aspect'],
)

In [84]:
# Count the non-zero cells of the user x aspect count table
np.count_nonzero(df_feature_freq_U)

71403

In [76]:
# Keep only the entries of the 'price' column of df_pmi that differ from 0
df_pmi['price'].loc[df_pmi['price']!=0]

Series([], Name: price, dtype: float64)

In [140]:
# Toy vector used to try out scaling by the vector norm
x = np.asarray([0, 0, 0, 1, 1])
# Divide every entry by the norm of x
norm1 = np.divide(x, np.linalg.norm(x))
norm1

array([0.        , 0.        , 0.        , 0.70710678, 0.70710678])

In [138]:
# Norm of x using numpy's default settings
np.linalg.norm(x)

1.4142135623730951

In [135]:
# Display x
# NOTE(review): the recorded output does not match the x assigned in cell In [140];
# presumably x held a different value when this cell was run -- confirm
x

array([nan, inf,  1.])

In [5]:
# Aspect counts per user: one row per u_id, one column per aspect
df_feature_freq_U = pd.crosstab(index=df_per_aspect['u_id'], columns=df_per_aspect['aspect'])

In [6]:
# Display the user x aspect count table
df_feature_freq_U

aspect,10x,20x,30x,3d,3rd party,3x,4x,5400rpm,5x,aa,...,wifi,will work,wire,wireless,wires,workout,x16,zip,zipper,zoom
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A06772321K8O6OL54F08V,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A101L4HF0IZ33C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A101RHMKIWMCRS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1031R8HD3E4GL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1034LJPZVRR7N,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZQ3DYNNVVG48,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZSH7IFNZR62F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZTX6MJ5DCPOA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZVA520PZOJR3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
