# Create Lexicon Sentiment Score using PMI score
Created by Sabrina Tiun (Julai, 2023) 

References:<br>
[1]https://www.researchgate.net/publication/332372512_Aspect_Extraction_and_Sentiment_Analysis_in_User_Reviews_in_Russian_about_Bank_Service_Quality#pf2 <br>

In [5]:
import turicreate as tc

In [6]:
#load sFrame lexicon dict
sf_train = tc.load_sframe('sframe_train')
sf_train

sentiment,text
pos,"[sedap, taste, sedap, rasa, coklat, semua, ..."
neg,"[barang, tak, pos, apa, cerita, ni, service, ..."


### PMI lexicon score 

Formula (e.g to calculate the target word, w in class positive, pos):<br>
<b>PMI (w,pos) = log 2 * ( (count(w,pos)* N)/ (count (w) * count (pos)) <p></b>
count(w, pos) is the count of word w in positive reviews [ count_w_neg, count_w_pos]<br>
N is total number of words in all the reviews [N] <br>
count(w) is the count of word w in all the reviews [count_neg]<br>
count(pos) is the total number of words in positive review [count_pos].<br>


In [109]:
docs = tc.SArray(sf_train['text'])
docs

dtype: list
Rows: 2
[['sedap', 'taste', 'sedap', 'rasa', 'coklat', 'semua', 'cukup', 'quality', 'good', 'quality', 'baik', 'taste', 'sedap', 'belikan', 'untuk', 'adik', 'yang', 'mengidam', 'sangat', 'nak', 'batang', 'buruk', 'tawakal', 'saja', 'beli', 'ni', 'masa', 'itu', 'tak', 'sangka', 'adik', 'cakap', 'sedap', 'kita', 'pula', 'tertelan', 'air', 'liur', 'haha', 'kena', 'beli', 'juga', 'nanti', 'untuk', 'diri', 'pula', 'ni', 'alhamdulillah', 'sampai', 'dah', 'sedap', 'tapi', 'hancur', 'kena', 'pakai', 'sudu', 'makan', 'padan', 'la', 'beli', 'time', 'shocking', 'sale', 'bungkusan', 'selamat', 'tak', 'dak', 'pecah', 'dah', 'banyak', 'kali', 'repeat', 'memang', 'tor', 'baik', 'alhamdulillah', 'sedap', 'sangat', 'sangat', 'beli', 'la', 'jangan', 'ragu', 'lagi', 'sampai', 'dengan', 'selamat', 'repeat', 'order', 'berkali', 'sudah', 'terima', 'kasih', 'thank', 'you', 'barang', 'baru', 'saja', 'dterima', 'wah', 'bekas', 'yang', 'bagus', 'tak', 'mudah', 'pecah', 'thank', 'you', 'sekali', 'lag

### a. count frequency of specific word --> count (w)

In [159]:
#flatten list (combine review list of negative and postive)
def flatten_concatenation(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
        return flat_list

all_words_lista = flatten_concatenation(docs)
all_words_lista #contains all words from both classes

['sedap',
 'taste',
 'sedap',
 'rasa',
 'coklat',
 'semua',
 'cukup',
 'quality',
 'good',
 'quality',
 'baik',
 'taste',
 'sedap',
 'belikan',
 'untuk',
 'adik',
 'yang',
 'mengidam',
 'sangat',
 'nak',
 'batang',
 'buruk',
 'tawakal',
 'saja',
 'beli',
 'ni',
 'masa',
 'itu',
 'tak',
 'sangka',
 'adik',
 'cakap',
 'sedap',
 'kita',
 'pula',
 'tertelan',
 'air',
 'liur',
 'haha',
 'kena',
 'beli',
 'juga',
 'nanti',
 'untuk',
 'diri',
 'pula',
 'ni',
 'alhamdulillah',
 'sampai',
 'dah',
 'sedap',
 'tapi',
 'hancur',
 'kena',
 'pakai',
 'sudu',
 'makan',
 'padan',
 'la',
 'beli',
 'time',
 'shocking',
 'sale',
 'bungkusan',
 'selamat',
 'tak',
 'dak',
 'pecah',
 'dah',
 'banyak',
 'kali',
 'repeat',
 'memang',
 'tor',
 'baik',
 'alhamdulillah',
 'sedap',
 'sangat',
 'sangat',
 'beli',
 'la',
 'jangan',
 'ragu',
 'lagi',
 'sampai',
 'dengan',
 'selamat',
 'repeat',
 'order',
 'berkali',
 'sudah',
 'terima',
 'kasih',
 'thank',
 'you',
 'barang',
 'baru',
 'saja',
 'dterima',
 'wah',
 'b

In [160]:
print(len(all_words_lista))

13785


In [158]:
def flatten(l):
    return [item for sublist in l for item in sublist]
all_words_listb = flatten(docs)
all_words_listc #contains all words from both classes

['sedap',
 'taste',
 'sedap',
 'rasa',
 'coklat',
 'semua',
 'cukup',
 'quality',
 'good',
 'quality',
 'baik',
 'taste',
 'sedap',
 'belikan',
 'untuk',
 'adik',
 'yang',
 'mengidam',
 'sangat',
 'nak',
 'batang',
 'buruk',
 'tawakal',
 'saja',
 'beli',
 'ni',
 'masa',
 'itu',
 'tak',
 'sangka',
 'adik',
 'cakap',
 'sedap',
 'kita',
 'pula',
 'tertelan',
 'air',
 'liur',
 'haha',
 'kena',
 'beli',
 'juga',
 'nanti',
 'untuk',
 'diri',
 'pula',
 'ni',
 'alhamdulillah',
 'sampai',
 'dah',
 'sedap',
 'tapi',
 'hancur',
 'kena',
 'pakai',
 'sudu',
 'makan',
 'padan',
 'la',
 'beli',
 'time',
 'shocking',
 'sale',
 'bungkusan',
 'selamat',
 'tak',
 'dak',
 'pecah',
 'dah',
 'banyak',
 'kali',
 'repeat',
 'memang',
 'tor',
 'baik',
 'alhamdulillah',
 'sedap',
 'sangat',
 'sangat',
 'beli',
 'la',
 'jangan',
 'ragu',
 'lagi',
 'sampai',
 'dengan',
 'selamat',
 'repeat',
 'order',
 'berkali',
 'sudah',
 'terima',
 'kasih',
 'thank',
 'you',
 'barang',
 'baru',
 'saja',
 'dterima',
 'wah',
 'b

In [None]:
print(len(all_words_listb))

In [146]:
#calculate the unique word with its frequency from both class of sentiment --> count (w)
word_freq=[all_words_list.count(p) for p in all_words_list]
all_word_dict = dict(zip(all_words_list,word_freq)) # this create dictionary of unique word and its frequency

In [147]:
sort_all_word_dict =  OrderedDict(sorted(all_word_dict.items()))
sort_all_word_dict #final output from descending order aplhabetically

OrderedDict([('', 1),
             ('abang', 2),
             ('abeh', 1),
             ('actually', 2),
             ('acuh', 2),
             ('ada', 111),
             ('adik', 5),
             ('adjust', 1),
             ('adooi', 1),
             ('adoyai', 1),
             ('after', 1),
             ('again', 8),
             ('agak', 51),
             ('agar', 1),
             ('agent', 1),
             ('agr', 1),
             ('air', 3),
             ('ais', 1),
             ('aje', 7),
             ('akan', 16),
             ('akhir', 3),
             ('akhirat', 1),
             ('aktiviti', 1),
             ('aku', 6),
             ('alamat', 1),
             ('alangkah', 1),
             ('alhamdulillah', 66),
             ('alhamdullilah', 2),
             ('alhmdulillah', 1),
             ('all', 4),
             ('allah', 6),
             ('already', 1),
             ('also', 2),
             ('always', 1),
             ('amat', 2),
             ('ambil', 10),
         

### b. Count frequency of a word, w in each classes --> count(w,pos) and count(w,neg)

In [148]:
#convert list into dictionary with key as the word, word frequency as the value
docs = tc.SArray(sf_train['text'])
docs_cw = tc.text_analytics.count_words(docs)

In [96]:
#create count(w, pos)
from collections import OrderedDict

count_wdict_pos = OrderedDict(sorted(docs_cw[0].items())) # index 0 is positive sentiment

In [97]:
count_wdict_pos

OrderedDict([('abang', 2.0),
             ('abeh', 1.0),
             ('actually', 2.0),
             ('acuh', 2.0),
             ('ada', 111.0),
             ('adik', 5.0),
             ('adjust', 1.0),
             ('adooi', 1.0),
             ('adoyai', 1.0),
             ('after', 1.0),
             ('again', 8.0),
             ('agak', 51.0),
             ('agar', 1.0),
             ('agent', 1.0),
             ('agr', 1.0),
             ('air', 3.0),
             ('ais', 1.0),
             ('aje', 7.0),
             ('akan', 16.0),
             ('akhir', 3.0),
             ('akhirat', 1.0),
             ('aktiviti', 1.0),
             ('aku', 6.0),
             ('alamat', 1.0),
             ('alangkah', 1.0),
             ('alhamdulillah', 66.0),
             ('alhamdullilah', 2.0),
             ('alhmdulillah', 1.0),
             ('all', 4.0),
             ('allah', 6.0),
             ('already', 1.0),
             ('also', 2.0),
             ('always', 1.0),
             ('amat

In [98]:
count_pos = len(count_wdict_pos)
count_pos

1715

In [100]:
#create count(w, neg)
from collections import OrderedDict

count_wdict_neg = OrderedDict(sorted(docs_cw[1].items())) # index 1 is negative sentiment

In [101]:
count_wdict_neg

OrderedDict([('about', 1.0),
             ('acaner', 1.0),
             ('accept', 1.0),
             ('act', 1.0),
             ('ada', 147.0),
             ('adalah', 2.0),
             ('adik', 2.0),
             ('adil', 1.0),
             ('adjust', 1.0),
             ('aduan', 1.0),
             ('aduh', 4.0),
             ('adunan', 1.0),
             ('advice', 1.0),
             ('after', 3.0),
             ('agak', 31.0),
             ('ah', 1.0),
             ('ahad', 1.0),
             ('air', 4.0),
             ('ais', 3.0),
             ('ajar', 3.0),
             ('akan', 10.0),
             ('akg', 1.0),
             ('akhir', 2.0),
             ('akhirat', 1.0),
             ('aku', 7.0),
             ('alahai', 2.0),
             ('alam', 5.0),
             ('alamat', 1.0),
             ('alami', 1.0),
             ('alang', 1.0),
             ('alasan', 5.0),
             ('alert', 4.0),
             ('alhamdulillah', 7.0),
             ('all', 6.0),
             ('a

### c. Count frequency of words in each classes --> count(pos) and count(neg)

In [103]:
count_neg = len(count_wdict_neg)
count_neg

1983

### d. count total of word in all classes --> N

In [104]:
# N value
N = count_pos+count_neg
N

3698

## Step 2: Calculate PMI score for each word

In [157]:
# print all values needed in positive class
w = 'sama'
cwordpos = count_wdict_pos[w]
cword  = all_word_dict[w]
ctotalwordpos = count_pos
print ('count(w,pos):', cwordpos)
print ('count(pos):', ctotalwordpos)
print ('count(w):', cwordpos)
print ('N:', N)

count(w,pos): 6.0
count(pos): 1715
count(w): 6.0
N: 3698


In [162]:
#pmi score calculation
import math

def pmi(cw, N, cwsenti, cwwsenti):
    nomi= cwwsenti * N 
    denom = cw * cwsenti
    pmi_score = math.log(nomi/denom,2)
    return pmi_score

mypmi_score = pmi(6,3698,6,6)
mypmi_score

9.267567008683042

In [136]:
#save sFrame of adjusted TFIDF
tc.SFrame.save(sf_PMI_pos,"sframe_adjustedTFIDF_pos")
tc.SFrame.save(sf_PMI_neg,"sframe_adjustedTFIDF_neg")