# Create Lexicon Sentiment Score 
Created by Sabrina Tiun (September, 2022) 

Reference: https://www.researchgate.net/project/Estimating-Sentiment-through-Text-Analysis<br>
This notebook prepares several lexicon sentiment dictionaries:
- Using TF-IDF weighting score <br>
- Using unigram probability weighting score <br>
- Using the combination of TF-IDF and unigram probability weighting score<br>
- Using simple 1 and -1 score

In [41]:
import turicreate as tc

In [42]:
# Load the saved training SFrame that the lexicons are built from.
# The displayed frame has a `sentiment` label column and a `text` column of token lists.
sf_train = tc.load_sframe('sframe_train')
sf_train

sentiment,text
pos,"[sedap, taste, sedap, rasa, coklat, semua, ..."
neg,"[barang, tak, pos, apa, cerita, ni, service, ..."


### Adjusted TFIDF lexicon score with min value
Some lexicon terms receive a TF-IDF weight/score of zero, so the zero scores are adjusted by replacing them with a value just below the smallest non-zero score: <br>
Step 1: calculate the TF-IDF of each term in each review label <br>
Step 2: find the lowest non-zero value using the `sorted` function <br>

In [128]:
# Compute TF-IDF scores for the token list of each row in sf_train.
# The result holds one {term: score} dictionary per row, in row order:
# docs_tfidf[0] = positive, docs_tfidf[1] = negative.
# The cells below rely on those positions, so check the row order explicitly;
# a reordered sf_train would otherwise silently swap the two lexicons.
assert list(sf_train['sentiment']) == ['pos', 'neg'], \
    "sf_train rows must be ordered (pos, neg) for docs_tfidf[0]/[1] indexing"
docs = tc.SArray(sf_train['text'])
docs_tfidf = tc.text_analytics.tf_idf(docs)

In [129]:
# Negative lexicon: TF-IDF scores of row 1 (negative sentiment),
# ordered alphabetically by term.
from collections import OrderedDict

tfidf_NEG = OrderedDict(
    sorted(docs_tfidf[1].items(), key=lambda term_score: term_score[0])
)

In [130]:
# Replace zero TF-IDF scores with 0.6, a floor just below the lowest non-zero
# score observed in this lexicon (0.6931471805599453).
# The result is a list of (term, score) pairs. Iterating over dict(tfidf_NEG)
# accepts both the OrderedDict built earlier and the list this cell produces,
# so re-running the cell no longer fails on `.items()`.
tfidf_NEG = [
    (term, 0.6 if score == 0.0 else score)
    for term, score in dict(tfidf_NEG).items()
]

In [131]:
# Display the adjusted negative lexicon as (term, score) pairs
tfidf_NEG

[('about', 0.6931471805599453),
 ('acaner', 0.6931471805599453),
 ('accept', 0.6931471805599453),
 ('act', 0.6931471805599453),
 ('ada', 0.6),
 ('adalah', 1.3862943611198906),
 ('adik', 0.6),
 ('adil', 0.6931471805599453),
 ('adjust', 0.6),
 ('aduan', 0.6931471805599453),
 ('aduh', 2.772588722239781),
 ('adunan', 0.6931471805599453),
 ('advice', 0.6931471805599453),
 ('after', 0.6),
 ('agak', 0.6),
 ('ah', 0.6931471805599453),
 ('ahad', 0.6931471805599453),
 ('air', 0.6),
 ('ais', 0.6),
 ('ajar', 2.0794415416798357),
 ('akan', 0.6),
 ('akg', 0.6931471805599453),
 ('akhir', 0.6),
 ('akhirat', 0.6),
 ('aku', 0.6),
 ('alahai', 1.3862943611198906),
 ('alam', 3.4657359027997265),
 ('alamat', 0.6),
 ('alami', 0.6931471805599453),
 ('alang', 0.6931471805599453),
 ('alasan', 3.4657359027997265),
 ('alert', 2.772588722239781),
 ('alhamdulillah', 0.6),
 ('all', 0.6),
 ('allah', 0.6),
 ('almond', 0.6931471805599453),
 ('almost', 0.6931471805599453),
 ('alot', 0.6931471805599453),
 ('alsan', 0.693

sorted(sf_tfidif_neg['score'])
# The lowest value after 0 is 0.6931471805599453, hence 0.6 is used as the replacement for zero scores.
# NOTE(review): sf_tfidif_neg is not defined in any cell visible here — confirm where it is built.

In [133]:
# Positive lexicon: TF-IDF scores of row 0 (positive sentiment),
# ordered alphabetically by term.
tfidf_POS = OrderedDict(
    sorted(docs_tfidf[0].items(), key=lambda term_score: term_score[0])
)
tfidf_POS

OrderedDict([('', 0.6931471805599453),
             ('abang', 1.3862943611198906),
             ('abeh', 0.6931471805599453),
             ('actually', 1.3862943611198906),
             ('acuh', 1.3862943611198906),
             ('ada', 0.0),
             ('adik', 0.0),
             ('adjust', 0.0),
             ('adooi', 0.6931471805599453),
             ('adoyai', 0.6931471805599453),
             ('after', 0.0),
             ('again', 5.545177444479562),
             ('agak', 0.0),
             ('agar', 0.6931471805599453),
             ('agent', 0.6931471805599453),
             ('agr', 0.6931471805599453),
             ('air', 0.0),
             ('ais', 0.0),
             ('aje', 4.852030263919617),
             ('akan', 0.0),
             ('akhir', 0.0),
             ('akhirat', 0.0),
             ('aktiviti', 0.6931471805599453),
             ('aku', 0.0),
             ('alamat', 0.0),
             ('alangkah', 0.6931471805599453),
             ('alhamdulillah', 0.0),
          

sorted(sf_tfidif_pos['score'])
# The lowest value after 0 is 0.6931471805599453, hence 0.6 is used as the replacement for zero scores.
# NOTE(review): sf_tfidif_pos is not defined in any cell visible here — confirm where it is built.

In [134]:
# Replace zero TF-IDF scores with 0.6, a floor just below the lowest non-zero
# score observed in this lexicon (0.6931471805599453).
# The result is a list of (term, score) pairs. Iterating over dict(tfidf_POS)
# accepts both the OrderedDict built earlier and the list this cell produces,
# so re-running the cell no longer fails on `.items()`.
tfidf_POS = [
    (term, 0.6 if score == 0.0 else score)
    for term, score in dict(tfidf_POS).items()
]

In [135]:
# Display the adjusted positive lexicon as (term, score) pairs
tfidf_POS

[('', 0.6931471805599453),
 ('abang', 1.3862943611198906),
 ('abeh', 0.6931471805599453),
 ('actually', 1.3862943611198906),
 ('acuh', 1.3862943611198906),
 ('ada', 0.6),
 ('adik', 0.6),
 ('adjust', 0.6),
 ('adooi', 0.6931471805599453),
 ('adoyai', 0.6931471805599453),
 ('after', 0.6),
 ('again', 5.545177444479562),
 ('agak', 0.6),
 ('agar', 0.6931471805599453),
 ('agent', 0.6931471805599453),
 ('agr', 0.6931471805599453),
 ('air', 0.6),
 ('ais', 0.6),
 ('aje', 4.852030263919617),
 ('akan', 0.6),
 ('akhir', 0.6),
 ('akhirat', 0.6),
 ('aktiviti', 0.6931471805599453),
 ('aku', 0.6),
 ('alamat', 0.6),
 ('alangkah', 0.6931471805599453),
 ('alhamdulillah', 0.6),
 ('alhamdullilah', 1.3862943611198906),
 ('alhmdulillah', 0.6931471805599453),
 ('all', 0.6),
 ('allah', 0.6),
 ('already', 0.6931471805599453),
 ('also', 0.6),
 ('always', 0.6931471805599453),
 ('amat', 0.6),
 ('ambil', 0.6),
 ('amin', 1.3862943611198906),
 ('amt', 0.6931471805599453),
 ('amualaikum', 0.6),
 ('anak', 0.6),
 ('and',

In [136]:
# Persist the adjusted TF-IDF SFrames, one per sentiment.
# NOTE(review): sf_tfidif_pos / sf_tfidif_neg are not created in any cell visible
# here — confirm they are built from tfidf_POS / tfidf_NEG before this cell runs.
sf_tfidif_pos.save("sframe_adjustedTFIDF_pos")
sf_tfidif_neg.save("sframe_adjustedTFIDF_neg")