In [1]:
import pandas as pd
import re
import numpy as np
from collections import Counter
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
nltk.download('punkt') 
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/sherryliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sherryliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Part1: TF-IDF

In [2]:
df = pd.read_csv('amazon_fine_foods.csv')
df.shape

(11903, 10)

In [3]:
df.drop_duplicates(['Summary', 'Text'], inplace=True)
df.shape

(4938, 10)

In [4]:
df.ProductId.unique()

array(['B002QWP89S', 'B000KV61FC', 'B0013NUGDE', 'B001EO5Q64',
       'B003B3OOPA', 'B0013A0QXC', 'B005K4Q1YA', 'B007JFMH8M'],
      dtype=object)

In [5]:
df = df[['ProductId', 'Summary', 'Text', 'Score']]

In [6]:
df['text_all'] = df['Summary'] + ' ' + df['Text']

In [7]:
def get_vec(words, stop=None):
    '''Set up countvectorizer with several parameters, print shape of vectorizer and return vectorizer in DataFrame'''
    vectorizer = CountVectorizer(stop_words=stop, lowercase=True, min_df=0.001) # only keep keywords that appear in more than 0.1% of the reviews
    X = vectorizer.fit_transform(words) 
    X = X.toarray()
    print(X.shape)
    feature = vectorizer.get_feature_names()
    corpus_df = pd.DataFrame(X, columns=feature)
    return corpus_df

In [8]:
corpus_df = get_vec(df['text_all'].tolist(), 'english')
corpus_df.sum().sort_values(ascending=False)[:10]

(4938, 3147)


br         3821
great      2443
oil        2271
good       2055
coconut    1924
love       1911
like       1899
coffee     1820
product    1674
just       1413
dtype: int64

To choose 2-4 products as target for TF-IDF, I group reviews by product ID and observe the top 3 keywords for each product. 

In [9]:
def get_product_vec(df, stop=None):
    pid = df.ProductId.unique().tolist()
    overall = pd.DataFrame(columns=[1, 2, 3], index=pid)
    for i in pid:
        review = df[df['ProductId'] == i]['text_all'].tolist()
        vectorizer = CountVectorizer(stop_words=stop, lowercase=True, min_df=0.001)
        X = vectorizer.fit_transform(review) 
        X = X.toarray()
        feature = vectorizer.get_feature_names()
        good_df = pd.DataFrame(X, columns=feature)
        keyword = good_df.sum().sort_values(ascending=False)[:3].index.tolist()
        for j in range(3):
            overall.loc[i, j+1] = keyword[j]
    return overall

In [10]:
overall = get_product_vec(df, 'english')
overall

Unnamed: 0,1,2,3
B002QWP89S,greenies,dog,dogs
B000KV61FC,toy,dog,br
B0013NUGDE,chips,br,flavor
B001EO5Q64,oil,coconut,br
B003B3OOPA,oil,coconut,hair
B0013A0QXC,coffee,senseo,pods
B005K4Q1YA,coffee,cappuccino,like
B007JFMH8M,cookie,cookies,soft


I choose B001EO5Q64 and B003B3OOPA as target since they share similar keywords.

In [11]:
picked_pid = ['B001EO5Q64', 'B003B3OOPA']
df = df[df['ProductId'].isin(picked_pid)]
df.shape

(1190, 5)

In [12]:
good_review = df[df['Score'] >= 4]['text_all'].tolist()
poor_review = df[df['Score'] < 4]['text_all'].tolist()

In [13]:
good_review_vec = get_vec(good_review, 'english')
good_review_vec.sum().sort_values(ascending=False)[:10]

(1124, 2929)


oil        2130
coconut    1840
br         1209
use         961
hair        864
product     845
great       845
skin        662
love        598
good        536
dtype: int64

In [14]:
poor_review_vec = get_vec(poor_review, 'english')
poor_review_vec.sum().sort_values(ascending=False)[:10]

(66, 1205)


oil          97
br           96
coconut      83
product      63
hair         47
like         35
use          34
skin         30
just         27
saturated    24
dtype: int64

In [15]:
## Reference: https://gist.github.com/gaurav5430/9fce93759eb2f6b1697883c3782f30de#file-nltk-lemmatize-sentences-py
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def lem(sentence):
    '''Intake a list of review, lemmatize them and return a new list'''
    result_lem = []
    for s in tqdm(sentence):
        s_lem = lemmatize_sentence(s)
        result_lem.append(s_lem)
    return result_lem

In [16]:
stopword_list = stopwords.words("english")
stopword_list.append('br')

def get_vec(words, stop=stopword_list):
    '''Revise the function by changing stop_words and adding token_pattern'''
    vectorizer = CountVectorizer(stop_words=stop, lowercase=True, min_df=0.001, token_pattern=r'[a-zA-Z\_]{3,}', binary=True)
    X = vectorizer.fit_transform(words) 
    X = X.toarray()
    print(X.shape)
    feature = vectorizer.get_feature_names()
    corpus_df = pd.DataFrame(X, columns=feature)
    return corpus_df

In [17]:
def word_replace_post_lem(line):
    '''After lemmitization, do the regex cleaning'''
    line = re.sub(r'\b((extra virgin )?coconut oil)\b', '_coconut_oil_', line)
    line = re.sub(r'\b(olive oil)\b', '_olive_oil_', line)
    line = re.sub(r'\b(buy|purchase|order)\b', '_buy_', line)
    line = re.sub(r'\b(jar|container)\b', '_container_', line)
    return line

In [18]:
def regex_lem(line):
    line = lem(line)
    line = [word_replace_post_lem(review.lower()) for review in line]
    return line

In [19]:
good_review_lem = regex_lem(good_review)
good_review_vec = get_vec(good_review_lem)
good_review_vec.sum().sort_values(ascending=False)[:10]

100%|██████████████████████████████████████| 1124/1124 [00:04<00:00, 231.97it/s]


(1124, 2488)


use              779
_coconut_oil_    714
great            565
product          558
love             433
oil              431
good             406
hair             405
_buy_            403
skin             390
dtype: int64

In [20]:
poor_review_lem = regex_lem(poor_review)
poor_review_vec = get_vec(poor_review_lem)
poor_review_vec.sum().sort_values(ascending=False)[:10]

100%|██████████████████████████████████████████| 66/66 [00:00<00:00, 269.57it/s]

(66, 1063)





product          42
use              39
_coconut_oil_    34
_buy_            33
like             28
oil              25
good             22
_container_      21
get              20
would            18
dtype: int64

To better understand the relationship and interpret the result of word count, I choose 2 as the n for n-gram.

In [21]:
def get_tfidf_vec(review):
    '''Intake a list of review and return tf-idf report'''
    vectorizer = TfidfVectorizer(ngram_range=(2,2),
                             token_pattern=r'\b[a-zA-Z\_]{3,}\b',
                             max_df=0.4, stop_words=stopword_list, max_features=1000, binary=True)
    X = vectorizer.fit_transform(review)
    terms = vectorizer.get_feature_names()
    tf_idf = pd.DataFrame(X.toarray().transpose(), index=terms)
    tf_idf = tf_idf.sum(axis=1)
    score = pd.DataFrame(tf_idf, columns=["score"])
    score.sort_values(by="score", ascending=False, inplace=True)
    return score

In [22]:
score_good = get_tfidf_vec(good_review_lem)
score_good[:10]

Unnamed: 0,score
great product,27.158786
use _coconut_oil_,23.42557
use hair,22.490846
also use,19.986221
organic _coconut_oil_,19.149359
_coconut_oil_ use,18.323303
use cook,17.336455
love _coconut_oil_,16.797652
coconut flavor,16.215402
love product,16.160967


In [23]:
score_poor = get_tfidf_vec(poor_review_lem)
score_poor[:10]

Unnamed: 0,score
plastic _container_,1.59296
_buy_ product,1.371997
read review,0.954246
_coconut_oil_ product,0.897701
hair skin,0.891208
use _coconut_oil_,0.84334
use cook,0.83399
product good,0.810991
one tub,0.80841
would _buy_,0.80763


Among good reviews, people frequently mention how they use the product. People usually use the product for their hair, skin and also cooking. "Flavor" and "taste" are also frequently mentioned in good reviews. In contrast, we can see that customers often complain about plastic containers. To be more specific, we also observe "oil leak" appears in top keywords in poor reviews. The manufacturer should revise thier design of containers and find out the reason that lead to leakage. Next, we can see that "read review" is in top keywords for poor review. After reading comments, we find that people often read the good review and give it a try but eventually find out the product doesn't meet their needs. We suggest the vendor can make a more detailed description in product introduction to let customers know what they should expect from the product, suitable type of hair & skin, etc. 

Overall, I think TF-IDF make sense since it can capture the correct keywords. However, it can not distinguish the difference of importance in 'summary' and 'text'. The content in summary should be more important in practice. In addition, I think we should also compare TF-IDF of good reviews and bad reviews so that we can capture keywords more precisely. Currently they share several keywords so that it's hard to tell what keywords are actually significant for good and poor reviews respectively.  

## Part 2: Similarity and Word Embeddings

In [26]:
df = pd.read_csv('amazon_fine_foods.csv')
df.drop_duplicates(['Summary', 'Text'], inplace=True)
df['text_all'] = df['Summary'] + ' ' + df['Text']
review_all = df['text_all'].tolist()

In [27]:
review_all_lem = lem(review_all)

100%|██████████| 4938/4938 [00:13<00:00, 363.89it/s]


In [28]:
def tfidf(review):
    '''Intake a list of review and return word count dataframe'''
    vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern=r'\b[a-zA-Z\_]{3,}\b',
                             max_df=0.4, stop_words=stopword_list, max_features=1000, binary=True)
    X = vectorizer.fit_transform(review)
    terms = vectorizer.get_feature_names()
    tf_idf = pd.DataFrame(X.toarray(), columns=terms)
    return tf_idf

In [29]:
review_all_vec = tfidf(review_all_lem)
review_all_vec.head(10)

Unnamed: 0,able,absolute,absolutely,absorb,acid,acne,across,actual,actually,add,...,wrap,write,wrong,www,year,yes,yet,young,yum,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.154158,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.16259,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cosine similarity

In [30]:
distances = cosine_similarity(review_all_vec)
distances_df = pd.DataFrame(distances)
distances_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4928,4929,4930,4931,4932,4933,4934,4935,4936,4937
0,1.0,0.015773,0.025754,0.028843,0.022723,0.074381,0.069288,0.126992,0.105333,0.046702,...,0.020708,0.007243,0.025913,0.042291,0.072428,0.0,0.012497,0.0,0.01322,0.165724
1,0.015773,1.0,0.23487,0.113478,0.075278,0.065386,0.090242,0.122397,0.055267,0.054684,...,0.010391,0.079465,0.012979,0.0,0.030799,0.140169,0.0,0.080257,0.013266,0.0
2,0.025754,0.23487,1.0,0.399004,0.122919,0.045639,0.191024,0.039848,0.090243,0.055305,...,0.047929,0.041316,0.021193,0.0,0.050291,0.021654,0.021185,0.040691,0.046033,0.0
3,0.028843,0.113478,0.399004,1.0,0.137659,0.071574,0.168491,0.024777,0.101064,0.0,...,0.0,0.04997,0.023735,0.05404,0.022292,0.024251,0.0,0.021444,0.024259,0.0
4,0.022723,0.075278,0.122919,0.137659,1.0,0.056569,0.073732,0.105518,0.118657,0.0,...,0.0,0.029893,0.0,0.0,0.0,0.078405,0.0,0.0,0.0,0.0
5,0.074381,0.065386,0.045639,0.071574,0.056569,1.0,0.210429,0.102702,0.116293,0.054789,...,0.038405,0.062456,0.031926,0.023629,0.0,0.05215,0.0,0.0,0.0,0.095588
6,0.069288,0.090242,0.191024,0.168491,0.073732,0.210429,1.0,0.216015,0.242393,0.05457,...,0.02875,0.102446,0.0,0.018178,0.018226,0.0,0.034615,0.069412,0.059117,0.056481
7,0.126992,0.122397,0.039848,0.024777,0.105518,0.102702,0.216015,1.0,0.180838,0.097907,...,0.02745,0.065668,0.0,0.019918,0.049721,0.057788,0.010735,0.0,0.032956,0.044358
8,0.105333,0.055267,0.090243,0.101064,0.118657,0.116293,0.242393,0.180838,1.0,0.096136,...,0.037499,0.042253,0.05413,0.016997,0.087247,0.077311,0.014809,0.061809,0.062742,0.032419
9,0.046702,0.054684,0.055305,0.0,0.0,0.054789,0.05457,0.097907,0.096136,1.0,...,0.006808,0.021541,0.0,0.037896,0.0,0.0,0.011534,0.011729,0.07048,0.00994


In [31]:
similarity_table = distances_df.rename_axis(None).rename_axis(None, axis=1).stack().reset_index()
similarity_table.columns = ["review1", "review2", "similarity"]
def sort_indices(index1, index2)-> str:
    indices = [str(index1), str(index2)]
    indices.sort()
    return "".join(indices)

similarity_table["index"] = similarity_table.apply(lambda x: sort_indices(x.review1, x.review2), axis=1)

In [32]:
similarity_table.drop_duplicates("index", inplace=True)

In [33]:
top_5_most_similar = similarity_table[similarity_table.similarity < 0.999].sort_values(by="similarity", ascending=False).head(5)
top_5_most_similar

Unnamed: 0,review1,review2,similarity,index
17207674,3484,3682,0.992418,3484.03682.0
2612773,529,571,0.991852,529.0571.0
8949482,1812,1826,0.980817,1812.01826.0
19153443,3878,3879,0.975672,3878.03879.0
21247579,4302,4303,0.972717,4302.04303.0


In [34]:
print("Review 1:", review_all_lem[3484])
print("Review 2:", review_all_lem[3682])

Review 1: Great Taste Grove Square Cappuccino Cups be excellent . Tasted really good right from the Keurig brewer with nothing add . wWould highly recommend . RCCJR
Review 2: Excellent taste Grove Square Cappuccino Cups be excellent . Tasted really good right from the Keurig brewer with nothing add . wWould highly recommend . RCCJR


### Euclidean_distances

In [35]:
distances_e = euclidean_distances(review_all_vec)
distances_e_df = pd.DataFrame(distances_e)
distances_e_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4928,4929,4930,4931,4932,4933,4934,4935,4936,4937
0,0.0,1.403016,1.395884,1.39367,1.398054,1.360602,1.36434,1.321369,1.33766,1.380795,...,1.399494,1.409083,1.39577,1.383987,1.362037,1.414214,1.405349,1.414214,1.404834,1.291725
1,1.403016,0.0,1.237036,1.331557,1.359942,1.367197,1.348894,1.324842,1.374579,1.375002,...,1.406847,1.35686,1.405006,1.414214,1.392265,1.311359,1.414214,1.356276,1.404802,1.414214
2,1.395884,1.237036,0.0,1.096354,1.324447,1.381565,1.271988,1.38575,1.348894,1.374551,...,1.379907,1.38469,1.399147,1.414214,1.378194,1.398818,1.399153,1.385142,1.38128,1.414214
3,1.39367,1.331557,1.096354,0.0,1.313272,1.362664,1.28958,1.396583,1.340848,1.414214,...,1.414214,1.378427,1.39733,1.375471,1.398362,1.39696,1.414214,1.398968,1.396954,1.414214
4,1.398054,1.359942,1.324447,1.313272,0.0,1.373631,1.361079,1.337521,1.327662,1.414214,...,1.414214,1.392916,1.414214,1.414214,1.414214,1.357641,1.414214,1.414214,1.414214,1.414214
5,1.360602,1.367197,1.381565,1.362664,1.373631,0.0,1.256639,1.339625,1.329441,1.374926,...,1.386792,1.369339,1.391456,1.397406,1.414214,1.376844,1.414214,1.414214,1.414214,1.344926
6,1.36434,1.348894,1.271988,1.28958,1.361079,1.256639,0.0,1.252186,1.230941,1.375086,...,1.393736,1.339816,1.414214,1.401301,1.401266,1.414214,1.389521,1.364249,1.371775,1.373695
7,1.321369,1.324842,1.38575,1.396583,1.337521,1.339625,1.252186,0.0,1.27997,1.3432,...,1.394669,1.366991,1.414214,1.400059,1.378607,1.372743,1.406602,1.414214,1.390715,1.382492
8,1.33766,1.374579,1.348894,1.340848,1.327662,1.329441,1.230941,1.27997,0.0,1.344518,...,1.387444,1.384014,1.375405,1.402143,1.351113,1.358447,1.403703,1.369811,1.369129,1.391101
9,1.380795,1.375002,1.374551,1.414214,1.414214,1.374926,1.375086,1.3432,1.344518,0.0,...,1.409391,1.398899,1.414214,1.387158,1.414214,1.414214,1.406034,1.405896,1.363466,1.407167


In [36]:
similarity_e_table = distances_e_df.rename_axis(None).rename_axis(None, axis=1).stack().reset_index()
similarity_e_table.columns = ["review1", "review2", "similarity"]
similarity_e_table["index"] = similarity_e_table.apply(lambda x: sort_indices(x.review1, x.review2), axis=1)

In [37]:
similarity_e_table.drop_duplicates("index", inplace=True)

In [38]:
top_5_most_similar_e = similarity_e_table[similarity_e_table.similarity > 0].sort_values(by="similarity", ascending=True).head(5)
top_5_most_similar_e

Unnamed: 0,review1,review2,similarity,index
7255393,1469,1471,2.107342e-08,1469.01471.0
7255392,1469,1470,2.107342e-08,1469.01470.0
7260331,1470,1471,2.107342e-08,1470.01471.0
17207674,3484,3682,0.123146,3484.03682.0
2612773,529,571,0.1276537,529.0571.0


In [39]:
print("Review 1:", review_all_lem[1469])
print("Review 2:", review_all_lem[1471])

Review 1: This picky eater love them ! I be a little hesitant to try these , especially after read such mixed review although overall they be positive . However , I like the idea that they be healthy than regular chip and I figure they would be great for school lunch , quick snack , etc . I decide to give them a try and figured if we do not like them , they could be donate . I order a mixed case the first time around to see which flavor we like more ( or less ) and think that would give us a good sampling . After decide we like most of the flavor , we decide to try other one as well. < br / > < br / > Despite the flavor preference , these chip do not taste like cardboard . At first taste , they be a little crunchier and thicker than you might expect . I think this be a good thing , because it mean they be not heavily-laden with oil . The 0.8 oz bag be 100 calorie each as well , which be much good than the other alternative and give you the enjoyment of chip without the extra calorie , 

## Part 3: Naive Bayes

In [40]:
documents = [
    ["Love this movie. Can’t wait!", "Yes"],
    ["I want to see this movie so bad.", "Yes"],
    ["This movie looks amazing.", "Yes"],
    ["Looks bad.", "No"],
    ["Hard pass to see this bad movie.", "No"],
    ["So boring!", "No"]
]

In [41]:
for document in documents:
    document[0] = document[0].lower().replace('.', '').replace('!', '')

In [42]:
corpus = set()
stop = ['to', 'this']

# Build corpus
for document in documents:
    text = document[0]
    class_value = document[1]
    for word in text.split():
        if word not in stop:
            corpus.add(word)

In [43]:
conditional_probabilities = pd.DataFrame(index=list(corpus), 
                                         columns=["likelihood_given_yes", "likelihood_given_no"])

### Prior

In [44]:
yes_documents = 0
no_documents = 0
for doc, label in documents:
    if label == "Yes":
        yes_documents += 1
    else:
        no_documents += 1
    
p_yes = yes_documents / (no_documents + yes_documents)
p_no = no_documents / (no_documents + yes_documents)
print(p_yes, p_no)

0.5 0.5


### Likelihood

In [45]:
for word in corpus:
    yes_documents_with_word = 0
    no_documents_with_word = 0
    for document in documents:
        document_class = document[1]
        if word in document[0].split():
            if document[1] == "Yes":
                yes_documents_with_word += 1
            else:
                no_documents_with_word += 1
    conditional_probabilities.loc[word, "likelihood_given_yes"] = yes_documents_with_word * 1.0 / yes_documents
    conditional_probabilities.loc[word, "likelihood_given_no"] = no_documents_with_word * 1.0 / no_documents
conditional_probabilities.head(10)

Unnamed: 0,likelihood_given_yes,likelihood_given_no
hard,0.0,0.333333
movie,1.0,0.333333
amazing,0.333333,0.0
wait,0.333333,0.0
i,0.333333,0.0
want,0.333333,0.0
see,0.333333,0.333333
boring,0.0,0.333333
can’t,0.333333,0.0
so,0.333333,0.333333


### Posterior

In [46]:
test_document = "This looks so bad."

In [47]:
from typing import Dict, Tuple
def get_likelihood(test_document: str, conditional_probabilities: Dict, stop)-> Tuple[float, float]:
    test_document = test_document.lower().replace('.', '').replace('!', '')
    likelihood_yes = 1
    likelihood_no = 1
    for word in test_document.split():
        if word not in stop:
            likelihood_yes = likelihood_yes * conditional_probabilities.loc[word, "likelihood_given_yes"]
            likelihood_no = likelihood_no * conditional_probabilities.loc[word, "likelihood_given_no"]
    return likelihood_yes, likelihood_no

In [48]:
likelihood_yes, likelihood_no = get_likelihood(test_document, conditional_probabilities, stop)

In [49]:
print(likelihood_yes, likelihood_no)

0.037037037037037035 0.07407407407407407


In [50]:
def get_posterior(likelihood_yes: float, likelihood_no: float, p_yes: float, p_no: float)-> float:
    posterior_yes = likelihood_yes * p_yes / (likelihood_yes * p_yes + likelihood_no * p_no)
    posterior_no = likelihood_no * p_no / (likelihood_yes * p_yes + likelihood_no * p_no)
    return posterior_yes, posterior_no

In [51]:
get_posterior(likelihood_yes, likelihood_no, p_yes, p_no)

(0.3333333333333333, 0.6666666666666666)

To sum up, this sentence is more likely a "not intent to buy" review.