In [1]:
import string
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# Loading and preprocessing data

##### We will first load data and do preprocessing steps by removinf unwanted words and stopwords

In [2]:
df = pd.read_csv("rt_reviews.csv", encoding='cp1252')

In [3]:
df.head()

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [4]:
stop_words = ["a", "an", "the", "and", "or", "but", "is", "am", "are", "was", "were", "be", "being", "been", "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should", "can", "could", "may", "might", "must"]

In [5]:
review=[]
for i in df.Review:
    a=[]
    
    for j in i.split():
        word = j.lower()
        word = re.sub(r'[^\w\d\s_]+', '', word)
        if word not in stop_words:
            a.append(word)
    s = " ".join(a)
    s=s.strip()
    review.append(s)



In [6]:
df.Review = review
df.Review.head()

0    manakamana doesnt answer any questions yet mak...
1    wilfully offensive powered by chestthumping ma...
2    it difficult to imagine material more wrong fo...
3    despite gusto its star brings to role its hard...
4    if there good idea at core of this film its bu...
Name: Review, dtype: object

## Dividing dataset into 3 categories

###### Here we divide dataset into training, testing and development sets

In [7]:
len_of_dataset = len(df)
train_size = int(len_of_dataset*0.6)
validation_size = int(len_of_dataset*0.2)
test_size = int(len_of_dataset*0.2)

In [8]:
train_data = df[:train_size]
test_data = df[train_size:train_size+test_size]
validation_data = df[train_size+test_size:]

#### Creating count for every word in training dataset review

In [9]:
word_count={}
for sentence in train_data.Review:
    for word in sentence.split():
        if word in word_count:
            word_count[word]+=1
        else:
            word_count[word]=1
words = [words for words,values in word_count.items() if values>=5]

###### Mapping word to a number and reverse 

In [10]:
index_to_word = {}
for n,i in enumerate(words):
    index_to_word[n] = i

word_to_index= {}
for n,word in index_to_word.items():
    word_to_index[word] = n

In [11]:
index_to_word

{0: 'doesnt',
 1: 'answer',
 2: 'any',
 3: 'questions',
 4: 'yet',
 5: 'makes',
 6: 'its',
 7: 'point',
 8: 'like',
 9: 'rest',
 10: 'of',
 11: 'our',
 12: 'planet',
 13: 'picturesque',
 14: 'far',
 15: 'from',
 16: 'kingdom',
 17: 'wilfully',
 18: 'offensive',
 19: 'powered',
 20: 'by',
 21: 'chestthumping',
 22: 'machismo',
 23: 'good',
 24: 'clean',
 25: 'fun',
 26: 'it',
 27: 'difficult',
 28: 'to',
 29: 'imagine',
 30: 'material',
 31: 'more',
 32: 'wrong',
 33: 'for',
 34: 'spade',
 35: 'than',
 36: 'lost',
 37: 'found',
 38: 'despite',
 39: 'gusto',
 40: 'star',
 41: 'brings',
 42: 'role',
 43: 'hard',
 44: 'ride',
 45: 'shotgun',
 46: 'on',
 47: 'hectors',
 48: 'voyage',
 49: 'discovery',
 50: 'if',
 51: 'there',
 52: 'idea',
 53: 'at',
 54: 'core',
 55: 'this',
 56: 'film',
 57: 'buried',
 58: 'in',
 59: 'unsightly',
 60: 'pile',
 61: 'flatulence',
 62: 'jokes',
 63: 'bad',
 64: 'puns',
 65: 'ridiculous',
 66: 'serial',
 67: 'plot',
 68: 'gleeson',
 69: 'goes',
 70: 'hallmark'

In [12]:
word_to_index

{'doesnt': 0,
 'answer': 1,
 'any': 2,
 'questions': 3,
 'yet': 4,
 'makes': 5,
 'its': 6,
 'point': 7,
 'like': 8,
 'rest': 9,
 'of': 10,
 'our': 11,
 'planet': 12,
 'picturesque': 13,
 'far': 14,
 'from': 15,
 'kingdom': 16,
 'wilfully': 17,
 'offensive': 18,
 'powered': 19,
 'by': 20,
 'chestthumping': 21,
 'machismo': 22,
 'good': 23,
 'clean': 24,
 'fun': 25,
 'it': 26,
 'difficult': 27,
 'to': 28,
 'imagine': 29,
 'material': 30,
 'more': 31,
 'wrong': 32,
 'for': 33,
 'spade': 34,
 'than': 35,
 'lost': 36,
 'found': 37,
 'despite': 38,
 'gusto': 39,
 'star': 40,
 'brings': 41,
 'role': 42,
 'hard': 43,
 'ride': 44,
 'shotgun': 45,
 'on': 46,
 'hectors': 47,
 'voyage': 48,
 'discovery': 49,
 'if': 50,
 'there': 51,
 'idea': 52,
 'at': 53,
 'core': 54,
 'this': 55,
 'film': 56,
 'buried': 57,
 'in': 58,
 'unsightly': 59,
 'pile': 60,
 'flatulence': 61,
 'jokes': 62,
 'bad': 63,
 'puns': 64,
 'ridiculous': 65,
 'serial': 66,
 'plot': 67,
 'gleeson': 68,
 'goes': 69,
 'hallmark': 70

# Probability

In [11]:
num_fresh_data = train_data.Freshness.value_counts()["fresh"]
num_rotten_data = train_data.Freshness.value_counts()["rotten"]

##### Calculating probability of wach word

In [12]:
all_probability = {}

for i in word_count:
    prob = word_count[i]/len(train_data)
    all_probability[i] = prob

In [13]:
fresh_probability = num_fresh_data/len(train_data)
rotten_probability = num_rotten_data/len(train_data)

In [14]:
all_probability

{'manakamana': 1.388888888888889e-05,
 'doesnt': 0.029291666666666667,
 'answer': 0.0013819444444444445,
 'any': 0.025451388888888888,
 'questions': 0.003232638888888889,
 'yet': 0.015996527777777776,
 'makes': 0.02439236111111111,
 'its': 0.26983680555555556,
 'point': 0.00960763888888889,
 'nepal': 1.0416666666666666e-05,
 'like': 0.0732326388888889,
 'rest': 0.0036006944444444446,
 'of': 0.6430034722222222,
 'our': 0.011840277777777778,
 'planet': 0.0012847222222222223,
 'picturesque': 0.00018402777777777778,
 'far': 0.014479166666666666,
 'from': 0.07544444444444444,
 'peaceable': 1.0416666666666666e-05,
 'kingdom': 0.0007673611111111111,
 'wilfully': 7.638888888888889e-05,
 'offensive': 0.0013125,
 'powered': 0.00023958333333333332,
 'by': 0.07953125,
 'chestthumping': 2.777777777777778e-05,
 'machismo': 0.00019444444444444443,
 'good': 0.038322916666666665,
 'clean': 0.0006493055555555555,
 'fun': 0.021520833333333333,
 'it': 0.26091319444444444,
 'difficult': 0.00358333333333333

##### Calculating conditional probability of each word for fresh and rotten

In [15]:
fresh_words_conditional={x: 0 for x in words}
rotten_words_conditional={x: 0 for x in words}
for i,j in tqdm(train_data.iterrows(),total = len(train_data)):
    for k in j["Review"].split():
        if k in words:
            if j["Freshness"] == "fresh":
                fresh_words_conditional[k] += 1
            else:
                rotten_words_conditional[k] += 1

100%|█████████████████████████████████| 288000/288000 [02:31<00:00, 1904.88it/s]


In [16]:
fresh_probability_words = {}
rotten_probability_words = {}
for i in fresh_words_conditional:
    prob = fresh_words_conditional[i]/num_fresh_data
    fresh_probability_words[i] = prob

for i in rotten_words_conditional:
    prob = rotten_words_conditional[i]/num_rotten_data
    rotten_probability_words[i] = prob

In [41]:
for i in range(10):
    word = words[i]
    print(f"\tP({word}|fresh):", fresh_probability_words[word])
    print(f"\tP({word}|rotten):", rotten_probability_words[word])

	P(doesnt|fresh): 0.021634031809448503
	P(doesnt|rotten): 0.03696751002927087
	P(answer|fresh): 0.0012415811779067911
	P(answer|rotten): 0.0015226414700790522
	P(any|fresh): 0.019310402230684396
	P(any|rotten): 0.03160697773049941
	P(questions|fresh): 0.004078490126308342
	P(questions|rotten): 0.00238477636637952
	P(yet|fresh): 0.018380950399178753
	P(yet|rotten): 0.013606435419838836
	P(makes|fresh): 0.02888930506135076
	P(makes|rotten): 0.019884724221123695
	P(its|fresh): 0.27544374388746695
	P(its|rotten): 0.26421653491298697
	P(point|fresh): 0.006721185259171401
	P(point|rotten): 0.012500955996356785
	P(like|fresh): 0.06001900520909198
	P(like|rotten): 0.08647769225955823
	P(rest|fresh): 0.0025525244327916155
	P(rest|rotten): 0.004651356819556557


# Smoothing

### Adding Lidstone smoothing to the word counts

In [18]:
gama=1
smooth_fresh_words = {}
smooth_rotten_words={}
for i in fresh_words_conditional:
    smooth = (fresh_words_conditional[i]+gama)/(num_fresh_data + gama * len(words))
    smooth_fresh_words[i] = smooth

for i in rotten_words_conditional:
    smooth = (rotten_words_conditional[i]+gama)/(num_rotten_data + gama * len(words))
    smooth_rotten_words[i] = smooth

In [42]:
for i in range(10):
    word = words[i]
    print(f"\tP({word}|fresh):", smooth_fresh_words[word])
    print(f"\tP({word}|rotten):", smooth_rotten_words[word])

	P(doesnt|fresh): 0.017296242502189748
	P(doesnt|rotten): 0.029537224234076113
	P(answer|fresh): 0.0009978601443571009
	P(answer|rotten): 0.0012219235298038258
	P(any|fresh): 0.01543911390019181
	P(any|rotten): 0.02525493768189998
	P(questions|fresh): 0.00326522013903518
	P(questions|rotten): 0.001910644064784164
	P(yet|fresh): 0.014696262459392637
	P(yet|rotten): 0.010875119415254049
	P(makes|fresh): 0.023094918674398234
	P(makes|rotten): 0.01589056008531248
	P(its|fresh): 0.22015012251505106
	P(its|rotten): 0.21107618137788542
	P(point|fresh): 0.005377357444591044
	P(point|rotten): 0.009992001955077648
	P(like|fresh): 0.04797489827370195
	P(like|rotten): 0.06908866721468085
	P(rest|fresh): 0.002045613295932057
	P(rest|rotten): 0.0037213125680389237


# Dev Data


#### Predicting class for Development data without smoothing

In [20]:
prediction = []

for sentence in validation_data["Review"]:
    new_fresh_prob = fresh_probability
    new_rotten_prob = rotten_probability
    for word in sentence.split():
        if word in word_to_index:
            index = word_to_index[word]
            new_fresh_prob*= fresh_probability_words[word]
            new_rotten_prob*=rotten_probability_words[word]
    if new_fresh_prob>new_rotten_prob:
        prediction.append("fresh")
    else:
        prediction.append("rotten")

In [43]:
accuracy = sum(prediction == validation_data.Freshness)/len(validation_data)
print(f"Accuracy without smoothing = {accuracy}")

Accuracy without smoothing = 0.7934375


#### Predicting class for Development data with smoothing

In [45]:
prediction_smooth = []

for sentence in validation_data["Review"]:
    new_fresh_prob = fresh_probability+len(fresh_words_conditional)
    new_rotten_prob = rotten_probability+len(rotten_words_conditional)
    for word in sentence.split():
        if word in word_to_index:
            index = word_to_index[word]
            new_fresh_prob*= (fresh_words_conditional[word]+1)/(num_fresh_data+len(fresh_words_conditional))
            new_rotten_prob*=(rotten_words_conditional[word]+1)/(num_rotten_data+len(fresh_words_conditional))
    if new_fresh_prob>new_rotten_prob:
        prediction_smooth.append("fresh")
    else:
        prediction_smooth.append("rotten")

In [46]:
accuracy = sum(prediction_smooth == validation_data.Freshness)/len(validation_data)
print(f"Accuracy With smoothing = {accuracy}")

Accuracy With smoothing = 0.7950416666666666


In [47]:
print("We can observe from above that after smoothing the accuracy of development data increased slightly")

We can observe from above that after smoothing the accuracy of development data increased slightly


# Top 10 Words

In [25]:
fresh_words = {x:0 for x in words}
probs = np.array(list(smooth_fresh_words.values()))/np.array(list(smooth_rotten_words.values()))
for i,j in enumerate(fresh_words):
    fresh_words[j] = probs[i]
rotten_words = {x:0 for x in words}
probs = np.array(list(smooth_rotten_words.values()))/np.array(list(smooth_fresh_words.values()))
for i,j in enumerate(fresh_words):
    rotten_words[j] = probs[i]
    

In [26]:
fresh_top_10  = sorted(fresh_words, key=lambda x: fresh_words[x],reverse=True)[:10]
rotten_top_10 = sorted(rotten_words, key=lambda x: rotten_words[x],reverse=True)[:10]

In [34]:
print(f"Top 10 Fresh Words {fresh_top_10}\n\n\nTop 10 Rotten words{rotten_top_10}")

Top 10 Fresh Words ['spiderverse', 'unmissable', 'nimbly', 'ida', 'unadorned', 'koreedas', 'tonic', 'captivates', 'cannily', 'langs']


Top 10 Rotten words['charmless', 'mirthless', 'squanders', 'flavorless', 'unexciting', 'feeble', 'lifeless', 'thirdrate', 'unfunny', 'drearily']


In [52]:
for i in fresh_top_10:
    if i in words:
        print(f"Probability[Fresh|{i}] = {smooth_fresh_words[i]/(smooth_fresh_words[i]+smooth_rotten_words[i])}")
        print(f"Probability[Rotten|{i}] = {smooth_rotten_words[i]/(smooth_fresh_words[i]+smooth_rotten_words[i])}")
        print("\n")

Probability[Fresh|spiderverse] = 0.9755645580566952
Probability[Rotten|spiderverse] = 0.024435441943304832


Probability[Fresh|unmissable] = 0.9736355406463651
Probability[Rotten|unmissable] = 0.02636445935363476


Probability[Fresh|nimbly] = 0.9729230242705986
Probability[Rotten|nimbly] = 0.027076975729401365


Probability[Fresh|ida] = 0.9713758524834327
Probability[Rotten|ida] = 0.02862414751656727


Probability[Fresh|unadorned] = 0.9676826403970399
Probability[Rotten|unadorned] = 0.0323173596029601


Probability[Fresh|koreedas] = 0.9666054632814863
Probability[Rotten|koreedas] = 0.033394536718513675


Probability[Fresh|tonic] = 0.9666054632814863
Probability[Rotten|tonic] = 0.033394536718513675


Probability[Fresh|captivates] = 0.9666054632814863
Probability[Rotten|captivates] = 0.033394536718513675


Probability[Fresh|cannily] = 0.9654540029546397
Probability[Rotten|cannily] = 0.034545997045360266


Probability[Fresh|langs] = 0.9654540029546397
Probability[Rotten|langs] = 0.0345459

In [53]:
for i in rotten_top_10:
    if i in words:
        print(f"Probability[Rotten|{i}] = {smooth_rotten_words[i]/(smooth_fresh_words[i]+smooth_rotten_words[i])}")
        print(f"Probability[Fresh|{i}] = {smooth_fresh_words[i]/(smooth_fresh_words[i]+smooth_rotten_words[i])}")
        print("\n")

Probability[Rotten|charmless] = 0.9930201587479303
Probability[Fresh|charmless] = 0.006979841252069793


Probability[Rotten|mirthless] = 0.9824888198987324
Probability[Fresh|mirthless] = 0.017511180101267693


Probability[Rotten|squanders] = 0.9784184868571637
Probability[Fresh|squanders] = 0.02158151314283632


Probability[Rotten|flavorless] = 0.9778189751224217
Probability[Fresh|flavorless] = 0.022181024877578344


Probability[Rotten|unexciting] = 0.9750462155884785
Probability[Fresh|unexciting] = 0.024953784411521508


Probability[Rotten|feeble] = 0.9741856980703746
Probability[Fresh|feeble] = 0.025814301929625427


Probability[Rotten|lifeless] = 0.9719268262013655
Probability[Fresh|lifeless] = 0.028073173798634457


Probability[Rotten|thirdrate] = 0.9714811961284731
Probability[Fresh|thirdrate] = 0.028518803871526943


Probability[Rotten|unfunny] = 0.971234622392189
Probability[Fresh|unfunny] = 0.028765377607810973


Probability[Rotten|drearily] = 0.9706423610035381
Probability[Fre

# Test Data

In [49]:
prediction_smooth_test = []

for sentence in test_data["Review"]:
    new_fresh_prob = fresh_probability+len(fresh_words_conditional)
    new_rotten_prob = rotten_probability+len(rotten_words_conditional)
    for word in sentence.split():
        if word in word_to_index:
            index = word_to_index[word]
            new_fresh_prob*= (fresh_words_conditional[word]+1)/(num_fresh_data+len(fresh_words_conditional))
            new_rotten_prob*=(rotten_words_conditional[word]+1)/(num_rotten_data+len(fresh_words_conditional))
    if new_fresh_prob>new_rotten_prob:
        prediction_smooth_test.append("fresh")
    else:
        prediction_smooth_test.append("rotten")

In [50]:
accuracy = sum(prediction_smooth_test == test_data.Freshness)/len(test_data)
print(f"Accuracy of Testing data with optimal parameters is {accuracy}")

Accuracy of Testing data with optimal parameters is 0.7960208333333333
