In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 1. Read training data (reviews) and training results (ratings)

In [2]:
# Load the raw training reviews: one list entry per line of the file,
# with each line's trailing newline kept.
with open('../Datasets/Movie Rating Prediction/imdb_trainX.txt') as review_file:
    reviews = list(review_file)

In [3]:
train_ratings = pd.read_csv('../Datasets/Movie Rating Prediction/imdb_trainY.txt', header=None)

In [4]:
print(train_ratings.shape)
train_ratings.head()

(25000, 1)


Unnamed: 0,0
0,10
1,8
2,7
3,8
4,8


In [5]:
print(len(reviews), type(reviews))

25000 <class 'list'>


In [6]:
print(reviews[0])

I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language.



In [7]:
# print(reviews[1])
print(len(reviews[1]))

2970


### 2. Clean (tokenize, remove stopwords, and stem) reviews

In [8]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [9]:
# Shared text-processing helpers used by clean_review and the vocabulary cells.
tk = RegexpTokenizer(r'\w+')  # a token is a maximal run of word characters

# Stored as a set: clean_review tests every token of every review for
# membership, which is a linear scan on the list NLTK returns.
sw = set(stopwords.words('english'))

# Alternative normalisers; clean_review currently uses the lemmatizer `l`.
ss = SnowballStemmer('english')
ls = LancasterStemmer()
ps = PorterStemmer()
l = WordNetLemmatizer()

In [10]:
def clean_review(review):
    """Normalise one raw review into a space-separated string of lemmas.

    The text is lower-cased, every ``<br />`` is replaced by a space, the
    result is tokenised with ``tk``, tokens found in the stop-word collection
    ``sw`` are dropped (except 'not', which is always kept), and each
    surviving token is lemmatised with ``l``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces ('' when nothing survives).
    """
    text = review.lower().replace("<br />", " ")

    kept = []
    for token in tk.tokenize(text):
        # 'not' is whitelisted even if it appears in the stop-word collection.
        if token == 'not' or token not in sw:
            # Lemmatise rather than stem (ss.stem(token) is the stemming variant).
            kept.append(l.lemmatize(token))

    return ' '.join(kept)

In [11]:
print(reviews[0])
print(clean_review(reviews[0]))
print(len(clean_review(reviews[1])))

I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language.

loved movie since 7 saw opening day touching beautiful strongly recommend seeing movie watch family far mpaa rating pg 13 thematic element prolonged scene disastor nudity sexuality language
1849


In [12]:
clean_reviews = [clean_review(x) for x in reviews]
print(len(clean_reviews))

25000


In [13]:
print(reviews[321])
print()
print(clean_reviews[321])

My observations: vamp outfit at end is ravishing and wonderful, exotic and fantastic. Jeanette wore it well, and got even with naive Nelson. Boat crashing into his balcony served him right. Costume outfits of his female mafia were designed surprisingly well, especially by today's standards. 1942 costume designer did great job. Main song theme just lovely.<br /><br />Caution to negative posters: 1942 was time of WW II; Pearl Harbor happened year before. U.S. just coming out of Great Depression; needed to get out and spend that hard earned money on diversion of singing, dance and yes, fantastic fantasy. Despotic dictators were trying to rule out there in RL, snuffing out freedoms. Thank goodness the public had these fantastic plot line movies to attend. Movie going was a privileged treat, in those depressing times. When you, negative posters, become actors or even movie stars, then YOU have room to talk and criticize. Jeanette's and Nelson's movies stand the test of time.<br /><br />Ange

### 3. Vectorize

In [None]:
# Create vocab: token -> number of occurrences over all cleaned training reviews
vocab = {}

for cleaned in clean_reviews:
    for token in tk.tokenize(cleaned):
        # A token seen for the first time starts from a count of 0.
        vocab[token] = vocab.get(token, 0) + 1

In [None]:
print(len(vocab))
# print(vocab)

In [None]:
# Scratch check of how nested rows accumulate: each step adds one inner list
# as a single element of x.
x = []
x.append([1, 2])
x.append([3, 4])
print(x)

In [None]:
# Create frequency vector: freq[i][j] is the number of times the j-th vocab
# word (in vocab's iteration order) occurs as a token of the i-th cleaned review.

# Resolve each vocab word to its column once, so a review costs one dictionary
# lookup per token rather than one scan of the whole review per vocab word.
vocab_index = {word: col for col, word in enumerate(vocab)}

freq = []
for r in clean_reviews:
    f = [0] * len(vocab_index)
    # Count whole tokens: str.count on the joined review string would count
    # substrings, e.g. 'act' inside 'actor' and 'character'.
    for w in tk.tokenize(r):
        col = vocab_index.get(w)
        if col is not None:
            f[col] += 1
    freq.append(f)

In [None]:
len(freq[0])

In [None]:
freq = np.array(freq)
print(freq.shape)

In [13]:
# Label vector first, so it is available to the hand-written Naive Bayes cells
# even if the feature matrix cannot be built.
Y = train_ratings.values.reshape((-1,))
# Feature matrix for the scikit-learn cells (mnb.fit / mnb.score read X).
X = freq

In [14]:
print(type(Y), (Y.shape))

<class 'numpy.ndarray'> (25000,)


In [15]:
print(Y.reshape((-1,))[1])

8


### ScikitLearn Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [None]:
mnb = MultinomialNB()
# gnb = GaussianNB()
# bnb = BernoulliNB()

In [None]:
mnb.fit(X, Y.reshape((-1,)))
# gnb.fit(X, Y.reshape((-1,)))
# bnb.fit(X, Y.reshape((-1,)))

In [None]:
# Only the multinomial model is created and fitted in this notebook; the
# Gaussian and Bernoulli variants are commented out, so scoring them would
# raise NameError.
print(mnb.score(X, Y))
# print(gnb.score(X, Y))
# print(bnb.score(X, Y))

In [None]:
mnb.predict(freq[1].reshape((1,-1)))

In [None]:
print(Y[1])

In [16]:
# Load the raw test reviews: one list entry per line of the file,
# with each line's trailing newline kept.
with open('../Datasets/Movie Rating Prediction/imdb_testX.txt') as test_file:
    testers = list(test_file)

In [17]:
test_y = pd.read_csv("../Datasets/Movie Rating Prediction/imdb_testY.txt", header=None)

In [18]:
y_test = test_y.values

In [20]:
print(testers[503])

The problem with so many people watching this movie is the mindset they watch it in. People come looking for a B-Grade horror film, or a "So Bad It's Good" movie. Jack Frost 2 is neither of these.<br /><br />It is, to put it simply, a very good movie cleverly hidden inside a very bad one. To view it as anything other than a screwball comedy (easily funnier than all three absolutely meritless "Scary Movies" combined) is to misinterpret the movie on a basic level. It would be like watching Shawshank Redemption and then complaining that there were no explosions.<br /><br />The premise is simple; the characters from the first movie, haunted by memories of Jack Frost, take a vacation to a tropical island. A new, improved Jack comes after them, now with essentially the powers of Hydro-Man from Spider-Man; essentially, he can turn from water to snow easily and quickly, divide himself, multiply himself, and, worst of all, he's managed to grow an immunity to his only former weakness...AntiFreez

In [19]:
print(len(testers))

25000


In [20]:
print(y_test.shape)
y_test = y_test.reshape((-1,))
print(y_test.shape)

(25000, 1)
(25000,)


In [21]:
clean_test = [clean_review(x) for x in testers]
print(len(clean_test))

25000


In [None]:
clean_test[503]

In [None]:
# Frequency vectors for the test reviews, laid out on the training vocab's
# columns: test_freq[i][j] counts the j-th vocab word in the i-th test review.
vocab_index = {word: col for col, word in enumerate(vocab)}

test_freq = []
for rt in clean_test:
    f1 = [0] * len(vocab_index)
    # Count whole tokens: str.count on the joined review string would count
    # substrings, e.g. 'act' inside 'actor' and 'character'.
    for w in tk.tokenize(rt):
        col = vocab_index.get(w)
        # Words that are not in the training vocab have no column; skip them.
        if col is not None:
            f1[col] += 1
    test_freq.append(f1)

In [None]:
test_freq = np.array(test_freq)
print(test_freq.shape)

In [None]:
X_test = test_freq
Y_test = y_test
print(X_test.shape, Y_test.shape)

In [None]:
# res = mnb.predict(X_test)
# np.sum((res == Y_test)/Y_test.shape[0])

In [None]:
mnb.score(X_test, Y_test)

### Naive Bayes Implementation


$ P(Y|X) = \frac{P(Y) P(X|Y)}{P(X)} = \frac{P(Y = c) \prod_{i=1}^{n} P(x_i | Y = c)}{P(X)} $ 

where, 
- P(Y) is prior probability
- P(X|Y) is conditional probability

In [22]:
def prior_prob(y_train, label):
    """Return the log prior ``log P(Y = label)`` estimated from ``y_train``.

    Parameters
    ----------
    y_train : numpy.ndarray
        Training labels; the first axis indexes the examples.
    label : int
        Class whose prior is wanted.

    Returns
    -------
    float
        ``log(count(label) / number of examples)``, or ``-inf`` when ``label``
        never occurs in ``y_train`` (which includes an empty ``y_train``).
    """
    total_examples = y_train.shape[0]
    label_examples = np.sum(y_train == label)
    if label_examples == 0:
        # log(0) is -inf; return it directly so numpy does not emit a
        # divide-by-zero RuntimeWarning for classes absent from the data.
        return -np.inf
    return np.log(label_examples/float(total_examples))

In [23]:
# print("Rating = 1:", prior_prob(Y, 1))
# print("Rating = 2:", prior_prob(Y, 2))
# print("Rating = 3:", prior_prob(Y, 3))
# print("Rating = 4:", prior_prob(Y, 4))
# print("Rating = 5:", prior_prob(Y, 5))
# print("Rating = 6:", prior_prob(Y, 6))
# print("Rating = 7:", prior_prob(Y, 7))
# print("Rating = 8:", prior_prob(Y, 8))
# print("Rating = 9:", prior_prob(Y, 9))
# print("Rating = 10:", prior_prob(Y, 10))

In [24]:
def review_classifier(reviews, labels=None, n_classes=10):
    """Group reviews into per-rating buckets.

    Parameters
    ----------
    reviews : sequence of str
        Reviews to group.
    labels : sequence of int, optional
        Rating of each review, aligned with ``reviews`` by position. When
        omitted, the notebook-level training labels ``Y`` are used.
    n_classes : int, optional
        Number of rating buckets; valid ratings are ``1..n_classes``.

    Returns
    -------
    list of list
        ``result[r - 1]`` holds the reviews rated ``r``, in input order.
        Ratings with no reviews get an empty list.

    Raises
    ------
    ValueError
        If a rating lies outside ``1..n_classes``.
    IndexError
        If ``labels`` has fewer entries than ``reviews``.
    """
    if labels is None:
        labels = Y

    review_class = [[] for _ in range(n_classes)]
    for i, review in enumerate(reviews):
        label = int(labels[i])
        # Reject out-of-range ratings explicitly: a rating of 0 would otherwise
        # index bucket -1 and silently land in the last class.
        if not 1 <= label <= n_classes:
            raise ValueError(
                "rating %d at position %d is outside 1..%d" % (label, i, n_classes)
            )
        review_class[label - 1].append(review)
    return review_class

In [25]:
review_class = review_classifier(clean_reviews)

In [26]:
# Module-level state filled by create_class_vocab() and read by word_count()
# and cond_prob():
#   classwise_vocab[r - 1]      maps word -> occurrences in reviews rated r
#   sum_classwise_vocab[r - 1]  total number of word occurrences for rating r
classwise_vocab = [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}]
sum_classwise_vocab = []

def create_class_vocab(review_class):
    """Rebuild the per-rating word counts from reviews grouped by rating.

    Parameters
    ----------
    review_class : list of list of str
        ``review_class[r - 1]`` holds the cleaned reviews rated ``r`` for
        ``r`` in 1..10.

    Returns
    -------
    None
        The results are written into the module-level ``classwise_vocab`` and
        ``sum_classwise_vocab``. Totals are stored as plain ints.
    """
    # Start from empty state, mutating the existing objects in place, so that
    # calling this again replaces the counts instead of doubling them and
    # appending ten more totals.
    for class_vocab in classwise_vocab:
        class_vocab.clear()
    del sum_classwise_vocab[:]

    for c in range(1, 11):
        class_vocab = classwise_vocab[-1+c]
        for review in review_class[-1+c]:
            for w in tk.tokenize(review):
                class_vocab[w] = class_vocab.get(w, 0) + 1
        sum_classwise_vocab.append(sum(class_vocab.values()))

In [27]:
create_class_vocab(review_class)

In [28]:
def word_count(xi, c):
    """Return how often word ``xi`` was counted for rating ``c``.

    The lookup goes to ``classwise_vocab[c - 1]``; a word that has no entry
    there counts as 0.
    """
    count = classwise_vocab[c - 1].get(xi)
    return 0 if count is None else count

In [29]:
def cond_prob(review, c):
    """Return the add-one-smoothed log-likelihood ``log P(review | Y = c)``.

    Each token ``w`` of ``review`` contributes
    ``log((count_c(w) + 1) / (total_c + vocab_size_c))`` where ``count_c`` comes
    from ``word_count``, ``total_c`` is ``sum_classwise_vocab[c - 1]`` and
    ``vocab_size_c`` is ``len(classwise_vocab[c - 1])``.

    Parameters
    ----------
    review : str
        Cleaned review text; it is tokenised with ``tk``.
    c : int
        Rating (1-based) to condition on.

    Returns
    -------
    float
        Sum of the per-token log probabilities; ``0.0`` for a review with no
        tokens; ``-inf`` when rating ``c`` has no training words at all but
        the review is non-empty.
    """
    words = tk.tokenize(review)

    # The denominator depends only on the class, so compute it once per call.
    # NOTE(review): this uses the class's own vocabulary size; textbook Laplace
    # smoothing uses the size of the vocabulary over all classes - confirm
    # which one is intended.
    sum_vocab_count = sum_classwise_vocab[-1+c] + len(classwise_vocab[-1+c])
    if sum_vocab_count == 0:
        # A class that was never observed cannot explain any word. Returning
        # 0.0 here would mean probability 1, the best possible score.
        return -np.inf if words else 0.0

    denominator = float(sum_vocab_count)
    prod = 0.0
    for word in words:
        prod = prod + np.log((word_count(word, c) + 1) / denominator)
    return prod

In [30]:
# Log prior of every rating 1..10, stored at index rating - 1.
prior_probs = [prior_prob(Y, rating) for rating in range(1, 11)]

print(prior_probs)

[-1.5896352851379207, -2.392947533074437, -2.3351082846996056, -2.22710663181814, -inf, -inf, -2.3041863743610196, -2.117268027220293, -2.4021844582168006, -1.6645278787520603]


  after removing the cwd from sys.path.


In [31]:
def predict(review):
    """Predict the rating of one cleaned review with the Naive Bayes model.

    For every rating ``c`` in ``1..len(prior_probs)`` the log posterior score
    ``prior_probs[c - 1] + cond_prob(review, c)`` is computed and the rating
    with the highest score is returned.

    Parameters
    ----------
    review : str
        Review text already passed through ``clean_review``.

    Returns
    -------
    int
        Predicted rating (1-based). On ties the lowest rating wins; if every
        rating has a ``-inf`` prior, 1 is returned.
    """
    posterior_probs = []

    for c in range(1, len(prior_probs) + 1):
        prior = prior_probs[-1+c]
        if np.isneginf(prior):
            # Rating never seen in training (5 and 6 in this dataset): it can
            # never be predicted, so skip the likelihood computation.
            posterior_probs.append(-np.inf)
            continue
        posterior_probs.append(prior + cond_prob(review, c))

    # argmax is 0-based while ratings start at 1.
    return np.argmax(np.array(posterior_probs)) + 1

In [32]:
predict(clean_reviews[1])

8

In [33]:
# Accuracy on Training Set
preds = np.array([predict(clean_reviews[idx]) for idx in range(Y.shape[0])])
n_correct = np.sum(preds == Y)
accuracy = n_correct/Y.shape[0]
print(accuracy*100)
# Recorded results (percent) for earlier preprocessing variants:
# 70.364 ps
# 74.328 ss
# 75.856 lemma & add |V|
# 71.44 ss + |V|

75.856


In [34]:
preds[:100]

array([10,  8,  3,  8, 10,  8, 10,  3, 10,  1,  8,  8, 10,  8,  8,  8, 10,
        8,  8,  7,  8,  7,  8,  9,  8, 10, 10, 10,  7,  9,  9,  7,  1, 10,
       10, 10,  3,  9, 10,  1, 10,  9, 10,  8,  8, 10,  9, 10, 10,  8,  8,
        7,  8,  8,  7,  7, 10, 10,  7, 10,  2,  9, 10,  8,  7, 10, 10, 10,
        7, 10, 10,  7,  8,  8, 10,  1, 10,  8, 10, 10,  9,  9,  8, 10,  9,
        7,  9, 10, 10,  7,  7, 10,  1,  7,  1, 10,  8,  8, 10, 10])

In [35]:
Y[:100]

array([10,  8,  7,  8,  8,  8, 10,  9, 10,  8,  8,  8, 10,  8,  8,  8,  9,
       10,  7,  7,  8,  7,  8,  9,  8,  7, 10, 10,  7,  9,  9,  7, 10, 10,
       10, 10,  7,  9, 10,  9, 10,  9, 10,  8,  8, 10,  8, 10,  8,  8,  8,
        7,  8,  8,  7,  7, 10, 10,  7, 10,  7,  9, 10,  8,  7, 10, 10, 10,
        7, 10, 10,  8,  8,  8, 10, 10, 10,  8, 10, 10,  9,  9,  8,  8,  9,
        7,  9, 10, 10,  7,  7, 10, 10,  7,  7, 10,  8,  8, 10, 10])

In [36]:
Y[3]

8

In [37]:
# Accuracy on Test Set (uses the already-cleaned reviews in clean_test)
preds = np.array([predict(clean_test[idx]) for idx in range(y_test.shape[0])])
n_correct = np.sum(preds == y_test)
accuracy = n_correct/y_test.shape[0]
print(accuracy*100)
# Recorded results (percent) for earlier preprocessing variants:
# 33.64 ps
# 33.739999999999995 ss
# 35.504000000000005

35.532000000000004


In [38]:
np.unique(Y)

array([ 1,  2,  3,  4,  7,  8,  9, 10])

In [39]:
# Constant-prediction baselines for the test set: acc[r - 1] is the prediction
# vector that assigns rating r to every test review.
acc = []
for i in range(1, 11):
    # Size the vector from y_test so the later element-wise comparison stays
    # aligned if the test set size changes.
    acc.append(np.full(y_test.shape[0], i))
print(acc)

[array([1, 1, 1, ..., 1, 1, 1]), array([2, 2, 2, ..., 2, 2, 2]), array([3, 3, 3, ..., 3, 3, 3]), array([4, 4, 4, ..., 4, 4, 4]), array([5, 5, 5, ..., 5, 5, 5]), array([6, 6, 6, ..., 6, 6, 6]), array([7, 7, 7, ..., 7, 7, 7]), array([8, 8, 8, ..., 8, 8, 8]), array([9, 9, 9, ..., 9, 9, 9]), array([10, 10, 10, ..., 10, 10, 10])]


In [40]:
# Test accuracy (percent) of each constant-rating prediction vector in acc.
for rating, constant_pred in enumerate(acc, start=1):
    hits = np.sum(constant_pred == y_test)
    print("Rating %d =>"%rating, 100*hits/y_test.shape[0])

Rating 1 => 20.088
Rating 2 => 9.208
Rating 3 => 10.164
Rating 4 => 10.54
Rating 5 => 0.0
Rating 6 => 0.0
Rating 7 => 9.228
Rating 8 => 11.4
Rating 9 => 9.376
Rating 10 => 19.996


In [41]:
print(np.unique(y_test, return_counts=True))
print(np.unique(preds, return_counts=True))

(array([ 1,  2,  3,  4,  7,  8,  9, 10]), array([5022, 2302, 2541, 2635, 2307, 2850, 2344, 4999]))
(array([ 1,  2,  3,  4,  7,  8,  9, 10]), array([6118, 3570, 2548, 1997, 1954, 1453, 3114, 4246]))


In [42]:
# Per rating: number of training reviews and number of distinct words.
for class_reviews, class_vocab in zip(review_class, classwise_vocab):
    print(len(class_reviews), len(class_vocab))

5100 30201
2284 22729
2420 24729
2696 26925
0 0
0 0
2496 26568
3009 28441
2263 24473
4732 30781


In [43]:
type(classwise_vocab[1])

dict