In [251]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

# Q1: Explore and clean the data

In [252]:
#Reading the data
rotten = pd.read_csv('rotten-tomatoes.csv.bz2')

__1.1 Take a look at a few lines of data__

In [253]:
#Validating the data
rotten.head()

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709,http://www.timeout.com/film/reviews/87745/toy-...,Time Out,"So ingenious in concept, design and execution ...",2009-10-04 00:00:00,9559,Toy Story
1,Richard Corliss,fresh,114709,"http://www.time.com/time/magazine/article/0,91...",TIME Magazine,The year's most inventive comedy.,2008-08-31 00:00:00,9559,Toy Story
2,David Ansen,fresh,114709,http://www.newsweek.com/id/104199,Newsweek,A winning animated feature that has something ...,2008-08-18 00:00:00,9559,Toy Story
3,Leonard Klady,fresh,114709,http://www.variety.com/review/VE1117941294.htm...,Variety,The film sports a provocative and appealing st...,2008-06-09 00:00:00,9559,Toy Story
4,Jonathan Rosenbaum,fresh,114709,http://onfilm.chicagoreader.com/movies/capsule...,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10 00:00:00,9559,Toy Story


__1.2 Print out all variable names.__

In [254]:
#Printing out the variable names
rotten.columns

Index(['critic', 'fresh', 'imdb', 'link', 'publication', 'quote',
       'review_date', 'rtid', 'title'],
      dtype='object')

In [255]:
#Checking the shape
rotten.shape

(13442, 9)

__1.3 Create a summary table__

In [256]:
# Part a
missing_fresh = rotten.fresh.isnull().sum()
missing_quote = rotten.quote.isnull().sum()
print("PART A")
print("Number of missings in fresh:", missing_fresh)
print("Number of missings in quote:", missing_quote,"\n")

# Part b
values = rotten.fresh.value_counts()
print("Part B")
print("All different values in fresh:\n",values, "\n")


# Part c
percentage_fresh = (8389 * 100)/(8389 + 5030)
percentage_rotten = (5030/(8389+5030))*100

print("Part C")
print("percentage of fresh:\n", percentage_fresh)
print("percentage of rotten:\n", percentage_rotten, "\n")


# Part d
# Check white spaces
print("PART D")
print("Number of white spaces quotes are:", sum(rotten.quote == '\\s+'))
print("Number of zero length quotes are:", len(rotten.quote) == 0, "\n")

# Part e
# Find minimum-maximum-average length of quotes
min_len = rotten.quote.str.len().min()
max_len = rotten.quote.str.len().max()
mean_len = rotten.quote.str.len().mean()

print("Part E")
print("Number of minimum length quotes:", min_len)
print("Number of maximum length quotes:", max_len)
print("Number of average length quotes:", mean_len, "\n")

# Part f
print("Part F")
print("Number of reviews are in data multiple times:", rotten.duplicated(subset=None, keep='first').sum())

PART A
Number of missings in fresh: 0
Number of missings in quote: 0 

Part B
All different values in fresh:
 fresh     8389
rotten    5030
none        23
Name: fresh, dtype: int64 

Part C
percentage of fresh:
 62.51583575527238
percentage of rotten:
 37.48416424472762 

PART D
Number of white spaces quotes are: 0
Number of zero length quotes are: False 

Part E
Number of minimum length quotes: 4
Number of maximum length quotes: 256
Number of average length quotes: 121.23128998660914 

Part F
Number of reviews are in data multiple times: 596


__1.4 Clean the data__

In [257]:
#Making a funtion to clean the data
def clean(df):
    df.drop_duplicates(subset = None, keep = 'first', inplace = True)
    df.drop(df[df.fresh == 'none'].index, inplace=True)
    df.drop(list(df['quote'].filter(regex = '\\s+')), axis = 1, inplace = True)
    df.drop(df[df["quote"].str.len() == 0].index, axis = 1, inplace = True)

In [258]:
#Cleaning our data
clean(rotten)

In [259]:
#Checking the final number of rows and columns
rotten.shape

(12823, 9)

# Q2: Naïve Bayes 

__Split your data into training and validation and convert your data into bag of words__

In [260]:
#Splitting data into training, validation and test
target = rotten.fresh

X_train, X_test, y_train, y_test = train_test_split(rotten, target, test_size=0.2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(10258, 9)
(10258,)
(2565, 9)


In [261]:
X_train.head()

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
13092,Trevor Johnston,fresh,151568,http://www.timeout.com/film/reviews/79592/tops...,Time Out,"Leigh's cast are beyond compare, and the whole...",2006-02-09 00:00:00,13407,Topsy-Turvy
12421,Variety Staff,rotten,94012,http://www.variety.com/review/VE1117795083.htm...,Variety,Mel Brooks will do anything for a laugh. Unfor...,2009-03-26 00:00:00,10223,Spaceballs
7984,,rotten,88847,http://www.timeout.com/film/reviews/68404/the_...,Time Out,"An iconic movie of the '80s, with all the unap...",2006-06-24 00:00:00,14900,The Breakfast Club
148,Hal Hinson,fresh,113161,http://www.washingtonpost.com/wp-srv/style/lon...,Washington Post,Irresistibly charming.,2000-01-01 00:00:00,13680,Get Shorty
1725,James Berardinelli,rotten,110475,http://www.reelviews.net/php_review_template.p...,ReelViews,"As a comedy, The Mask is genial, but its recyc...",2000-01-01 00:00:00,11568,The Mask


In [263]:
y_train.head()

13092     fresh
12421    rotten
7984     rotten
148       fresh
1725     rotten
Name: fresh, dtype: object

In [80]:
y_test.head()

3769    rotten
1279     fresh
6066    rotten
246      fresh
7672     fresh
Name: fresh, dtype: object

In [264]:
#Validating the data
X_test.tail()

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
8281,Roger Ebert,rotten,97523,http://www.rogerebert.com/reviews/honey-i-shru...,Chicago Sun-Times,"The special effects are all there, nicely in p...",2000-01-01 00:00:00,10611,"Honey, I Shrunk the Kids"
2810,James Berardinelli,fresh,116282,http://www.reelviews.net/php_review_template.p...,ReelViews,It's easy to admire what the Coens are trying ...,2000-01-01 00:00:00,104762093,Fargo
12013,Variety Staff,fresh,57076,http://www.variety.com/review/VE1117791143.htm...,Variety,"From Russia with Love is a preposterous, skill...",2008-04-15 00:00:00,10737,From Russia With Love
1672,Mick LaSalle,rotten,110167,http://www.sfgate.com/cgi-bin/article.cgi?f=/c...,San Francisco Chronicle,The nastiness in the movie feels derived from ...,2000-01-01 00:00:00,11028,It Could Happen to You
12767,Roger Ebert,fresh,101393,http://www.rogerebert.com/reviews/backdraft-1991,Chicago Sun-Times,[The scenes involving fire] are so good they m...,2000-01-01 00:00:00,13033,Backdraft


In [265]:
#Validating the test data
y_test.tail()

8281     rotten
2810      fresh
12013     fresh
1672     rotten
12767     fresh
Name: fresh, dtype: object

In [266]:
#Creating bag of words for fresh and rotten

f = X_train[X_train['fresh']=='fresh']
r = X_train[X_train['fresh']=='rotten']

#Fresh
vectorizer1 = CountVectorizer(binary=True, stop_words='english')
X = vectorizer1.fit_transform(f.quote.values)
words = vectorizer1.get_feature_names()

#Rotten
vectorizer2 = CountVectorizer(binary=True, stop_words='english')
X2 = vectorizer2.fit_transform(r.quote.values)
words2 = vectorizer2.get_feature_names()

__2.4 Compute the unconditional (log) probability that the tomato is fresh/rotten, log Pr(F), and log Pr(R).__

In [267]:
Pr_f = np.log(len(rotten[rotten['fresh'] == 'fresh']) / len(rotten['fresh'])) 
Pr_r = np.log(len(rotten[rotten['fresh'] == 'rotten']) / len(rotten['fresh']))
print(Pr_f)
print(Pr_r)

-0.4774397716872286
-0.968557258351698


In [269]:
X.shape[0] 

6375

__2.5 For each word w, compute log Pr(w|F) and log Pr(w|R), the (log) probability that the word is present in a fresh/rotten review__

In [271]:
#Making log probability matrix for Fresh and Rotten vectors
#Fresh
mat1 = np.sum(X, axis=0)
Pr_matrix1 = [[np.log(x/X.shape[0])] for x in mat1]

#Rotten
mat2 = np.sum(X2, axis=0)
Pr_matrix2 = [np.log(x/X2.shape[0]) for x in mat2]


print(Pr_matrix1)
print(Pr_matrix2)

[[matrix([[-8.06699219, -8.76013937, -6.68069783, ..., -8.76013937,
         -8.76013937, -8.06699219]])]]
[matrix([[-8.26436333, -7.57121615, -5.625306  , ..., -8.26436333,
         -8.26436333, -8.26436333]])]


__2.6 For both destination classes, F and R, compute the log-likelihood that the quote belongs to this class__

In [274]:
#Making bag of words of validation set
vec = CountVectorizer(binary=True, stop_words='english')
X3 = vec.fit_transform(X_test.quote.values)
words3 = vec.get_feature_names()

In [273]:
#Validating the data
print(len(words))  # length of fresh
print(len(words2)) # length of rotton
print(len(words3)) # length of validation

14306
11299
8972


In [277]:
#Classifying the validation data
def classify(X3, words, Pr_matrix1, words2, Pr_matrix2, words3):
    dict_quotes = {}
    for i in range(X3.shape[0]):
        sum_f = 0
        sum_r = 0
        for j in range(X3[i].shape[1]):
            if(X3[i,j] == 1):
                #Checking if the word in validation is present in Fresh set of words
                if(words3[j] in words):
                    sum_f += Pr_matrix1[0][0][0,words.index(words3[j])]

                #Checking if the word in validation is present in Rotten set of words
                if(words3[j] in words2):
                    sum_r += Pr_matrix2[0][0][0,words2.index(words3[j])]

        #Calculating likelihood of the quote
        likelihood_f = Pr_f + sum_f
        likelihood_r = Pr_r + sum_r
        #Classifying as fresh or rotten
        if(likelihood_f >= likelihood_r):
            dict_quotes[i] = 'fresh'
        else:
            dict_quotes[i] = 'rotten'
    return(dict_quotes)       

In [278]:
dict_quotes = classify(X3, words, Pr_matrix1, words2, Pr_matrix2, words3)
print(len(y_test))
len(dict_quotes.values())

2565


2565

__2.7 Print the resulting confusion matrix and accuracy (feel free to use existing libraries).__

In [279]:
#Printing confusion matrix of predicted values and actual values
confusion_matrix(y_test, list(dict_quotes.values()))

array([[662, 918],
       [403, 582]])

In [280]:
#Testing the accuracy
accuracy_score(y_test, list(dict_quotes.values()))

0.484990253411306

# Q3: Interpretation

__3.1 Extract from your conditional probability vectors log Pr(wjF) and log Pr(wjR) the probabilities that correspond to frequent words only__

In [281]:
#Making a dictionary of frequent Fresh words and sorted them according to the log probabilities
F_frequent = {}
for i, key in enumerate(vectorizer1.vocabulary_):
    if(vectorizer1.vocabulary_[key] > 30):
        F_frequent[key] = Pr_matrix1[0][0][0,i]
F_frequent = sorted(F_frequent.items(), key=lambda x: x[1])


In [282]:
#Making a dictionary of frequent Rotten words and sorted them according to the log probabilities
R_frequent = {}
for i, key in enumerate(vectorizer2.vocabulary_):
    if(vectorizer2.vocabulary_[key] > 30):
        R_frequent[key] = Pr_matrix2[0][0][0,i]
R_frequent = sorted(R_frequent.items(), key=lambda x: x[1])


__3.2 Find 10 best words to predict F and 10 best words to predict R.__

In [283]:
print("10 best words for F:")
for i in range(10):
    print(F_frequent[i][0])
print("")
print("10 best words for R:")
for i in range(10):
    print(R_frequent[i][0])

10 best words for F:
cast
bighearted
celebration
entertainer
lot
stands
cinema
treasures
charming
babe

10 best words for R:
mel
does
spaceballs
parody
star
wars
adventures
funny
80s
unappealing


__3.3 Print out a few missclassified quotes. Can you understand why these are misclassified?__

In [284]:
#Some of the misclassified rotten quotes
for i in range(200):
    if(F_frequent[0][0] in rotten[rotten['fresh']=='rotten'].iloc[i].quote or
       F_frequent[1][0] in rotten[rotten['fresh']=='rotten'].iloc[i].quote or
       F_frequent[2][0] in rotten[rotten['fresh']=='rotten'].iloc[i].quote or
       F_frequent[3][0] in rotten[rotten['fresh']=='rotten'].iloc[i].quote or
       F_frequent[4][0] in rotten[rotten['fresh']=='rotten'].iloc[i].quote or
       F_frequent[5][0] in rotten[rotten['fresh']=='rotten'].iloc[i].quote):
        print(rotten[rotten['fresh']=='rotten'].iloc[i].quote)
        print("")
    

An empty triumph of overkill set design and weirdo casting.

Pfieffer is absurdly miscast: Sly Stallone would make a more plausible Mr. Chips than the frail, squeaky actress does a nine-year veteran of the Marine Corps.

The film is overcome by the rumbling workings of a creaky plot as the story grows more serious.

The movie has everything a teenage boy could want... Everything, that is, but an interesting plot, decent dialogue and compelling acting.

You'd think that decree No. 1 for a movie about rules would be to know exactly what kind of picture you're making and selling. Georgia Rule fails that basic test, and a whole lot of other ones besides.

There isn't much of a story. The minimal plot exists exclusively to get the orangutan Dunston (played by "Sam") into as many odd, potentially-comic circumstances as possible.

This startlingly uneventful compendium of thick-headed boy-talk and female tolerance squanders a fine cast on incredibly ordinary characters and situations.



In [109]:
#Some of the misclassified fresh quotes
for i in range(100):
    if(R_frequent[0][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[1][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[2][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[3][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[4][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[5][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[6][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[7][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote or
       R_frequent[8][0] in rotten[rotten['fresh']=='fresh'].iloc[i].quote):
        print(rotten[rotten['fresh']=='fresh'].iloc[i].quote)
        print("")
    

As Lion King did before it, Toy Story revived the art of American children's animation, and ushered in a set of smart movies that entertained children and their parents. It's a landmark movie, and doesn't get old with frequent repetition.

The result is a visionary roller-coaster ride of a movie.

I can hardly imagine having more fun at the movies than I did at Toy Story, the miraculous new Disney feature that's the first full-length animated film to be produced entirely on computer.

This is simply the best American crime movie -- and indeed, one of the finest movies, period -- in over a decade.

Michael Mann and a superlative cast have taken a classic heist movie rife with familiar genre elements and turned it into a sleek, accomplished piece of work, meticulously controlled and completely involving.

Boosters and touts use the term 'major movie' so often that it's more likely to generate yawns than excitement at this point. Back to basics. Heat is a major movie. With major stars. Do

# Q4: NB with smoothing

In [285]:
#Classifying the validation data
def classify_with_alpha(X3, words, Pr_matrix1, words2, Pr_matrix2, words3):
    dict_quotes = {}
    for i in range(X3.shape[0]):
        sum_f = 0
        sum_r = 0
        for j in range(X3[i].shape[1]):
            if(X3[i,j] == 1):
                #Checking if the word in validation is present in Fresh set of words
                if(words3[j] in words):
                    sum_f += Pr_matrix1[0][0][0,words.index(words3[j])]
                else:
                    sum_f = 1

                #Checking if the word in validation is present in Rotten set of words
                if(words3[j] in words2):
                    sum_r += Pr_matrix2[0][0][0,words2.index(words3[j])]
                else:
                    sum_r = 1

        #Calculating likelihood of the quote
        likelihood_f = Pr_f + sum_f
        likelihood_r = Pr_r + sum_r
        #Classifying as fresh or rotten
        if(likelihood_f >= likelihood_r):
            dict_quotes[i] = 'fresh'
        else:
            dict_quotes[i] = 'rotten'
    return(dict_quotes) 

In [286]:
#Splitting data into training, validation and test
target = rotten.fresh

X_train, X_test, y_train, y_test = train_test_split(rotten, target, test_size=0.2)

In [287]:
def fit_model(X_train, y_train,X_test, alpha):
    f = X_train[X_train['fresh']=='fresh']
    r = X_train[X_train['fresh']=='rotten']

    #Fresh
    vectorizer1 = CountVectorizer(binary=True, stop_words='english')
    X = vectorizer1.fit_transform(f.quote.values)
    words = vectorizer1.get_feature_names()

    #Rotten
    vectorizer2 = CountVectorizer(binary=True, stop_words='english')
    X2 = vectorizer2.fit_transform(r.quote.values)
    words2 = vectorizer2.get_feature_names()
    Pr_f = np.log((len(rotten[rotten['fresh'] == 'fresh']) + alpha) / (len(rotten['fresh']) + alpha))
    Pr_r = np.log((len(rotten[rotten['fresh'] == 'rotten']) + alpha) / (len(rotten['fresh']) + alpha))
    mat1 = np.sum(X, axis=0)
    Pr_matrix1 = [[np.log((x + alpha)/(X.shape[0]+ alpha))] for x in mat1]

    #Rotten
    mat2 = np.sum(X2, axis=0)
    Pr_matrix2 = [np.log((x + alpha)/(X2.shape[0]+ alpha)) for x in mat2]

    #Making bag of words of validation set
    vec = CountVectorizer(binary=True, stop_words='english')
    X3 = vec.fit_transform(X_test.quote.values)
    words3 = vec.get_feature_names()

    dict_quotes = classify_with_alpha(X3, words, Pr_matrix1, words2, Pr_matrix2, words3)
    
    return(dict_quotes)

   ### Cross-Validate the accuracy on different values of alpha

In [291]:
# preparing kfold cross validation
kfold = KFold(n_splits=5, random_state=100, shuffle=False)

def cross_validate(X, alpha):
    accu = 0
    for train_index, test_index  in kf.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train1, y_test1 = y_train.iloc[train_index], y_train.iloc[test_index]
        
        dict_quotes = fit_model(X_train, y_train1,X_test, alpha)
        accu = accu + accuracy_score(y_test1, list(dict_quotes.values()))
        
    return accu   

In [292]:
accuracy = []
alpha_values = [0.3,0.5]
for alpha in alpha_values:
    accuracy.append(cross_validate(X_train, alpha))

TRAIN: [ 2052  2053  2054 ... 10255 10256 10257] TEST: [   0    1    2 ... 2049 2050 2051]
TRAIN: [    0     1     2 ... 10255 10256 10257] TEST: [2052 2053 2054 ... 4101 4102 4103]
TRAIN: [    0     1     2 ... 10255 10256 10257] TEST: [4104 4105 4106 ... 6153 6154 6155]
TRAIN: [    0     1     2 ... 10255 10256 10257] TEST: [6156 6157 6158 ... 8204 8205 8206]
TRAIN: [   0    1    2 ... 8204 8205 8206] TEST: [ 8207  8208  8209 ... 10255 10256 10257]
TRAIN: [ 2052  2053  2054 ... 10255 10256 10257] TEST: [   0    1    2 ... 2049 2050 2051]
TRAIN: [    0     1     2 ... 10255 10256 10257] TEST: [2052 2053 2054 ... 4101 4102 4103]
TRAIN: [    0     1     2 ... 10255 10256 10257] TEST: [4104 4105 4106 ... 6153 6154 6155]
TRAIN: [    0     1     2 ... 10255 10256 10257] TEST: [6156 6157 6158 ... 8204 8205 8206]
TRAIN: [   0    1    2 ... 8204 8205 8206] TEST: [ 8207  8208  8209 ... 10255 10256 10257]


In [295]:
mean_accuracy = [a/5 for a in accuracy]
print(mean_accuracy)
print("The mean accuracy is highest for following value of alpha:",0.5 )

[0.49892768515904856, 0.49931754870680684]
The mean accuracy is highest for following value of alpha: 0.5


#### After cross-validation, we found the best accuracy of 49.93 % when alpha is 0.5