In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import random

In [6]:
# read data
dta = pd.read_csv("~/ids703/Movie_Reviews/Data/movies.csv")

In [7]:
# rename columns
dta = dta.rename(columns = {
    "text": "review",
    "label": "sentiment"
})

In [8]:
dta["sentiment"].value_counts()

1    2505
0    2495
Name: sentiment, dtype: int64

In [9]:
# clean reviews for analysis
def cleanReview(rev):
    # Removing anything within a HTML tag
    edited_rev = re.compile(r"<[^>]+>").sub(" ", rev)
    # Removing Punctuation
    edited_rev = re.sub(r"[^\w\s]", " ", edited_rev)
    # Removing Numbers
    edited_rev = re.sub(r"[0-9]", " ", edited_rev)
    # Removing single characters
    edited_rev = re.sub(r"\s+[a-zA-Z]\s+", " ", edited_rev)
    # Removing multiple spaces
    edited_rev = re.sub(r"\s+", " ", edited_rev)

    return edited_rev


In [10]:
# replace 
replacement_patterns = [
  (r'won\'t', 'will not'),
  (r'can\'t', 'cannot'),
  (r'i\'m', 'i am'),
  (r'ain\'t', 'is not'),
  (r'(\w+)\'ll', '\g<1> will'),
  (r'(\w+)n\'t', '\g<1> not'),
  (r'(\w+)\'ve', '\g<1> have'),
  (r'(\w+)\'s', '\g<1> is'),
  (r'(\w+)\'re', '\g<1> are'),
  (r'(\w+)\'d', '\g<1> would')
]

class RegexpReplacer(object):
  def __init__(self, patterns=replacement_patterns):
    self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    
  def replace(self, text):
    s = text
    for (pattern, repl) in self.patterns:
      s = re.sub(pattern, repl, s)
    return s

In [11]:
review_list = []
rep = RegexpReplacer()

# Adding all of the movie reviews to a list
for r in dta["review"]:
    r=rep.replace(r)
    review_list.append([cleanReview(r)])

### Generate synthetic data

In [12]:
# separate dataframe into pos and neg reviews
pos_reviews = dta[dta["sentiment"] == 1].reset_index()
neg_reviews = dta[dta["sentiment"] == 0].reset_index()

In [13]:
# split reviews into strings
rep = RegexpReplacer()

total_pos = ""
total_pos_list = []
for pos_review in pos_reviews["review"]:
    r = rep.replace(pos_review)
    total_pos += cleanReview(pos_review)
    total_pos_list.append(cleanReview(pos_review))

total_neg = ""
total_neg_list = []
for neg_review in neg_reviews["review"]:
    r = rep.replace(neg_review)
    total_neg += cleanReview(neg_review)
    total_neg_list.append(cleanReview(neg_review))


In [14]:
# lower case + list of words
posWordlist = total_pos.lower().split(" ")
negWordlist = total_neg.lower().split(" ")

In [15]:
# count appearance of individual words
def counts(words):
    total_counts = {}
    for i in range(len(words)):
        if words[i] not in total_counts:
            total_counts[words[i]] = 1
        else:
            total_counts[words[i]] += 1
    return total_counts


In [16]:
pos_count = counts(posWordlist)
neg_count = counts(negWordlist)

In [17]:
# create dataframe for probability
posDF = pd.DataFrame(pos_count.items(), columns = ["word", "prob"])
negDF = pd.DataFrame(neg_count.items(), columns = ["word", "prob"])

In [18]:
posDF["prob"] = posDF["prob"] / sum(posDF["prob"])
negDF["prob"] = negDF["prob"] / sum(negDF["prob"])

In [19]:
# average length of positive reviews
pos_words = []
for i in total_pos_list:
    pos_words.append(i.split(" "))

p_len = []
for words in pos_words:
    p_len.append(len(words))

# average length of negative reviews
neg_words = []
for j in total_neg_list:
    neg_words.append(j.split(" "))

n_len = []
for words in neg_words:
    n_len.append(len(words))

In [20]:
# mean length of positive reviews
pos_mean = np.mean(p_len)

In [21]:
# mean length of negative reviews
neg_mean = np.mean(n_len)

In [22]:
posDF

Unnamed: 0,word,prob
0,the,0.061387
1,most,0.001605
2,interesting,0.000555
3,thing,0.000584
4,about,0.002844
...,...,...
28399,story_,0.000002
28400,peet,0.000004
28401,judah,0.000002
28402,domke,0.000002


In [23]:
# generate synthetic data
def generate_data(wordList, prob, mean):
    generated = []
    doc_length = np.random.poisson(mean)
    new_word = np.random.choice(wordList, doc_length, p = prob)
    generated.append(new_word)
    return " ".join(list(generated)[0])

In [24]:
check = posDF.sort_values(by = "prob", ascending=False)
check

Unnamed: 0,word,prob
0,the,0.061387
30,and,0.031989
84,of,0.027444
34,to,0.023321
8,is,0.020032
...,...,...
17637,gencebay,0.000002
17638,kurds,0.000002
17640,armenians,0.000002
17641,greeks,0.000002


In [25]:
generate_data(posDF["word"], posDF["prob"], pos_mean)

'stealing while charisma by nothing also posters between like atkinson the m heard as like this on was feel point robert part it movie sympathetic berry on if credit sorts of movie most reels uplifting change luck so kind another to the they but mostly never lacking scroll her both and bull way miss he the pretty of covers one on like whos lino at to on is just and dvd another of movie by rate ago viewers meena romance young that line be talent and those more of name will walking into spoil this and sister and can before books this vision fighting film better boys intervention it jewish to show the long of enjoyed the avenging to have lugosi bad the rent characters contains moon car nailed in was think of beggars if won for movie ted independent of recall movie problems manna that family you and ask boy over up affected resident to more insists by spoilers blonde embedded in makes what alyn the as movies is setting finds anyway was didn of take tony cinematic next so the is that he whi

In [26]:
random.seed(1209)
generated_pos = []
while len(generated_pos) < 2500:
    generated_pos.append(generate_data(posDF["word"], posDF["prob"], pos_mean))

generated_neg = []
while len(generated_neg) < 2500:
    generated_neg.append(generate_data(negDF["word"], negDF["prob"], neg_mean))
    

In [27]:
generated1 = pd.DataFrame({
    "review": generated_pos,
    "sentiment": [1] * 2500
})
generated0 = pd.DataFrame({
    "review": generated_neg,
    "sentiment": [0] * 2500
})


In [28]:
generated_reviews = pd.concat([generated1, generated0]).sample(frac = 1).reset_index()

In [29]:
generated_reviews

Unnamed: 0,index,review,sentiment
0,2218,big their was the one held uninteresting ever ...,0
1,186,allowing of most all thought the however get o...,0
2,2033,promise possible purpose then take popoca of t...,0
3,1120,program and or of story that and this characte...,1
4,652,at shows that what the course group to a chair...,0
...,...,...,...
4995,1552,in on pulls back ramsey sea not diminished jas...,1
4996,591,get his all the videos the upon so stances can...,1
4997,647,redford master become redford book waiting the...,0
4998,532,boys the in of its is of were songs the neilso...,1


In [30]:
# generated_reviews.to_csv("synthetic_reviews.csv", sep = "\t", index = False)

### Test model with generated data

In [31]:
generated_train_reviews = generated_reviews["review"][:4000]
generated_test_reviews = generated_reviews["review"][4000:]

In [32]:
cv = CountVectorizer()
#transformed train reviews
cv_generated_train = cv.fit_transform(generated_train_reviews)
#transformed test reviews
cv_generated_test = cv.transform(generated_test_reviews)

print('BOW_cv_train:',cv_generated_train.shape)
print('BOW_cv_test:',cv_generated_test.shape)

BOW_cv_train: (4000, 30360)
BOW_cv_test: (1000, 30360)


In [33]:
tv = TfidfVectorizer()
#transformed train reviews
tv_generated_train = tv.fit_transform(generated_train_reviews)
#transformed test reviews
tv_generated_test = tv.transform(generated_test_reviews)
print('Tfidf_train:',tv_generated_train.shape)
print('Tfidf_test:',tv_generated_test.shape)

Tfidf_train: (4000, 30360)
Tfidf_test: (1000, 30360)


In [34]:
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data = lb.fit_transform(generated_reviews['sentiment'])

In [35]:
generated_train_sentiments = sentiment_data[:4000]
generated_test_sentiments = sentiment_data[4000:]

In [36]:
naiveBayes = MultinomialNB()
#fitting the svm for bag of words
naiveBayes_bow = naiveBayes.fit(cv_generated_train, generated_train_sentiments)
print(naiveBayes_bow)
#fitting the svm for tfidf features
naiveBayes_tfidf = naiveBayes.fit(tv_generated_train, generated_train_sentiments)
print(naiveBayes_tfidf)

MultinomialNB()
MultinomialNB()


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [37]:
naiveBayes_bow_predict = naiveBayes.predict(cv_generated_test)
print(naiveBayes_bow_predict)
#Predicting the model for tfidf features
naiveBayes_tfidf_predict = naiveBayes.predict(tv_generated_test)
print(naiveBayes_tfidf_predict)

[0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 1
 0 0 0 0 1 1 1 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0
 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0
 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1
 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 1 1 1 0 0
 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 1 0
 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 1 0
 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0
 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0
 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0
 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0
 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 0 0 1 1 0 1 0 1 1 1 0 0 0 1 1 0 0 0 1 1 1 0
 0 0 0 1 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1
 0 0 1 1 0 0 0 1 1 0 1 1 

In [38]:
mnb_bow_scoreGEN = accuracy_score(generated_test_sentiments, naiveBayes_bow_predict)
print("mnb_bow_score :",mnb_bow_scoreGEN)
#Accuracy score for tfidf features
mnb_tfidf_scoreGEN = accuracy_score(generated_test_sentiments, naiveBayes_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_scoreGEN)

mnb_bow_score : 0.895
mnb_tfidf_score : 0.996


### Model

In [39]:
norm_train_reviews = dta["review"][:4000]
norm_train_reviews[0]

'I always wrote this series off as being a complete stink-fest because Jim Belushi was involved in it, and heavily. But then one day a tragic happenstance occurred. After a White Sox game ended I realized that the remote was all the way on the other side of the room somehow. Now I could have just gotten up and walked across the room to get the remote, or even to the TV to turn the channel. But then why not just get up and walk across the country to watch TV in another state? "Nuts to that", I said. So I decided to just hang tight on the couch and take whatever Fate had in store for me. What Fate had in store was an episode of this show, an episode about which I remember very little except that I had once again made a very broad, general sweeping blanket judgment based on zero objective or experiential evidence with nothing whatsoever to back my opinions up with, and once again I was completely right! This show is a total crud-pie! Belushi has all the comedic delivery of a hairy lightho

In [40]:
norm_test_reviews = dta["review"][4000:]
norm_test_reviews[4000]

"Bedrooms and Hallways was one of the funniest films of the 1999 Melbourne Film Festival. From the UK, it is about a young crowd of flatmates and their various relationship dilemmas. Much of the humour is centred around a new-agey men's self-help group where they pass around various implements like the 'rock of truth'. They also go on a 'hunter gatherer' weekend with hilarious results. Trust me, you'll laugh your teeth out."

In [41]:
# min_df = 0, max_df = 1, binary = False, ngram_range = (1, 3)
cv = CountVectorizer()

cv_train_reviews=cv.fit_transform(norm_train_reviews)
cv_test_reviews=cv.transform(norm_test_reviews)

In [42]:
print(cv_train_reviews)

  (0, 1307)	1
  (0, 34844)	1
  (0, 31441)	6
  (0, 27747)	1
  (0, 21786)	2
  (0, 2029)	1
  (0, 3088)	2
  (0, 6465)	1
  (0, 29809)	1
  (0, 11645)	1
  (0, 2995)	2
  (0, 16858)	1
  (0, 3158)	5
  (0, 34042)	4
  (0, 16457)	1
  (0, 15698)	7
  (0, 16584)	3
  (0, 1482)	9
  (0, 14474)	1
  (0, 4604)	4
  (0, 31356)	2
  (0, 21914)	1
  (0, 7959)	1
  (0, 31967)	1
  (0, 14183)	1
  :	:
  (3999, 669)	1
  (3999, 19031)	1
  (3999, 11893)	1
  (3999, 8733)	1
  (3999, 18878)	1
  (3999, 11890)	1
  (3999, 16709)	4
  (3999, 3046)	1
  (3999, 23467)	1
  (3999, 34716)	2
  (3999, 10923)	1
  (3999, 17223)	1
  (3999, 25974)	1
  (3999, 8152)	1
  (3999, 10239)	1
  (3999, 24195)	1
  (3999, 17665)	1
  (3999, 10336)	1
  (3999, 33528)	1
  (3999, 9526)	1
  (3999, 12979)	1
  (3999, 2369)	3
  (3999, 31037)	1
  (3999, 34649)	1
  (3999, 26550)	1


In [43]:
print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (4000, 35213)
BOW_cv_test: (1000, 35213)


In [44]:
# min_df=0,max_df=1,use_idf=True,ngram_range=(1,3)
tv=TfidfVectorizer()
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)

In [45]:
print(tv_train_reviews)

  (0, 10871)	0.03510121488714375
  (0, 11960)	0.06608615560046893
  (0, 15687)	0.07373343752842262
  (0, 2558)	0.07015457581447516
  (0, 8490)	0.08630360961589448
  (0, 29639)	0.03810026008947795
  (0, 4697)	0.060960552306862
  (0, 12054)	0.05462639726842176
  (0, 29965)	0.07710958610828132
  (0, 5542)	0.06371424032635811
  (0, 2981)	0.06271000702061023
  (0, 1286)	0.07710958610828132
  (0, 34313)	0.02871076366898703
  (0, 22497)	0.06608615560046893
  (0, 12588)	0.08630360961589448
  (0, 14396)	0.03932329750484502
  (0, 22234)	0.027368898900607053
  (0, 28746)	0.08630360961589448
  (0, 28788)	0.052344776946601806
  (0, 31455)	0.08223518940188826
  (0, 7218)	0.07528017910808209
  (0, 18563)	0.03436633614498327
  (0, 33296)	0.08630360961589448
  (0, 28662)	0.04904178027417391
  (0, 6758)	0.07015457581447516
  :	:
  (3999, 34270)	0.01964286041410002
  (3999, 12198)	0.0543409760538952
  (3999, 14009)	0.022193102274732153
  (3999, 1618)	0.029371165641405638
  (3999, 21504)	0.030533440386949

In [46]:
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (4000, 35213)
Tfidf_test: (1000, 35213)


In [47]:
train_sentiments = dta["sentiment"][:4000]
test_sentiments = dta["sentiment"][4000:]

### Naive Bayes

In [48]:
mnb = MultinomialNB()
#fitting the svm for bag of words
mnb_bow = mnb.fit(cv_train_reviews, train_sentiments)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf = mnb.fit(tv_train_reviews, train_sentiments)
print(mnb_tfidf)

MultinomialNB()
MultinomialNB()


In [49]:
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb.predict(tv_test_reviews)
print(mnb_tfidf_predict)

[1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 1 1 0 1 0
 0 0 0 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0
 0 0 0 1 0 1 1 1 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0
 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0
 1 0 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0
 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0
 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0
 0 1 1 0 0 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0
 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 0 1 0 1 0 0 1 0 0 1 

In [50]:
mnb_bow_score=accuracy_score(test_sentiments,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.736
mnb_tfidf_score : 0.814
