In [1]:
# Author: Luke Kumar

In [70]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import movie_reviews
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

In [68]:
pd.set_option('display.max_colwidth', -1)

## Load Data

In [43]:
documents = [(' '.join(list(movie_reviews.words(fileid))), category) \
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [44]:
documents = pd.DataFrame(documents, columns=['review', 'category'])

In [45]:
documents.head(2)

Unnamed: 0,review,category
0,"plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what ' s the deal ? watch the movie and "" sorta "" find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it ' s simply too jumbled . it starts off "" normal "" but then downshifts into this "" fantasy "" world in which you , as an audience member , have no idea what ' s going on . there are dreams , there are characters coming back from the dead , there are others who look like the dead , there are strange apparitions , there are disappearances , there are a looooot of chase scenes , there are tons of weird things that happen , and most of it is simply not explained . now i personally don ' t mind trying to unravel a film every now and then , but when all it does is give me the same clue over and over again , i get kind of fed up after a while , which is this film ' s biggest problem . it ' s obviously got this big secret to hide , but it seems to want to hide it completely until its final five minutes . and do they make things entertaining , thrilling or even engaging , in the meantime ? not really . the sad part is that the arrow and i both dig on flicks like this , so we actually figured most of it out by the half - way point , so all of the strangeness after that did start to make a little bit of sense , but it still didn ' t the make the film all that more entertaining . i guess the bottom line with movies like this is that you should always make sure that the audience is "" into it "" even before they are given the secret password to enter your world of understanding . i mean , showing melissa sagemiller running away from visions for about 20 minutes throughout the movie is just plain lazy ! ! okay , we get it . . . there are people chasing her and we don ' t know who they are . do we really need to see it over and over again ? how about giving us different scenes offering further insight into all of the strangeness going down in the movie ? apparently , the studio took this film away from its director and chopped it up themselves , and it shows . there might ' ve been a pretty decent teen mind - fuck movie in here somewhere , but i guess "" the suits "" decided that turning it into a music video with little edge , would make more sense . the actors are pretty good for the most part , although wes bentley just seemed to be playing the exact same character that he did in american beauty , only in a new neighborhood . but my biggest kudos go out to sagemiller , who holds her own throughout the entire film , and actually has you feeling her character ' s unraveling . overall , the film doesn ' t stick because it doesn ' t entertain , it ' s confusing , it rarely excites and it feels pretty redundant for most of its runtime , despite a pretty cool ending and explanation to all of the craziness that came before it . oh , and by the way , this is not a horror or teen slasher flick . . . it ' s just packaged to look that way because someone is apparently assuming that the genre is still hot with the kids . it also wrapped production two years ago and has been sitting on the shelves ever since . whatever . . . skip it ! where ' s joblo coming from ? a nightmare of elm street 3 ( 7 / 10 ) - blair witch 2 ( 7 / 10 ) - the crow ( 9 / 10 ) - the crow : salvation ( 4 / 10 ) - lost highway ( 10 / 10 ) - memento ( 10 / 10 ) - the others ( 9 / 10 ) - stir of echoes ( 8 / 10 )",neg
1,"the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . little do they know the power within . . . going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . we don ' t know why the crew was really out in the middle of nowhere , we don ' t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don ' t know why donald sutherland is stumbling around drunkenly throughout . here , it ' s just "" hey , let ' s chase these people around with some robots "" . the acting is below average , even from the likes of curtis . you ' re more likely to get a kick out of her work in halloween h20 . sutherland is wasted and baldwin , well , he ' s acting like a baldwin , of course . the real star here are stan winston ' s robot design , some schnazzy cgi , and the occasional good gore shot , like picking into someone ' s brain . so , if robots and body parts really turn you on , here ' s your movie . otherwise , it ' s pretty much a sunken ship of a movie .",neg


In [46]:
documents.tail(2)

Unnamed: 0,review,category
1998,"steven spielberg ' s second epic film on world war ii is an unquestioned masterpiece of film . spielberg , ever the student on film , has managed to resurrect the war genre by producing one of its grittiest , and most powerful entries . he also managed to cast this era ' s greatest answer to jimmy stewart , tom hanks , who delivers a performance that is nothing short of an astonishing miracle . for about 160 out of its 170 minutes , "" saving private ryan "" is flawless . literally . the plot is simple enough . after the epic d - day invasion ( whose sequences are nothing short of spectacular ) , capt . john miller ( hanks ) and his team are forced to search for a pvt . james ryan ( damon ) , whose brothers have all died in battle . once they find him , they are to bring him back for immediate discharge so that he can go home . accompanying miller are his crew , played with astonishing perfection by a group of character actors that are simply sensational . barry pepper , adam goldberg , vin diesel , giovanni ribisi , davies , and burns are the team sent to find one man , and bring him home . the battle sequences that bookend the film are extraordinary . literally . there is nothing in film that has ever been recorded that will prepare you for the sheer onslaught of terrorizing violence in the film ' s first 20 minutes . spielberg films almost the entire movie without music , leaving it up to the characters to generate emotion , and they do to perfection . the sequences in france , all of them , beginning with the battle and ending with the battle , are fabulous , especially the dialogues between the men as they walk through the hills and countrysides , trying to save private ryan . there are no words i can use to describe the true horror and power of these sequences . this is what coppola was looking for in "" apocalypse now "" , but couldn ' t create . the sheer horror of these sequences all but condemn war . the performance by hanks as the leader of this gang is also extraordinary . he is head and shoulders above of the rest of the actors in the world , with his comic timing , dramatic flair , his quiet emotion that stirs an entire nation to tears . hanks is this country ' s finest actor , and he proves it here . however , spielberg almost destroys his own masterpiece . with a chance to make it the one of the greatest films of all time , spielberg creates 10 minutes of purely worthless film . the sequence involving army chief - of - stafff george marshall and mrs . ryan is decent , but doesn ' t hold up to the rest of the film , relying on wartime cliches to power it . but that is forgivable . what isn ' t is the bookends of the film , the cemetary sequences . the first one is quite good , a decent introduction into the lives of these men . the last sequence is atrocious . the forced emotion , accompanied by a ridiculous piece of music , is simply horrible compared to the rest of the magical film . these flaws are what downgrade "" ryan "" from the greatest film of our era , to the greatest war film of our era . spielberg should have trusted his own material , and he should have trusted hanks to deliver the most chilling line of the movie , to end his masterpiece right there . the use of the flag , though patriotic , is in contrast to the movie ' s theme . the power of the bulk of the film , however , is astonishing . spielberg has truly made a wondrous work of art , that persists even after first viewing of the film , is extraordinary . this is the film of the year .",pos
1999,"truman ( "" true - man "" ) burbank is the perfect name for jim carrey ' s character in this film . president truman was an unassuming man who became known worldwide , in spite of ( or was it because of ) his stature . "" truman "" also recalls an era of plenty following a grim war , an era when planned communities built by government scientists promised an idyllic life for americans . and burbank , california , brings to mind the tonight show and the home of nbc . if hollywood is the center of the film world , burbank is , or was , the center of tv ' s world , the world where our protagonist lives . combine all these names and concepts into "" truman burbank , "" and you get something that well describes him and his artificial world . truman leads the perfect life . his town , his car , and his wife are picture perfect . his idea of reality comes under attack one day when a studio light falls from the sky . the radio explains that an overflying airplane started coming apart . . . but then why would an airplane be carrying a studio light ? the next day during the drive to work , the radio jams and he starts picking up a voice that exactly describes his movements . he is so distracted that he nearly hits a pedestrian . when the radio comes back to normal , the announcer warns listeners to drive carefully . his suspicion aroused , he wanders around the town square looking for other oddities . the world appears to be functioning properly until he enters an office building and tries to take the elevator . the elevator doors open up on a small lounge with people on coffee breaks . a grip sees truman him and quickly moves a paneled door , made to look like the back of an elevator , into place . two security guards grab him and throw him out . truman is really suspicious now . it gets even worse the next day when his wife , a nurse , describes an elevator accident in the building where he saw the lounge . "" it ' s best not to think about it , "" she says , trying vainly to change truman ' s memory . truman becomes determined to see who or what is behind this apparently elaborate hoax at his expense . at every turn he is stopped by an amazing coincidence that just happens to keep him in his own little town . his last hope is to quell his fear of the ocean and sail to the edge of the world . you know by now that truman ' s life is the subject of a television program . his actions are "" real "" but everything else is carefully scripted , from the death of his father to the choice of his wife . truman is determined to find out what the big hoax is . meanwhile , christof , the all - seeing creator of truman ' s world does his best to keep him unaware and happy . it ' s sort of like westworld told from the robots ' point of view , or jurassic park from the dinosaurs ' point of view . we root for the captive of the cage - world . our protagonist is counting on "" chaos theory "" to help him escape his elaborate trap . the story , written by andrew niccol ( writer / director of gattaca ) , introduces some interesting questions , such as the ethics of subjecting a person to this type of life , or the psychological impact of learning that your entire life has all been fake . although these questions came to mind , i don ' t think the film itself asked them . it certainly didn ' t address them or try to answer them . i was particularly disappointed that the film didn ' t deal more with the trauma of learning one ' s life is a tv show . carrey ' s performance at the end showed a smidgen of truman ' s pain , but i almost felt that he got over it too easily for the sake of the film ' s pacing . earlier in the movie i found myself wondering if it would be better for truman to find out the truth or whether i should root for him to be well . the two seemed exclusive of one another , but weir and niccol didn ' t see it that way . perhaps it ' s not fair to criticize a movie for what it isn ' t , but it seems like there were some missed opportunities here . but on its own terms , the movie is well made . sight , sound and pacing are all handled competently . much of the first part of the movie is the truman show . the scenes are all apparently shot from hidden cameras , with snoots and obstructions covering the corners of the screen . one hidden camera is apparently in his car radio , the green led numbers obscuring the lower part of the screen . the music is well - chosen and scored . the film opens with what sounds like family drama theme music , when truman ' s world is still beautiful and perfect . when the movie ends , the score sounds more like a frantic , driven , tangerine dream opus , while still keeping the same timbre . philip glass ' epic music ( from powaqqatsi ) permeates truman ' s scenes of suspicion and awakening . ( glass has a small cameo as a keyboardist for the show . ) and the pacing of the story was brisk . there was no unnecessarily long setup explaining the concept behind the truman show , just a few quick title cards , a few interviews , and then right into the show , and the movie . one of the first scenes is of the studio light falling ; there was no token scene of truman ' s idyllic life before it falls apart , because it wasn ' t necessary , we pick up the story at the first sign of trouble , and no sooner . there ' s also no point in the movie where the plot slows down . it ' s a quick , straight shot to the movie ' s end . in terms of overall quality , i would compare the truman show to niccol ' s gattaca . both films are well made with interesting stories set in interesting worlds . but neither film really felt like it capitalized on all the great ideas ; neither film "" clicked "" and became an instant classic . nevertheless , i look forward to niccol ' s next film , whatever it may be .",pos


## Train Test Split

In [47]:
X_train, X_test, Y_train, Y_test = train_test_split(documents.review.values,\
                                                    documents.category.values, test_size=0.25)

In [48]:
print('train on %d instances, test on %d instances' % (X_train.shape[0], X_test.shape[0]))

train on 1500 instances, test on 500 instances


In [49]:
np.unique(Y_train, return_counts=True)

(array(['neg', 'pos'], dtype=object), array([758, 742]))

In [50]:
np.unique(Y_test, return_counts=True)

(array(['neg', 'pos'], dtype=object), array([242, 258]))

## Encode Features - Bag of Words/Unigram

In [51]:
count_vec = CountVectorizer(lowercase=True, ngram_range=(1,1), stop_words='english')

In [52]:
X_train = count_vec.fit_transform(X_train)

## Classifier

In [53]:
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Train

In [54]:
classifier.fit(X=X_train, y=Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Prediction

In [55]:
predictions = classifier.predict(X=count_vec.transform(X_test))

In [56]:
predictions_probs = classifier.predict_proba(X=count_vec.transform(X_test))

In [57]:
print("Test Accuracy: ", accuracy_score(y_pred=predictions, y_true=Y_test))

Test Accuracy:  0.816


In [58]:
predictions_probs[0:10]

array([[8.20832770e-01, 1.79167230e-01],
       [5.49817457e-01, 4.50182543e-01],
       [2.50099774e-02, 9.74990023e-01],
       [7.06973545e-02, 9.29302645e-01],
       [2.33632322e-02, 9.76636768e-01],
       [3.96300188e-01, 6.03699812e-01],
       [8.51696008e-01, 1.48303992e-01],
       [5.98158014e-01, 4.01841986e-01],
       [9.99885633e-01, 1.14367434e-04],
       [9.90690050e-01, 9.30995018e-03]])

In [59]:
Y_train[0:10]

array(['pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos',
       'neg'], dtype=object)

In [60]:
predictions[0:10]

array(['neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg',
       'neg'], dtype=object)

In [61]:
print("Test AUC ROC ", roc_auc_score(y_score=predictions_probs[:, 0], y_true=Y_test=='neg'))

Test AUC ROC  0.9024120699596386


In [65]:
print(classification_report(y_pred=predictions, y_true=Y_test, target_names=np.unique(Y_test).tolist()))

              precision    recall  f1-score   support

         neg       0.80      0.82      0.81       242
         pos       0.83      0.81      0.82       258

   micro avg       0.82      0.82      0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500



## Visualizing Influential Words

In [62]:
sorted_index = np.argsort(classifier.coef_)

### Negative

In [22]:
np.array(count_vec.get_feature_names())[sorted_index][0, 0:20]

array(['bad', 'worst', 'waste', 'supposed', 'unfortunately', 'boring',
       'reason', 'script', 'looks', 'awful', 'poor', 'attempt', 'plot',
       'cheap', 'hurlyburly', 'potential', 'dull', 'given', 'ridiculous',
       'stupid'], dtype='<U44')

### Positve

In [23]:
np.array(count_vec.get_feature_names())[np.flip(sorted_index)][0, 0:20]

array(['fun', 'quite', 'great', 'true', 'performances', 'excellent',
       'works', 'enjoyable', 'definitely', 'matrix', 'overall',
       'different', 'political', 'hilarious', 'perfectly', 'pace',
       'horror', 'entertaining', 'today', 'memorable'], dtype='<U44')