In [34]:
import pathlib
import sklearn
import numpy
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [35]:
# Resolve the dataset directory relative to the current working directory.
cwd = pathlib.Path.cwd()
movie_reviews_folder = cwd / 'movie_reviews_dataset'

# Sanity check: report the resolved path and whether it is actually on disk.
folder_found = movie_reviews_folder.exists()
print('path:', movie_reviews_folder)
print('this will print True if the folder exists:', folder_found)

path: /Users/soniadias/Desktop/classes/text mining for ai /ba-text-mining/lab_sessions/movie_reviews_dataset
this will print True if the folder exists: True


In [36]:
# Load every file under the dataset folder; all of it serves as training data.
movie_reviews_train = load_files(str(movie_reviews_folder))
print(f"Length of dataset: {len(movie_reviews_train.data)}")

# Tally the reviews per class index and report each tally under its class name.
label_names = movie_reviews_train.target_names
freqs = Counter(movie_reviews_train.target)
for category, frequency in freqs.items():
    print(label_names[category], frequency)

Length of dataset: 2000
neg 1000
pos 1000


In [37]:
# Bag-of-words vectorizer for the movie reviews.
movie_vec = CountVectorizer(
    min_df=2,  # ignore tokens that occur in fewer than 2 documents (document frequency, not total count)
    tokenizer=nltk.word_tokenize,  # tokenize with NLTK instead of the default regex
    token_pattern=None,  # not used when a tokenizer is supplied; None avoids the "will not be used" warning
    stop_words=stopwords.words('english'),  # remove NLTK's English stopwords
)
# Learn the vocabulary and build the document-term count matrix in one step.
movie_counts = movie_vec.fit_transform(movie_reviews_train.data)




In [38]:
# Vocabulary size, followed by a peek at the first 100 feature names.
feature_names = movie_vec.get_feature_names_out()
print(len(movie_vec.vocabulary_))
print(list(feature_names[:100]))

25139
['\x14', '\x16', '!', '#', '$', '%', '&', "'", "''", "'40s", "'50s", "'60s", "'70s", "'80s", "'90", "'90s", "'94", "'96", "'97", "'98", "'being", "'bout", "'carry", "'cause", "'comedy", "'d", "'dark", "'do", "'em", "'end", "'funny", "'ghetto", "'good", "'if", "'ll", "'m", "'matron", "'n", "'new", "'no", "'normal", "'nuff", "'psycho", "'re", "'romeo", "'round", "'rumble", "'s", "'secret", "'star", "'straight", "'t", "'the", "'three", "'til", "'till", "'ve", "'what", "'when", "'you", '(', ')', '*', '+', '+2', '+3', '+4', ',', '-', '--', '-1', '-4', '-and', '-awarded', '-but', '-ed', '-esque', '-give', '-it', '-jack', '-like', '-lite', '-reviewed', '-ridden', '-so', '-type', '.', '/', '//filmfreakcentral', '//www', '0', '00', '000', '007', '05', '1', '1/10', '1/2', '1/25th', '1/29/97']


In [39]:
# Re-weight the raw term counts as TF-IDF values.
# The fitted transformer is kept so the same weights can be applied to new text later.
tfidf_transformer = TfidfTransformer().fit(movie_counts)
movie_tfidf = tfidf_transformer.transform(movie_counts)

In [40]:
# Training the Naive Bayes classifier: model import and train/test split.

# Multinomial Naive Bayes is the model used for classification.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation. The fixed random_state makes the
# split, and therefore every score computed from it, reproducible after a
# kernel restart.
# NOTE(review): the vocabulary and IDF weights were fitted on all reviews
# before this split, so the held-out score is likely somewhat optimistic.
docs_train, docs_test, y_train, y_test = train_test_split(
    movie_tfidf,                 # the TF-IDF matrix
    movie_reviews_train.target,  # the category value of each review
    test_size=0.20,              # 80% for training, 20% for development
    random_state=42,             # fixed seed for a repeatable split
)


In [41]:
# Fit a Multinomial Naive Bayes classifier on the training split.
nb_model = MultinomialNB()
clf = nb_model.fit(docs_train, y_train)

In [42]:
# Label the held-out split with the trained classifier; recall is computed from these predictions.
y_pred = clf.predict(X=docs_test)

In [43]:
# Import the metric explicitly: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import recall_score

# Micro-averaged recall pools all test reviews together; for single-label
# classification this equals plain accuracy.
# NOTE(review): use average='macro' instead if the unweighted mean of the
# per-class recalls is the number that should be reported.
recall_score(y_true=y_test, y_pred=y_pred, average='micro')

0.8175

In [44]:
import csv

# Resolve the test file from the working directory (the same folder the
# dataset directory is resolved from) instead of hard-coding one machine's
# absolute path.
test_set_path = pathlib.Path.cwd() / "sentiment-topic-final-test.tsv"

test_set = []              # sentence texts, in file order
test_set_gold_labels = []  # gold sentiment label of each sentence

# newline="" is what the csv module asks for on files passed to csv.reader.
# The explicit encoding avoids depending on the platform default
# (assumes the file is UTF-8 — TODO confirm).
with open(test_set_path, newline="", encoding="utf-8") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    for line in tsv_file:
        # Blank lines come back as empty rows; skip them instead of crashing on line[1].
        if not line:
            continue
        text = line[1]
        # The header row holds the literal column name "text" in this column.
        if text == "text":
            continue
        sentiment = line[2]
        test_set.append(text)
        test_set_gold_labels.append(sentiment)
print(test_set)
print(test_set_gold_labels)

['It took eight years for Warner Brothers to recover from the disaster that was this movie.', 'All the New York University students love this diner in Soho so it makes for a fun young atmosphere.', 'This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food.', 'In conclusion, my review of this book would be: I like Jane Austen and understand why she is famous.', 'The story of this movie is focused on Carl Brashear played by Cuba Gooding Jr. who wants to be the first African American deep sea diver in the Navy.', "Chris O'Donnell stated that while filming for this movie, he felt like he was in a toy commercial.", 'My husband and I moved to Amsterdam 6 years ago and for as long as we have lived here, Blauwbrug has been our favorite place to eat!', 'Dame Maggie Smith performed her role excellently, as she does in all her movies.', 'The new movie by Mr. Kruno was shot in New York, but the story takes place in Los Angeles.', "I always

In [45]:
# Vectorize the new sentences with the already-fitted movie_vec so they are
# transformed in the same way as the training data.
new_counts = movie_vec.transform(raw_documents=test_set)
new_counts.shape

(10, 25139)

In [46]:
# Turn the new counts into TF-IDF values with the transformer fitted on the movie reviews.
test_new_tfidf = tfidf_transformer.transform(X=new_counts)

In [47]:
# Let the trained classifier label each new sentence.
pred = clf.predict(X=test_new_tfidf)


In [63]:
# Print the gold label and the model's label for every test sentence.
for review, gold_label, predicted_label in zip(test_set, test_set_gold_labels, pred):
    # Only the first three characters of the gold label are shown.
    expected_tag = gold_label[0:3]
    predicted_tag = movie_reviews_train.target_names[predicted_label]
    print('Expected: %s => %s' % (review, expected_tag))
    print('Prediction: %s => %s' % (review, predicted_tag))

Expected: It took eight years for Warner Brothers to recover from the disaster that was this movie. => neg
Prediction: It took eight years for Warner Brothers to recover from the disaster that was this movie. => neg
Expected: All the New York University students love this diner in Soho so it makes for a fun young atmosphere. => pos
Prediction: All the New York University students love this diner in Soho so it makes for a fun young atmosphere. => pos
Expected: This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food. => neg
Prediction: This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food. => pos
Expected: In conclusion, my review of this book would be: I like Jane Austen and understand why she is famous. => pos
Prediction: In conclusion, my review of this book would be: I like Jane Austen and understand why she is famous. => pos
Expected: The story of this movie is f

Error Analysis: 
The model correctly classified 5 of the 10 sentences in the test set. Of the 5 misclassified sentences, 3 had the gold label "neutral." The training data, the Cornell Movie Reviews dataset, has no "neutral" class, which accounts for 60% of the prediction errors: a model that never sees neutral examples during training cannot assign that label to any sentence. All neutral sentences were classified as positive, most likely because they expressed no explicitly negative sentiment. In the future, I would include neutral-labelled examples in the training data.  
The other two misclassified sentences were both compound sentences in which the first part is positive but the second part is negative, joined by the conjunction "but." It is possible that the model did not know how to handle sentences that are partly positive and partly negative, or that it was not trained on enough sentences with a negation in the middle. In addition, a bag-of-words model ignores word order, so it cannot tell that the clause after "but" carries the overall sentiment. Regardless, this type of sentence should be included more frequently in the training data.