In [2]:
import pathlib
import sklearn
import numpy
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
# Resolve the dataset folder relative to the directory the notebook runs in,
# then report the resolved path and whether it is present on disk.
cwd = pathlib.Path.cwd()
movie_reviews_folder = cwd / 'movie_reviews_dataset'
print('path:', movie_reviews_folder)
print('this will print True if the folder exists:', movie_reviews_folder.exists())

path: /Users/soniadias/Desktop/classes/text mining for ai /ba-text-mining/lab_sessions/movie_reviews_dataset
this will print True if the folder exists: True


In [7]:
# Read every file below the dataset folder; the whole corpus serves as training data.
movie_reviews_train = load_files(str(movie_reviews_folder))
print(f"Length of dataset: {len(movie_reviews_train.data)}")

# Tally the reviews per target index and print each tally next to its class name.
freqs = Counter(movie_reviews_train.target)
for category in freqs:
    print(movie_reviews_train.target_names[category], freqs[category])

Length of dataset: 2000
neg 1000
pos 1000


In [13]:
# Build the bag-of-words count matrix for all reviews.
# NOTE(review): the stop list is compared against the nltk tokens, so entries that
# the tokenizer splits (e.g. contractions) are presumably not filtered -- fragments
# such as "'s" and "'ll" still show up in the vocabulary. Confirm whether that is intended.
movie_vec = CountVectorizer(
    min_df=2,  # ignore tokens that occur in fewer than 2 documents (document frequency, not total count)
    tokenizer=nltk.word_tokenize,  # we use the nltk tokenizer
    token_pattern=None,  # not used when a tokenizer is supplied; None avoids the unused-parameter warning
    stop_words=stopwords.words('english'),  # stopwords are removed
)
movie_counts = movie_vec.fit_transform(movie_reviews_train.data)




In [14]:
# Vocabulary size, followed by the first 100 feature names.
print(len(movie_vec.vocabulary_))
print(list(movie_vec.get_feature_names_out()[:100]))

25139
['\x14', '\x16', '!', '#', '$', '%', '&', "'", "''", "'40s", "'50s", "'60s", "'70s", "'80s", "'90", "'90s", "'94", "'96", "'97", "'98", "'being", "'bout", "'carry", "'cause", "'comedy", "'d", "'dark", "'do", "'em", "'end", "'funny", "'ghetto", "'good", "'if", "'ll", "'m", "'matron", "'n", "'new", "'no", "'normal", "'nuff", "'psycho", "'re", "'romeo", "'round", "'rumble", "'s", "'secret", "'star", "'straight", "'t", "'the", "'three", "'til", "'till", "'ve", "'what", "'when", "'you", '(', ')', '*', '+', '+2', '+3', '+4', ',', '-', '--', '-1', '-4', '-and', '-awarded', '-but', '-ed', '-esque', '-give', '-it', '-jack', '-like', '-lite', '-reviewed', '-ridden', '-so', '-type', '.', '/', '//filmfreakcentral', '//www', '0', '00', '000', '007', '05', '1', '1/10', '1/2', '1/25th', '1/29/97']


In [15]:
# Learn IDF weights from the raw count matrix, then re-weight those counts as TF-IDF.
tfidf_transformer = TfidfTransformer()
movie_tfidf = tfidf_transformer.fit(movie_counts).transform(movie_counts)

In [16]:
# Preparing the data for the Naive Bayes classifier.

# Multinomial Naive Bayes is the model fitted in the next cell.
from sklearn.naive_bayes import MultinomialNB

# Split data into training and test sets
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle -- and therefore the reported score --
# identical on every Restart & Run All.
# NOTE(review): the count vocabulary and IDF weights were fitted on all reviews
# before this split, so the held-out rows have influenced the features; confirm
# whether that is acceptable for this evaluation.
docs_train, docs_test, y_train, y_test = train_test_split(
    movie_tfidf,  # the TF-IDF matrix of all reviews
    movie_reviews_train.target,  # the category value of each review
    test_size=0.20,  # we use 80% for training and 20% for development
    random_state=42,
)


In [17]:
# Fit a Multinomial Naive Bayes classifier on the training split.
clf = MultinomialNB().fit(X=docs_train, y=y_train)

In [18]:
# Predict labels for the held-out test split; recall is computed from these predictions.
y_pred = clf.predict(X=docs_test)

In [19]:
# Import the metric explicitly: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.metrics` submodule has been loaded.
from sklearn.metrics import recall_score

# Micro-averaged recall over the held-out split (all classes pooled).
# NOTE(review): if macro-averaged recall is what is wanted, pass average='macro'
# instead -- confirm which one is intended.
recall_score(y_true=y_test, y_pred=y_pred, average='micro')

0.8325

In [32]:
import csv

# Test sentences and their gold sentiment labels, filled in row by row from the TSV.
test_set = []
test_set_gold_labels = []

# The TSV is looked up in the working directory (the same directory that holds
# the movie_reviews_dataset folder) instead of through a machine-specific
# absolute path, so the notebook also runs on other machines.
test_file_path = pathlib.Path.cwd().joinpath('sentiment-topic-final-test.tsv')

# newline='' is what the csv module expects for files it parses; the explicit
# encoding keeps the read independent of the platform default.
with open(test_file_path, encoding='utf-8', newline='') as file:
    tsv_file = csv.reader(file, delimiter="\t")
    for line in tsv_file:
        # Blank or truncated rows have no text/sentiment columns to read.
        if len(line) < 3:
            continue
        text = line[1]
        # The header row carries the literal column name in the text column.
        if text == "text":
            continue
        sentiment = line[2]
        test_set.append(text)
        test_set_gold_labels.append(sentiment)
print(test_set)
print(test_set_gold_labels)

['It took eight years for Warner Brothers to recover from the disaster that was this movie.', 'All the New York University students love this diner in Soho so it makes for a fun young atmosphere.', 'This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food.', 'In conclusion, my review of this book would be: I like Jane Austen and understand why she is famous.', 'The story of this movie is focused on Carl Brashear played by Cuba Gooding Jr. who wants to be the first African American deep sea diver in the Navy.', "Chris O'Donnell stated that while filming for this movie, he felt like he was in a toy commercial.", 'My husband and I moved to Amsterdam 6 years ago and for as long as we have lived here, Blauwbrug has been our favorite place to eat!', 'Dame Maggie Smith performed her role excellently, as she does in all her movies.', 'The new movie by Mr. Kruno was shot in New York, but the story takes place in Los Angeles.', "I always

In [24]:
# Vectorise the test sentences with the already-fitted movie_vec, i.e. with the
# same vocabulary that was learned from the training reviews.
new_counts = movie_vec.transform(raw_documents=test_set)
new_counts.shape

(10, 25139)

In [25]:
# Re-weight the test counts with the already-fitted TF-IDF transformer.
test_new_tfidf = tfidf_transformer.transform(X=new_counts)

In [26]:
# Let the trained classifier label each test sentence.
pred = clf.predict(X=test_new_tfidf)

In [28]:
# Show every test sentence next to the class name the model assigned to it.
for review, predicted_label in zip(test_set, pred):
    label_name = movie_reviews_train.target_names[predicted_label]
    print(f'{review} => {label_name}')

It took eight years for Warner Brothers to recover from the disaster that was this movie. => pos
All the New York University students love this diner in Soho so it makes for a fun young atmosphere. => pos
This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food. => pos
In conclusion, my review of this book would be: I like Jane Austen and understand why she is famous. => pos
The story of this movie is focused on Carl Brashear played by Cuba Gooding Jr. who wants to be the first African American deep sea diver in the Navy. => pos
Chris O'Donnell stated that while filming for this movie, he felt like he was in a toy commercial. => neg
My husband and I moved to Amsterdam 6 years ago and for as long as we have lived here, Blauwbrug has been our favorite place to eat! => pos
Dame Maggie Smith performed her role excellently, as she does in all her movies. => pos
The new movie by Mr. Kruno was shot in New York, but the story takes pla

In [33]:
# Show every test sentence next to its gold label from the test file.
for sentence, gold_label in zip(test_set, test_set_gold_labels):
    print(f'{sentence} => {gold_label}')

It took eight years for Warner Brothers to recover from the disaster that was this movie. => negative
All the New York University students love this diner in Soho so it makes for a fun young atmosphere. => positive
This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food. => negative
In conclusion, my review of this book would be: I like Jane Austen and understand why she is famous. => positive
The story of this movie is focused on Carl Brashear played by Cuba Gooding Jr. who wants to be the first African American deep sea diver in the Navy. => neutral
Chris O'Donnell stated that while filming for this movie, he felt like he was in a toy commercial. => neutral
My husband and I moved to Amsterdam 6 years ago and for as long as we have lived here, Blauwbrug has been our favorite place to eat! => positive
Dame Maggie Smith performed her role excellently, as she does in all her movies. => positive
The new movie by Mr. Kruno was sho