<a href="https://colab.research.google.com/github/sherry-tang-97/bag_of_words/blob/main/bag_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
#Importing packages
import pandas as pd       
from bs4 import BeautifulSoup  
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from gensim.models import word2vec
import numpy as np

In [None]:
# Fetch the NLTK resources this notebook uses: the stop-word corpus
# (read through stopwords.words) and the Punkt tokenizer data
# (loaded through nltk.data.load).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the labeled training set, the test set and the unlabeled training set.
# All three are tab-separated files with a header row; quoting=3 is the value
# of csv.QUOTE_NONE, so quote characters are read as ordinary text.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv(
    "/content/drive/MyDrive/bag_of_words_data/labeledTrainData.tsv", **tsv_options
)
test = pd.read_csv(
    "/content/drive/MyDrive/bag_of_words_data/testData.tsv", **tsv_options
)
unlabeled_train = pd.read_csv(
    "/content/drive/MyDrive/bag_of_words_data/unlabeledTrainData.tsv", **tsv_options
)

# Report (rows, columns) of each frame.
for frame in (train, test, unlabeled_train):
    print(frame.shape)

(25000, 3)
(25000, 2)
(50000, 2)


In [None]:
#Data cleaning
def review_to_words( raw_review ):
    """Turn one raw movie review into a cleaned, space-separated string.

    HTML markup is stripped with BeautifulSoup, every character that is not
    an ASCII letter is replaced by a space, the text is lower-cased and split
    on whitespace, and NLTK's English stop words are dropped.

    Parameters
    ----------
    raw_review : str
        A single raw review, possibly containing HTML.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Strip the markup and keep only the text content.
    plain_text = BeautifulSoup(raw_review).get_text()
    # Blank out non-letters, normalise case and tokenise on whitespace.
    tokens = re.sub("[^a-zA-Z]", " ", plain_text).lower().split()
    # A set gives fast membership tests for the filter below.
    stop_set = set(stopwords.words("english"))
    kept_tokens = filter(lambda token: token not in stop_set, tokens)
    return " ".join(kept_tokens)



#Clean train reviews
# Every review is looked up by its integer label 0..size-1 and cleaned.
num_reviews_train = train["review"].size
clean_train_reviews = [
    review_to_words(train["review"][row]) for row in range(0, num_reviews_train)
]

#Clean test reviews
num_reviews_test = test["review"].size
clean_test_reviews = [
    review_to_words(test["review"][row]) for row in range(0, num_reviews_test)
]
   


In [None]:
#Feature Engineering
#Method 1: bag of words

# Word-level counts, vocabulary capped at 5000 entries.
vectorizer = CountVectorizer(analyzer="word", max_features=5000)

# Learn the vocabulary on the training reviews and densify the count matrix.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

# The test reviews are only transformed, re-using the fitted vocabulary.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

for feature_matrix in (train_data_features, test_data_features):
    print(feature_matrix.shape)


(25000, 5000)
(25000, 5000)


In [None]:
#Random Forest Model
# fit() returns the estimator itself, so construction and training are chained.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_features, train["sentiment"]
)


In [None]:
#Prediction
# Predict a sentiment label for every row of the bag-of-words test matrix
# with the forest fitted above, then show the resulting array.
result = forest.predict(test_data_features)
print(result)

[1 0 1 ... 1 1 1]


In [None]:
#Using word2vec for feature engineering

#Data cleaning
def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a piece of review text into a list of lower-case words.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML.
    remove_stopwords : bool, optional
        When true, NLTK's English stop words are filtered out
        (default: keep them).

    Returns
    -------
    list of str
        The words in their original order.
    """
    # Strip HTML, blank out everything that is not an ASCII letter,
    # then lower-case and split on whitespace.
    text_without_html = BeautifulSoup(review).get_text()
    tokens = re.sub("[^a-zA-Z]"," ", text_without_html).lower().split()

    if not remove_stopwords:
        return tokens

    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

# Load NLTK's English Punkt tokenizer from the 'punkt' resource downloaded
# earlier; review_to_sentences uses it to split reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)``.
    remove_stopwords : bool, optional
        Forwarded to ``review_to_wordlist``.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in order of appearance.
    """
    # Surrounding whitespace is stripped before sentence splitting;
    # empty sentences are skipped.
    return [
        review_to_wordlist( raw_sentence, remove_stopwords )
        for raw_sentence in tokenizer.tokenize(review.strip())
        if len(raw_sentence) > 0
    ]
  

In [None]:
# Collect the tokenised sentences of the labeled and the unlabeled training
# reviews (in that order) into one flat list.
sentences = []

for review_column in (train["review"], unlabeled_train["review"]):
    for review in review_column:
        sentences.extend(review_to_sentences(review, tokenizer))



In [None]:
#Train word2vec
# Hyper-parameters handed to word2vec.Word2Vec in the training cell below.
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words


In [None]:
# Train Word2Vec on the tokenised sentences.
# gensim >= 4 renamed the `size` keyword to `vector_size`. The new name is
# tried first; older releases reject the unknown keyword with a TypeError
# before any training starts, in which case the old name is used.
w2v_options = dict(workers=num_workers, min_count=min_word_count,
                   window=context, sample=downsampling)
try:
    model = word2vec.Word2Vec(sentences, vector_size=num_features, **w2v_options)
except TypeError:
    model = word2vec.Word2Vec(sentences, size=num_features, **w2v_options)

# Normalise the vectors in place (replace=True), then persist the model.
# NOTE(review): gensim 4 marks init_sims() as deprecated; it is kept here so
# the saved vectors match what the rest of the notebook was built on.
model.init_sims(replace=True)
model_name = "300features_40minwords_10context"
model.save(model_name)

In [None]:
# Nearest neighbours of "man" in the embedding space. The query goes through
# model.wv because the model-level most_similar() is deprecated in gensim 3
# and removed in gensim 4.
model.wv.most_similar("man")

In [None]:
# Reload the model saved by the training cell (same file name as
# `model_name`), so the following cells can run without retraining.
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")

In [None]:
# Stack the vector of every vocabulary word into one 2-D array.
# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use
# `vocab`. Rows follow the iteration order of that mapping, which is the
# order `list(model.wv.vocab)` yields on gensim 3.
keyed_vectors = model.wv
vocab_mapping = (keyed_vectors.key_to_index
                 if hasattr(keyed_vectors, "key_to_index")
                 else keyed_vectors.vocab)
ori_wv = keyed_vectors[list(vocab_mapping)]

In [None]:
# Standardise the word vectors with StandardScaler before the PCA below.
from sklearn.preprocessing import StandardScaler
std_wv=StandardScaler().fit_transform(ori_wv)


In [None]:
# Project the standardised word vectors onto their first two principal
# components; `pc` has one row per word and two columns.
from sklearn.decomposition import PCA
pc=PCA(n_components=2).fit_transform(std_wv)

In [None]:
# Cluster the 2-D PCA coordinates (not the full word vectors) into 1000
# groups; `label` holds the cluster id assigned to each row of `pc`.
# NOTE(review): no random_state is set, so cluster ids can differ between runs.
from sklearn.cluster import KMeans
 
kmeans = KMeans(n_clusters= 1000)
label = kmeans.fit_predict(pc)
 
print(label)

In [None]:
# One row per vocabulary word: its 2-D PCA coordinates, its cluster id and
# the word itself.
df = pd.DataFrame(pc, columns=['PC1', 'PC2'])
df['cluster'] = label
# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use
# `vocab`. Iterating the mapping yields the words.
keyed_vectors = model.wv
df['vocab'] = list(keyed_vectors.key_to_index
                   if hasattr(keyed_vectors, "key_to_index")
                   else keyed_vectors.vocab)

In [None]:
# Scatter plot of every word in PCA space, coloured by its cluster id.
import matplotlib.pyplot as plt
df.plot.scatter(x='PC1', y='PC2', c='cluster', colormap='viridis')

In [None]:
# Keep only the words assigned to cluster 6 for a closer look.
# NOTE(review): cluster ids come from an unseeded KMeans run, so cluster 6
# is not guaranteed to contain the same words after a re-run.
df1=df[df.cluster==6]

In [None]:
import matplotlib.pyplot as plt

# Coordinates and words of the selected cluster as plain lists.
x, y, l = (list(df1[column]) for column in ('PC1', 'PC2', 'vocab'))

plt.scatter(x, y)

# Write every word next to its point.
for txt, x_pos, y_pos in zip(l, x, y):
    plt.annotate(txt, (x_pos, y_pos))


In [None]:
# Nearest neighbours of 'great'. The query goes through model.wv because the
# model-level most_similar() is deprecated in gensim 3 and removed in gensim 4.
model.wv.most_similar('great')

In [None]:
# Words most similar to 'great'; most_similar yields (word, score) pairs and
# only the word is kept. model.wv is queried because the model-level
# most_similar() is removed in gensim 4.
syn = [word for word, _score in model.wv.most_similar('great')]

In [None]:
# Words most similar to 'good'; most_similar yields (word, score) pairs and
# only the word is kept. model.wv is queried because the model-level
# most_similar() is removed in gensim 4.
syn_good = [word for word, _score in model.wv.most_similar('good')]

In [None]:
# Neighbours of 'great' followed by neighbours of 'good'.
# NOTE(review): the name `all` shadows Python's built-in all() for the rest
# of the session; it is kept because the next cell reads it.
all=syn+syn_good

In [None]:
# Rows of `df` whose word is a neighbour of 'great' or 'good'. The two lists
# are combined here directly so the cell does not depend on the global named
# `all`, which shadows the built-in all().
plot = df[df.vocab.isin(syn + syn_good)]

In [None]:
import matplotlib.pyplot as plt

# Coordinates and words of the selected neighbours as plain lists.
x, y, l = (list(plot[column]) for column in ('PC1', 'PC2', 'vocab'))

plt.scatter(x, y)

# Write every word next to its point.
for txt, x_pos, y_pos in zip(l, x, y):
    plt.annotate(txt, (x_pos, y_pos))

plt.title('Word Embedding Space')
plt.xlabel('PC1')
plt.ylabel('PC2')


In [None]:
#Feature Engineering
#Method 1: vector averaging

def makeFeatureVec(words, model, num_features):
    """Average the vectors of all in-vocabulary words of one review.

    Parameters
    ----------
    words : iterable of str
        The words of a single review.
    model : object
        Trained embedding model exposing ``model.wv`` (word -> vector lookup
        plus the vocabulary list).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features``. When none of the words is
        in the vocabulary the zero vector is returned instead of dividing by
        zero, which would fill the row with NaN and break the classifier fit.
    """
    keyed_vectors = model.wv
    # gensim >= 4 names the vocabulary list `index_to_key`, older releases
    # `index2word`. A set makes the membership test in the loop fast.
    if hasattr(keyed_vectors, "index_to_key"):
        known_words = set(keyed_vectors.index_to_key)
    else:
        known_words = set(keyed_vectors.index2word)

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in known_words:
            nwords += 1
            # Vectors are read through model.wv; indexing the model object
            # itself is deprecated in gensim 3 and removed in gensim 4.
            featureVec = np.add(featureVec, keyed_vectors[word])

    if nwords == 0:
        # Nothing to average: keep the all-zero vector.
        return featureVec
    return np.divide(featureVec, nwords)


def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Parameters
    ----------
    reviews : sequence of list of str
        Each review as a list of words.
    model : object
        Embedding model forwarded to ``makeFeatureVec``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(reviews), num_features)``.
    """
    total = len(reviews)
    # Allocate the full result up front and fill it row by row.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for position, review in enumerate(reviews):
        # Status message on every 1000th review.
        if position % 1000 == 0:
            print ("Review %d of %d" % (position, total))
        reviewFeatureVecs[position] = makeFeatureVec(review, model, num_features)

    return reviewFeatureVecs

In [None]:
# Re-tokenise the training reviews as word lists without stop words, then
# average their word vectors.
clean_train_reviews = [
    review_to_wordlist( review, remove_stopwords=True )
    for review in train["review"]
]
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

print ("Creating average feature vecs for test reviews")
# Same treatment for the test reviews.
clean_test_reviews = [
    review_to_wordlist( review, remove_stopwords=True )
    for review in test["review"]
]
testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

In [None]:
#Fit a random forest
# fit() returns the estimator itself, so construction and training are chained.
forest = RandomForestClassifier( n_estimators = 100 ).fit(
    trainDataVecs, train["sentiment"]
)

#Predict
result = forest.predict( testDataVecs )

In [None]:
# spaCy pipeline and the punctuation characters used by extract_keywords.
import spacy
from string import punctuation
nlp = spacy.load('en_core_web_sm')

In [None]:
# Attach the latest predictions (`result`) to the test frame as a new column.
test['sentiment']=result

In [None]:
# Share of test reviews predicted positive (label 1). The denominator is the
# actual number of rows rather than a hard-coded 25000.
test.sentiment.sum() / len(test)

In [None]:
# Percentage of reviews predicted positive / negative, computed from the
# predictions so the chart stays correct when the model is re-run.
positive_pct = 100 * test.sentiment.sum() / len(test)
porportion = [positive_pct, 100 - positive_pct]

# colors
colors = ['green', 'orange']

labels=['Positive', 'Negative']
# Pie Chart
plt.pie(porportion, colors=colors, labels=labels,
        autopct='%1.1f%%', pctdistance=0.85)

# draw circle (turns the pie into a donut)
centre_circle = plt.Circle((0, 0), 0.65, fc='white')
fig = plt.gcf()

# Adding Circle in Pie chart
fig.gca().add_artist(centre_circle)

# Adding Title of chart
plt.title('Distribution of Positive vs Negative Reviews')
# The total shown in the centre is read from the frame as well.
plt.text(0, 0, '%d Reviews in Total' % len(test), ha='center', va='center', fontsize=9.5)
# Displaying Chart
plt.show()


In [None]:
# Split the test frame by predicted label. reset_index() renumbers the rows
# from 0 and keeps the previous row labels in an extra "index" column.
neg = test[test.sentiment == 0].reset_index()
pos = test[test.sentiment == 1].reset_index()

In [None]:
def clean_review( review, remove_stopwords=False ):
    """Clean one review and return it as a single lower-case string.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML.
    remove_stopwords : bool, optional
        When true, NLTK's English stop words are removed (default: keep them).

    Returns
    -------
    str
        The text with HTML stripped, every non-letter replaced by a space and
        all letters lower-cased. With ``remove_stopwords`` the remaining
        words are joined by single spaces.
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case
    review_text = review_text.lower()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        # The text has to be split into words first: iterating the string
        # itself would test single characters against the stop-word list and
        # return a list of characters. Re-joining keeps the return type a
        # string, as in the default branch.
        review_text = " ".join(w for w in review_text.split() if w not in stops)

    # 5. Return the cleaned text
    return review_text

In [None]:
# Clean the text of every review predicted positive ...
num_pos = pos['review'].size
clean_pos = [clean_review(pos['review'][row]) for row in range(0, num_pos)]

# ... and of every review predicted negative.
num_neg = neg['review'].size
clean_neg = [clean_review(neg['review'][row]) for row in range(0, num_neg)]

In [None]:
def extract_keywords(nlp, sequence, pos_tag, special_tags : list = None):
    """Extract a de-duplicated list of keywords from ``sequence``.

    Keywords are gathered from three sources, in this order:

    1. tokens whose text equals one of ``special_tags`` (lower-cased);
    2. for every noun chunk of the parsed text, the chunk's tokens whose
       part of speech is in ``pos_tag``, joined by single spaces;
    3. every token whose part of speech is in ``pos_tag``, unless its text
       is one of ``nlp.Defaults.stop_words`` or occurs in
       ``string.punctuation``.

    Parameters
    ----------
    nlp : callable
        spaCy-style pipeline; ``nlp(text)`` must return an iterable of tokens
        (with ``text`` and ``pos_``) that also exposes ``noun_chunks``.
    sequence : str
        Text to analyse; it is lower-cased before parsing.
    pos_tag : container of str
        Part-of-speech tags to keep, e.g. ``['PROPN', 'NOUN', 'ADJ']``.
    special_tags : list of str, optional
        Words that are always reported when they occur in the text.

    Returns
    -------
    list of str
        Unique keywords in order of first appearance, so repeated calls on
        the same input give the same list.
    """
    result = []

    # create a spacy doc object by calling the nlp object on the input sequence
    doc = nlp(sequence.lower())

    # if special tags are given and exist in the input sequence
    # add them to results by default
    if special_tags:
        tags = {tag.lower() for tag in special_tags}
        result.extend(token.text for token in doc if token.text in tags)

    # Per noun chunk, keep only the tokens with a wanted part of speech.
    for chunk in doc.noun_chunks:
        chunk_words = [token.text for token in chunk if token.pos_ in pos_tag]
        if chunk_words:
            result.append(" ".join(chunk_words).strip())

    # Single tokens with a wanted part of speech, minus stop words/punctuation.
    stop_words = nlp.Defaults.stop_words
    for token in doc:
        if token.text in stop_words or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            result.append(token.text)

    # dict.fromkeys drops duplicates but keeps first-seen order; list(set(...))
    # returned the keywords in an order that can change between runs.
    return list(dict.fromkeys(result))


Positive reviews

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

In [None]:
# Generic words filtered out of the keyword lists before the noun word
# clouds below are drawn.
unwanted=['film', 'movie', 'time', 'movies', 'films']

In [None]:
# Number of cleaned negative reviews; the cells below read the first
# 1000 / 2000 of them.
len(clean_neg)

In [None]:
# Noun keywords of (at most) the first 1000 positive reviews.
# Slicing instead of indexing over range(1000) avoids an IndexError when
# fewer reviews are available, and str.join replaces the repeated string
# concatenation inside the loop.
keys = ' '.join(
    ' '.join(extract_keywords(nlp, review_text, pos_tag=['NOUN']))
    for review_text in clean_pos[:1000]
)

# Multi-word keywords are split into single words; the generic words listed
# in `unwanted` are dropped.
prefilter=keys.split()
filtered=[word for word in prefilter if word not in unwanted]
final=' '.join(filtered)

wordcloud = WordCloud(stopwords = STOPWORDS,
                      collocations=True).generate(final)
plt.imshow(wordcloud, interpolation='bilInear')
plt.title('Keywords for Positive Reviews')
plt.axis('off')
plt.show()

Negative reviews

In [None]:
# Noun keywords of (at most) the first 1000 negative reviews.
# Slicing instead of indexing over range(1000) avoids an IndexError when
# fewer reviews are available, and str.join replaces the repeated string
# concatenation inside the loop.
keys = ' '.join(
    ' '.join(extract_keywords(nlp, review_text, pos_tag=['NOUN']))
    for review_text in clean_neg[:1000]
)

# Multi-word keywords are split into single words; the generic words listed
# in `unwanted` are dropped.
prefilter=keys.split()
filtered=[word for word in prefilter if word not in unwanted]
final=' '.join(filtered)

wordcloud = WordCloud(stopwords = STOPWORDS,
                      collocations=True).generate(final)
plt.imshow(wordcloud, interpolation='bilInear')
plt.title('Keywords for Negative Reviews')
plt.axis('off')
plt.show()

What about adjectives?

In [None]:
# Replaces the earlier `unwanted` list: common adjectives to drop from the
# word clouds drawn below.
unwanted=['good', 'bad', 'great', 'little', 'young', 'old', 'many', 'best', 'better', 'first', 'worst', 'real']

In [None]:
# Keywords of (at most) the first 500 positive reviews.
# Slicing instead of indexing over range(500) avoids an IndexError when fewer
# reviews are available, and str.join replaces the repeated string
# concatenation inside the loop.
# NOTE(review): this cell extracts 'VERB' tokens, while the `unwanted` list
# holds adjectives and the matching negative-review cell uses 'ADJ' -
# confirm which part of speech is intended here.
keys = ' '.join(
    ' '.join(extract_keywords(nlp, review_text, pos_tag=['VERB']))
    for review_text in clean_pos[:500]
)

# Multi-word keywords are split into single words; the words listed in
# `unwanted` are dropped.
prefilter=keys.split()
filtered=[word for word in prefilter if word not in unwanted]
final=' '.join(filtered)

wordcloud = WordCloud(stopwords = STOPWORDS,
                      collocations=True).generate(final)
plt.imshow(wordcloud, interpolation='bilInear')
plt.axis('off')
plt.show()

In [None]:
# Adjective keywords of (at most) the first 2000 negative reviews.
# Slicing instead of indexing over range(2000) avoids an IndexError when
# fewer reviews are available, and str.join replaces the repeated string
# concatenation inside the loop.
keys = ' '.join(
    ' '.join(extract_keywords(nlp, review_text, pos_tag=['ADJ']))
    for review_text in clean_neg[:2000]
)

# Multi-word keywords are split into single words; the words listed in
# `unwanted` are dropped.
prefilter=keys.split()
filtered=[word for word in prefilter if word not in unwanted]
final=' '.join(filtered)

wordcloud = WordCloud(stopwords = STOPWORDS,
                      collocations=True).generate(final)
plt.imshow(wordcloud, interpolation='bilInear')
plt.axis('off')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# The matrix of word vectors, one row per vocabulary word. Current gensim
# releases expose it as `wv.vectors`; `wv.syn0` is the older name and no
# longer exists in gensim 4.
keyed_vectors = model.wv
word_vectors = (keyed_vectors.vectors
                if hasattr(keyed_vectors, "vectors")
                else keyed_vectors.syn0)

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
num_clusters = int(word_vectors.shape[0] / 5)

# Initialize a k-means object and assign every word vector to a cluster
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

In [None]:
# Map every vocabulary word to the cluster id of its vector. The word list
# is called `index_to_key` in gensim >= 4 and `index2word` in older releases.
keyed_vectors = model.wv
vocabulary_words = (keyed_vectors.index_to_key
                    if hasattr(keyed_vectors, "index_to_key")
                    else keyed_vectors.index2word)
word_centroid_map = dict(zip(vocabulary_words, idx))
def create_bag_of_centroids( wordlist, word_centroid_map ):
    """Count how many words of a review fall into each word cluster.

    Parameters
    ----------
    wordlist : iterable of str
        The words of one review.
    word_centroid_map : dict
        Maps a vocabulary word to its integer cluster index.

    Returns
    -------
    numpy.ndarray
        float32 vector whose length is the highest cluster index in the map
        plus one; entry ``k`` is the number of words of the review that
        belong to cluster ``k``. Words missing from the map are ignored. An
        empty map yields an empty vector instead of raising ValueError.
    """
    # The number of clusters is equal to the highest cluster index
    # in the word / centroid map (default=-1 covers the empty map).
    num_centroids = max( word_centroid_map.values(), default=-1 ) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # For every word that is in the vocabulary, increment the count of the
    # cluster it belongs to.
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

In [None]:
# One row per training review, one column per cluster (allocated up front).
train_centroids = np.zeros((train["review"].size, num_clusters), dtype="float32")

# Fill each row with the review's bag of centroids.
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

# Same layout and procedure for the test reviews.
test_centroids = np.zeros((test["review"].size, num_clusters), dtype="float32")

for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

In [None]:
# Train a 100-tree random forest on the bag-of-centroids features and
# predict the test labels. fit() returns the estimator itself, so
# construction and training are chained.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_centroids, train["sentiment"]
)
result = forest.predict(test_centroids)