In [None]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Widen the column display limit to 100 characters so long text is not cut short.
# The fully-qualified option name is used: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added later.
pd.set_option('display.max_colwidth', 100)

#### Let's build a basic bag-of-words model on eight sample documents

In [None]:
# Sample corpus: eight short sentences, one string per document.
documents = [
    "Gangs of Wasseypur is a great movie.",
    "Nawaz performance in Scared games is just amazing. ",
    "Ustad Zakir hussain is performing in new Delhi this evening on bollywood based theme.",
    "The success of a movie depends on the performance of the actors.",
    "There are no new movies releasing this week.",
    "Manoj bajpayee is one of the finest movie actor of his genre.",
    "OTT is now the prefered medium for cinema lovers than tradition theatre.",
    "Netflix is outperforming it's competions but Amazon prime not too far behind.",
]
print(documents)

In [None]:
# Cache for stop-word sets, keyed by language, so the list is fetched only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them on first use only."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text):
    """Lower-case a document and remove English stop words.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Fetch the stop words once per call (cached across calls). Calling
    # stopwords.words("english") inside the comprehension rebuilt the list for
    # every token and then scanned it linearly.
    stop_words = _english_stopwords()

    # change sentence to lower case, then tokenize into words
    tokens = word_tokenize(text.lower())

    # keep only the tokens that are not stop words
    kept = [token for token in tokens if token not in stop_words]

    # join words to make sentence
    return " ".join(kept)

# Run every sample document through preprocess_text and show the cleaned corpus.
documents = list(map(preprocess_text, documents))
print(documents)


#### Creating a bag-of-words model using scikit-learn's CountVectorizer

In [None]:
# Learn the vocabulary from the sample documents and build the document-term matrix.
vect = CountVectorizer()
model = vect.fit_transform(documents)

# Printing the sparse matrix lists the (row, column) position of each stored
# (non-zero) cell together with its count.
print(model)

In [None]:
# Expand the sparse matrix into a dense array (zeros included) and print it in full.
dense_counts = model.toarray()
print(dense_counts)

In [None]:
# Get the shape of the matrix created: (number of documents, number of unique
# words/features identified by the CountVectorizer). The original note reported
# 45 features for the sample documents.
print(model.shape)
# Get the feature names. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2; get_feature_names_out() is its replacement and returns an
# array, converted to a list here so it prints as before.
print(vect.get_feature_names_out().tolist())

We can see from the vectorized matrix above that there are no stopwords, but we can also see that a few words which convey the same meaning are included as separate features, such as 'performance' and 'performing', or 'actor' and 'actors'.

 ### Now that we know the basics of the bag-of-words model, let's create one on the spam dataset.
 https://en.wikipedia.org/wiki/Bag-of-words_model

In [None]:
# Load the spam CSV into a pandas DataFrame; the file is decoded as latin-1.
filename = "../input/sms-spam-collection-dataset/spam.csv"
spam = pd.read_csv(filename, encoding="latin-1")

# preview the first rows
spam.head()

In [None]:
# Drop unused columns. errors="ignore" keeps this cell re-runnable: after the
# first run `spam` no longer has these columns, and a plain drop would raise
# KeyError on the second run.
spam = spam.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
# Give the remaining columns descriptive names (a no-op if already renamed).
spam = spam.rename(columns={"v1": "label", "v2": "message"})
# Let's check the shape of the DataFrame
print(spam.shape)  # we have 5572 messages
spam.head()

In [None]:
# Pull the message column out of the DataFrame and show it.
messages = spam["message"]
print(messages)

#### Let's preprocess the messages by lower-casing, tokenizing and removing the stopwords from the text

In [None]:
# Run each message through preprocess_text and collect the results in a plain list.
messages = list(map(preprocess_text, messages))
print(messages)

In [None]:
# Bag-of-words model for the spam messages: a fresh CountVectorizer is fitted
# on the preprocessed messages, replacing the earlier `vect` and `model`.
vect = CountVectorizer()
model = vect.fit_transform(messages)

In [None]:
# Look at the document-term matrix as a DataFrame, one column per feature.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
pd.DataFrame(model.toarray(), columns=vect.get_feature_names_out())

In [None]:
# Let's have a look at the features we got using the CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns an array, converted to a list so it prints as before.
print(vect.get_feature_names_out().tolist())  # these features are the bag of words

We can see that, although we have done some preprocessing, some issues still exist: there are redundant features that do not add any new meaning, such as '00' and '000', or 'important' and 'importantly'. Such features could be represented by a single feature, so further processing is almost always required after creating a bag-of-words model.