# Bag of words pipeline
- Get the data
- Tokenisation, stopword removal
- stemming/lemitisation
- Building a vocab
- vectorization
- classification

In [4]:
sample_text="""I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far. My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."""

# Tokenize

In [5]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [6]:
sentence=sent_tokenize(sample_text)
print(sentence)

['I loved this movie since I was 7 and I saw it on the opening day.', 'It was so touching and beautiful.', 'I strongly recommend seeing for all.', "It's a movie to watch with your family by far.", 'My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language.']


In [7]:
words=[]
for sent in sentence:
    w=word_tokenize(sent)
    words.append(w)

In [8]:
print(words)

[['I', 'loved', 'this', 'movie', 'since', 'I', 'was', '7', 'and', 'I', 'saw', 'it', 'on', 'the', 'opening', 'day', '.'], ['It', 'was', 'so', 'touching', 'and', 'beautiful', '.'], ['I', 'strongly', 'recommend', 'seeing', 'for', 'all', '.'], ['It', "'s", 'a', 'movie', 'to', 'watch', 'with', 'your', 'family', 'by', 'far', '.'], ['My', 'MPAA', 'rating', ':', 'PG-13', 'for', 'thematic', 'elements', ',', 'prolonged', 'scenes', 'of', 'disastor', ',', 'nudity/sexuality', 'and', 'some', 'language', '.']]


# stopword removal

In [10]:
from nltk.corpus import stopwords

In [13]:
sw=set(stopwords.words('English'))

In [23]:
def removal_stopwords(words,stopwords):
    li=[]
    for ww in words:
        usefulwords=[w for w in ww if w not in stopwords]
        li.append(usefulwords)
    
    return li

In [24]:
li=removal_stopwords(words,sw)

In [25]:
print(li)

[['I', 'loved', 'movie', 'since', 'I', '7', 'I', 'saw', 'opening', 'day', '.'], ['It', 'touching', 'beautiful', '.'], ['I', 'strongly', 'recommend', 'seeing', '.'], ['It', "'s", 'movie', 'watch', 'family', 'far', '.'], ['My', 'MPAA', 'rating', ':', 'PG-13', 'thematic', 'elements', ',', 'prolonged', 'scenes', 'disastor', ',', 'nudity/sexuality', 'language', '.']]


# stemming

In [47]:
from nltk.stem.snowball import SnowballStemmer

In [48]:
ss=SnowballStemmer('english')

In [49]:
def stemming(li):
    stemmed_doc=[]
    
    for l in li:
        stemmed_sent=[]
        for words in l:
            w=ss.stem(words)
            stemmed_sent.append(w)
        stemmed_doc.append(stemmed_sent)
    
    return stemmed_doc 
    

In [50]:
doc=stemming(li)

In [56]:
data=[]
for w in doc:
    sentence=" ".join(w)
    data.append(sentence)

In [57]:
print(data)

['i love movi sinc i 7 i saw open day .', 'it touch beauti .', 'i strong recommend see .', "it 's movi watch famili far .", 'my mpaa rate : pg-13 themat element , prolong scene disastor , nudity/sexu languag .']


# vectorisation

In [58]:
from sklearn.feature_extraction.text import CountVectorizer 

In [59]:
cv=CountVectorizer()

In [64]:
transcformed_data=cv.fit_transform(data).toarray()

In [65]:
print(transcformed_data)

[[0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0]
 [0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [1 0 0 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0]]


In [66]:
print(cv.vocabulary_)

{'love': 9, 'movi': 10, 'sinc': 23, 'saw': 19, 'open': 14, 'day': 2, 'it': 7, 'touch': 26, 'beauti': 1, 'strong': 24, 'recommend': 18, 'see': 21, 'watch': 27, 'famili': 5, 'far': 6, 'my': 12, 'mpaa': 11, 'rate': 17, 'pg': 15, '13': 0, 'themat': 25, 'element': 4, 'prolong': 16, 'scene': 20, 'disastor': 3, 'nudity': 13, 'sexu': 22, 'languag': 8}


######################################