In [1]:
# NLTK - Natural Language Toolkit

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import string

In [4]:
# Toy corpus: three short movie-review strings that the preprocessing
# cells below clean, tokenize and vectorize.
documents = [
    "the movie was very boring and I watched it till interval only... don't waste your money",
    "I really enjoyed the movie and the acting was great...!!!",
    "Too much wastage of money for this kind of movie... never watch this type of movie again"
]

In [5]:
# Inspect the punctuation characters defined by the `string` module.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
# Tokenize the raw first review: "..." stays as its own token and
# "don't" is split into "do" + "n't".
print(word_tokenize(documents[0]))

['the', 'movie', 'was', 'very', 'boring', 'and', 'I', 'watched', 'it', 'till', 'interval', 'only', '...', 'do', "n't", 'waste', 'your', 'money']


In [7]:
# With two empty leading arguments, str.maketrans maps the code point of
# every character in the third argument to None.
print(str.maketrans('','',"abc"))

{97: None, 98: None, 99: None}


In [8]:
# Same mapping built from every punctuation character: each code point -> None.
print(str.maketrans('','',string.punctuation))

{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [9]:
# Translation table that removes all punctuation when passed to str.translate.
table = str.maketrans('','',string.punctuation)

In [10]:
# Preview on the first review: "..." disappears and "don't" collapses to "dont".
documents[0].translate(table)

'the movie was very boring and I watched it till interval only dont waste your money'

In [11]:
# Normalise every review in place (same list object): strip punctuation
# via the translation table, then lowercase the result.
documents[:] = [review.translate(table).lower() for review in documents]

In [12]:
documents

['the movie was very boring and i watched it till interval only dont waste your money',
 'i really enjoyed the movie and the acting was great',
 'too much wastage of money for this kind of movie never watch this type of movie again']

In [13]:
# One token list per cleaned review, in the same order as `documents`.
tokens = [word_tokenize(review) for review in documents]

In [14]:
# Show the token lists, one per review.
print(tokens)

[['the', 'movie', 'was', 'very', 'boring', 'and', 'i', 'watched', 'it', 'till', 'interval', 'only', 'dont', 'waste', 'your', 'money'], ['i', 'really', 'enjoyed', 'the', 'movie', 'and', 'the', 'acting', 'was', 'great'], ['too', 'much', 'wastage', 'of', 'money', 'for', 'this', 'kind', 'of', 'movie', 'never', 'watch', 'this', 'type', 'of', 'movie', 'again']]


In [15]:
# Display NLTK's English stopword list; it includes apostrophised
# entries such as "you're".
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
eng_stopwords = stopwords.words("english")
# Membership tests against a list scan every entry; a set makes each
# lookup constant-time. The list itself is left untouched.
stopword_lookup = set(eng_stopwords)

# For each review keep only the tokens that are not English stopwords,
# preserving token order and one inner list per review.
# NOTE(review): the reviews had punctuation stripped earlier, so a token like
# "dont" will not match apostrophised stopword entries -- confirm this is intended.
wordsList = [
    [token for token in tokenList if token not in stopword_lookup]
    for tokenList in tokens
]

In [17]:
# Inspect the tokens that remain after stopword removal.
print(wordsList)

[['movie', 'boring', 'watched', 'till', 'interval', 'dont', 'waste', 'money'], ['really', 'enjoyed', 'movie', 'acting', 'great'], ['much', 'wastage', 'money', 'kind', 'movie', 'never', 'watch', 'type', 'movie']]


In [18]:
# Lemmatizer instance reused by the cells below.
wnet = WordNetLemmatizer()

In [19]:
# Without a pos argument, "playing" comes back unchanged.
wnet.lemmatize("playing")

'playing'

In [20]:
# pos = part of speech; 'v' = verb.
# Lemmatized as a verb, "playing" reduces to "play".
wnet.lemmatize("playing", pos='v')

'play'

In [21]:
# As a verb, "watching" -> "watch".
wnet.lemmatize("watching", pos='v')

'watch'

In [22]:
# Irregular past tense is handled too: "bought" -> "buy".
wnet.lemmatize("bought", pos='v')

'buy'

In [23]:
# As a verb, "flying" -> "fly".
wnet.lemmatize("flying", pos='v')

'fly'

In [24]:
# As a verb, "driving" -> "drive".
wnet.lemmatize("driving", pos='v')

'drive'

In [25]:
# "driver" is returned unchanged when lemmatized as a verb.
wnet.lemmatize("driver", pos='v')

'driver'

In [26]:
# Lemmatize every remaining token as a verb, rewriting each inner list
# in place so `wordsList` keeps the same list objects.
for doc_words in wordsList:
    doc_words[:] = [wnet.lemmatize(word, pos='v') for word in doc_words]

In [27]:
# Inspect the lemmatized tokens (e.g. "boring" -> "bore", "enjoyed" -> "enjoy").
print(wordsList)

[['movie', 'bore', 'watch', 'till', 'interval', 'dont', 'waste', 'money'], ['really', 'enjoy', 'movie', 'act', 'great'], ['much', 'wastage', 'money', 'kind', 'movie', 'never', 'watch', 'type', 'movie']]


In [28]:
# Re-join each review's tokens into one space-separated string.
final_list = [" ".join(doc_words) for doc_words in wordsList]

In [29]:
# Show the re-joined, preprocessed reviews.
final_list

['movie bore watch till interval dont waste money',
 'really enjoy movie act great',
 'much wastage money kind movie never watch type movie']

In [30]:
# TF-IDF vectorizer created with its default settings.
tfidf = TfidfVectorizer()

In [31]:
# Fit the vectorizer on the preprocessed reviews.
tfidf.fit(final_list)

TfidfVectorizer()

In [32]:
# Learned vocabulary: maps each term to an integer index (0-16 for the 17 terms here).
print(tfidf.vocabulary_)

{'movie': 8, 'bore': 1, 'watch': 16, 'till': 12, 'interval': 5, 'dont': 2, 'waste': 15, 'money': 7, 'really': 11, 'enjoy': 3, 'act': 0, 'great': 4, 'much': 9, 'wastage': 14, 'kind': 6, 'never': 10, 'type': 13}


In [33]:
# Fit a fresh vectorizer on the preprocessed reviews and display the
# resulting weights as a dense array (one row per review).
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(final_list)
tfidf_matrix.toarray()

array([[0.        , 0.39206263, 0.39206263, 0.        , 0.        ,
        0.39206263, 0.        , 0.29817373, 0.2315585 , 0.        ,
        0.        , 0.        , 0.39206263, 0.        , 0.        ,
        0.39206263, 0.29817373],
       [0.47952794, 0.        , 0.        , 0.47952794, 0.47952794,
        0.        , 0.        , 0.        , 0.28321692, 0.        ,
        0.        , 0.47952794, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.36388646, 0.27674503, 0.42983441, 0.36388646,
        0.36388646, 0.        , 0.        , 0.36388646, 0.36388646,
        0.        , 0.27674503]])