In [1]:
import nltk

In [2]:
# Sample paragraph used by the tokenization cells; the three sentences are
# written as adjacent literals, which Python joins into one string.
text = (
    "Backgammon is one of the oldest known board games. "
    "Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. "
    "It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."
)

In [3]:
# Echo the sample paragraph so the raw input is visible before it is tokenized.
print(text)

Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.


In [4]:
# Sentence tokenization: split the paragraph into sentences and print each one
# followed by an empty line.
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    print(sentence, end="\n\n")

Backgammon is one of the oldest known board games.

Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.



In [5]:
# Word tokenization: print the token list of every sentence, each followed by
# an empty line.
for words in map(nltk.word_tokenize, sentences):
    print(words, end="\n\n")

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']

['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']

['It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']



In [6]:
# Stemmer and lemmatizer setup: download the 'wordnet' package through NLTK,
# then import the WordNet corpus reader and the stemming/lemmatizing classes.
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/user2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [1]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Porter stemmer demo: show each word next to the stem the stemmer produces.
ps = PorterStemmer()
# Several spellings/inflections built on the same root.
words = ["program", "programs", "programer", "programing", "programers"]
for w, stem in zip(words, map(ps.stem, words)):
    print(w, " : ", stem)

program  :  program
programs  :  program
programer  :  program
programing  :  program
programers  :  program


In [2]:
# List of stop words: fetch NLTK's English stop-word list and print it in full.
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# Remove English stop words from a tokenized sentence.
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words("english"))
sentence = "Pokemon is one of the best known video games."

words = nltk.word_tokenize(sentence)
# The stop-word entries are lowercase, so compare the lowercased token;
# otherwise capitalised stop words (e.g. a sentence-initial "The") would be
# kept. The original casing of the surviving tokens is preserved.
without_stop_words = [word for word in words if word.lower() not in stop_words]

print(without_stop_words) #but this does not remove punctuations

['Pokemon', 'one', 'best', 'known', 'video', 'games', '.']


In [10]:
# Sample file with 4 movie reviews in it; each line becomes one document.
# The explicit encoding keeps decoding independent of the platform's default
# locale encoding.
# NOTE(review): assumes moviereview.txt is UTF-8 (the reviews shown are plain ASCII) — confirm.
with open("moviereview.txt", "r", encoding="utf-8") as file:
    documents = file.read().splitlines()
print(documents)

["I like this movie, it's funny.", 'I hate this movie.', 'This was awesome! I like it.', 'Nice one. I love it.']


In [11]:
# Create a bag-of-words model from the movie review lines.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Vectorizer with default settings; kept in a variable because later cells
# ask it for the feature names.
count_vectorizer = CountVectorizer()
# Fit on the review lines and build the document-term count matrix in one call.
bag_of_words = count_vectorizer.fit_transform(documents)

In [12]:
# Print the matrix returned by fit_transform; the output lists
# (row, column) index pairs together with the value stored at each position.
print(bag_of_words)

  (0, 1)	1
  (0, 3)	1
  (0, 6)	1
  (0, 9)	1
  (0, 4)	1
  (1, 2)	1
  (1, 6)	1
  (1, 9)	1
  (2, 0)	1
  (2, 10)	1
  (2, 3)	1
  (2, 9)	1
  (2, 4)	1
  (3, 5)	1
  (3, 8)	1
  (3, 7)	1
  (3, 3)	1


In [13]:
# Vocabulary terms learned by the count vectorizer (used as column labels below).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    # The newer method returns a NumPy array; convert it so a plain list is
    # printed, matching the older method's output.
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()
print(feature_names)

['awesome', 'funny', 'hate', 'it', 'like', 'love', 'movie', 'nice', 'one', 'this', 'was']


In [14]:
# Tabular view of the counts: densify the matrix and label its columns with
# the feature names.
bow_dense = bag_of_words.toarray()
pd.DataFrame(bow_dense, columns=feature_names)

Unnamed: 0,awesome,funny,hate,it,like,love,movie,nice,one,this,was
0,0,1,0,1,1,0,1,0,0,1,0
1,0,0,1,0,0,0,1,0,0,1,0
2,1,0,0,1,1,0,0,0,0,1,1
3,0,0,0,1,0,1,0,1,1,0,0


In [15]:
# Create a TF-IDF vectorizer and apply it to the same movie review documents.
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings; kept in a variable because a later cell
# asks it for the feature names.
tfidf_vectorizer = TfidfVectorizer()
# Fit on the review lines and build the TF-IDF weighted matrix in one call.
tf_idf_vector = tfidf_vectorizer.fit_transform(documents)

In [16]:
# Print the TF-IDF matrix; the output lists (row, column) index pairs
# together with the weight stored at each position.
print(tf_idf_vector)

  (0, 4)	0.4508517633446265
  (0, 9)	0.3650033586000619
  (0, 6)	0.4508517633446265
  (0, 3)	0.3650033586000619
  (0, 1)	0.5718482940425992
  (1, 9)	0.4480997313625986
  (1, 6)	0.5534923152870045
  (1, 2)	0.7020348194149619
  (2, 4)	0.4253047588435995
  (2, 9)	0.34432085671547236
  (2, 3)	0.34432085671547236
  (2, 10)	0.5394451581794177
  (2, 0)	0.5394451581794177
  (3, 3)	0.3457831381910465
  (3, 7)	0.5417361046803605
  (3, 8)	0.5417361046803605
  (3, 5)	0.5417361046803605


In [17]:
# Vocabulary terms learned by the TF-IDF vectorizer, used as column labels.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # The newer method returns a NumPy array; convert it to a plain list.
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
# Tabular view of the weights: one row per document, one column per term.
pd.DataFrame(tf_idf_vector.toarray(), columns = feature_names)

Unnamed: 0,awesome,funny,hate,it,like,love,movie,nice,one,this,was
0,0.0,0.571848,0.0,0.365003,0.450852,0.0,0.450852,0.0,0.0,0.365003,0.0
1,0.0,0.0,0.702035,0.0,0.0,0.0,0.553492,0.0,0.0,0.4481,0.0
2,0.539445,0.0,0.0,0.344321,0.425305,0.0,0.0,0.0,0.0,0.344321,0.539445
3,0.0,0.0,0.0,0.345783,0.0,0.541736,0.0,0.541736,0.541736,0.0,0.0


In [18]:
#textblob method for sentiment analysis
from textblob import TextBlob as tb

In [19]:
# Two sample feedback sentences to compare in the sentiment cells.
feedback1, feedback2 = (
    "The food at hotel was awesome",
    "The food at hotel was just fine",
)

In [20]:
# Wrap each feedback sentence in a TextBlob (imported as `tb`).
blob1, blob2 = (tb(feedback) for feedback in (feedback1, feedback2))

In [21]:
# Show the sentiment (polarity, subjectivity) of both blobs, one per line.
print(blob1.sentiment, blob2.sentiment, sep="\n")

Sentiment(polarity=1.0, subjectivity=1.0)
Sentiment(polarity=0.4166666666666667, subjectivity=0.5)
