In [None]:

# Part 01: Basic NLP tasks with NLTK


In [None]:
import nltk as nl

#nl.download()       # run this once, the first time, to download the data NLTK needs

In [None]:
# Sentence tokenization: split a passage into its individual sentences.

example_text = "The sky is pinkish-blue. You shouldn't eat cardboard."

sentences = nl.sent_tokenize(example_text)
print(sentences)

In [None]:
# Word tokenization: split a sentence into word and punctuation tokens.

example = 'The weather is great, and Python is awesome.'
tokenized_words = nl.word_tokenize(example)

print(tokenized_words)

In [None]:
# Part-of-speech tagging: pair every token from the word-tokenization
# step with its POS tag.

tagged_words = nl.pos_tag(tokenized_words)
print(tagged_words)

In [None]:
# POS meta-information: look up the Penn tag-set help entry for one tag.

tag_of_interest = 'NN'
nl.help.upenn_tagset(tag_of_interest)

In [None]:
# Chunking: run NLTK's named-entity chunker over the POS-tagged tokens.

chunked_data = nl.ne_chunk(tagged_words)
print(chunked_data)

# Draw the resulting tree.
chunked_data.draw()

In [None]:
# Show the parse tree of an already-parsed sentence
# (wsj: Wall Street Journal dataset).

from nltk.corpus import treebank

wsj_parsed_sentences = treebank.parsed_sents('wsj_0001.mrg')
tree = wsj_parsed_sentences[0]

print(tree)
tree.draw()

In [None]:
# Defining your own grammar: load it from the file mygmr.cfg.

grammar_path = 'mygmr.cfg'
grammar = nl.data.load(grammar_path)

# Display the loaded grammar as the cell output.
grammar

In [None]:
"""
grammar written in mygmr.cfg

S -> NP VP 
VP -> V NP| VP PP
PP -> P NP
NP -> DT N | DT N PP | 'I'
DT -> 'a' | 'the'
N -> 'man' | 'telescope'
V -> 'saw'
P -> 'with'
"""

In [None]:
# Parse a sentence using the grammar loaded from mygmr.cfg.

text44 = 'I saw the man with a telescope'

t_word2 = nl.word_tokenize(text44)
# A bare expression in the middle of a cell is discarded, so print the
# tokens explicitly to actually show them.
print(t_word2)

parser = nl.ChartParser(grammar)

trees = parser.parse_all(t_word2)

# parse_all returns a list that is empty when the grammar yields no parse;
# guard the index so that case reports a message instead of an IndexError.
if trees:
    trees[0].draw()
else:
    print('No parse found for:', text44)

In [None]:
# Removal of stop words

stop_words = set(nl.corpus.stopwords.words('english'))

word_tokens = nl.tokenize.word_tokenize(example)

# The stop-word list is lowercase, so compare on the lowercased token;
# otherwise capitalised stop words such as "The" slip through the filter.
# The surviving tokens keep their original casing.
filtered_words = [w for w in word_tokens if w.lower() not in stop_words]

print(stop_words)
print(filtered_words)

In [None]:
# Punctuation mark removal
p = [',', '?', '.']
word_tokens = filtered_words

# Keep every token that is not one of the punctuation marks listed in p.
filtered2 = [token for token in word_tokens if token not in p]

print(word_tokens)
print(filtered2)

In [None]:
# Stemming: reduce each word of two small word families to its Porter stem.

ps = nl.stem.PorterStemmer()
set1 = ["argue", "argued", "argues", "arguing"]
set2 = ["python", "pythoner", "pythoning", "pythoned"]

# One stem per line for the first family, a separator, then the second family.
print('\n'.join(ps.stem(word) for word in set1))
print('-------------------')
print('\n'.join(ps.stem(word) for word in set2))

In [None]:
# Lemmatization

# udhr -> Universal Declaration of Human Rights corpus
udhr_fileid = 'English-Latin1'
udhr = nl.corpus.udhr.words(udhr_fileid)

# Preview the first ten words.
udhr[:10]

In [None]:
# Stemming effect: apply the Porter stemmer to the first 20 UDHR words.

list(map(ps.stem, udhr[:20]))

In [None]:
# Lemmatization effect: apply the WordNet lemmatizer to the first 20 UDHR words.

wordnetlem = nl.WordNetLemmatizer()

list(map(wordnetlem.lemmatize, udhr[:20]))

In [None]:
#Part 02:
# Example problem: Sentiment Analysis from movie reviews


In [None]:
# Data set loading

def _read_reviews(path):
    """Return the reviews stored in ``path`` as a list of strings, one per line.

    The file is opened in text mode inside a ``with`` block so the handle is
    always closed, and so later processing receives real text rather than
    ``bytes`` (whose ``str()`` form is the ``b'...'`` repr).
    """
    # latin-1 maps every byte to a character, so decoding can never fail.
    # NOTE(review): assumes the rt-polarity files are Latin-1 encoded - confirm.
    with open(path, encoding='latin-1') as review_file:
        return [line.rstrip('\n') for line in review_file]


positive = _read_reviews('rt-polarity-pos.txt')

negative = _read_reviews('rt-polarity-neg.txt')

# Show five negative reviews from the start. Slicing a list (instead of calling
# readline() on an open file) leaves these reviews available to later cells.
for review in negative[:5]:
    print(review)


In [None]:
p_words = []   # tokens kept from positive reviews, for the frequency demonstration
n_words = []   # tokens kept from negative reviews, for the frequency demonstration

_STOP_WORDS = None   # English stop-word set, filled in on first use


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(nl.corpus.stopwords.words('english'))
    return _STOP_WORDS


# preprocessing
def remove_stop_words(w_token, flag):
    """Drop English stop words from a token list.

    Args:
        w_token: iterable of word tokens.
        flag: 1 records the surviving tokens in ``p_words``, 0 records them
            in ``n_words``; any other value (e.g. the test-time value 2)
            records nothing.

    Returns:
        A new list with the tokens that are not stop words, in input order.
    """
    stop_words = _english_stop_words()
    filtered_words = [tmp_word for tmp_word in w_token if tmp_word not in stop_words]

    # Only the two training flags feed the frequency lists; previously every
    # non-1 flag (including the test-time default) was appended to n_words.
    if flag == 1:
        p_words.extend(filtered_words)
    elif flag == 0:
        n_words.extend(filtered_words)

    return filtered_words
   
# flag is forwarded to remove_stop_words, which uses it to keep positive (1)
# and negative (0) tokens separately for the frequency demonstration.
# flag has default value = 2, which is the value intended for test-time calls.

# Tokens dropped as punctuation. "``" and "''" are included because the
# tokenizer is expected to emit them for double quotes.
# NOTE(review): confirm against the NLTK word_tokenize documentation.
_PUNCTUATION = frozenset([
    "''", '``', '""', '"', '\'', '&', ',', '?', '.', '!', ':', ';',
    ']', '[', '}', '{', '(', ')', '\\n',
])


def process_sentence(s, flag=2):
    """Turn one review into a bag-of-words feature dict.

    Args:
        s: the review, as ``str`` or ``bytes``.
        flag: passed through unchanged to ``remove_stop_words``.

    Returns:
        A dict mapping each remaining token to 1.
    """
    # str() on bytes returns the repr ("b'...\\n'"), which would glue "b'" onto
    # the first token; decode instead. latin-1 cannot fail on any byte value.
    if isinstance(s, (bytes, bytearray)):
        s = s.decode('latin-1')

    w_token = nl.tokenize.word_tokenize(str(s))

    t2 = [w for w in w_token if w not in _PUNCTUATION]
    t3 = remove_stop_words(t2, flag)
    return {word: 1 for word in t3}
    

# Build [features, label] pairs for every positive review.
positive_data_array = [
    [process_sentence(p_review.lower(), 1), 'pos'] for p_review in positive
]

# Do the same for the negative reviews, echoing the first few before and
# after processing so the effect of the preprocessing is visible.
preview_limit = 5
negative_data_array = []

for review_index, n_review in enumerate(negative):
    processed = [process_sentence(n_review.lower(), 0), 'neg']
    negative_data_array.append(processed)

    if review_index >= preview_limit:
        continue

    print('review before processing->')
    print(n_review)
    print('review after processing->')
    print(processed)
    print('-------------------------')
        


In [None]:
# Preview the positive word tokens. The list holds one entry per kept token
# across all positive reviews, so show its size and a short sample instead of
# dumping the whole list into the cell output.

print(len(p_words), 'positive tokens; first 50:')
print(p_words[:50])

In [None]:
# Preview the negative word tokens. The list holds one entry per kept token
# across all negative reviews, so show its size and a short sample instead of
# dumping the whole list into the cell output.

print(len(n_words), 'negative tokens; first 50:')
print(n_words[:50])

In [None]:
# Frequency distribution of the positive-review tokens.

dist1 = nl.FreqDist(p_words)

# Display the distribution as the cell output.
dist1

In [None]:
# Top 30 most frequent positive words, restricted to words longer than
# 5 characters that occur more than 50 times.

vocab_p = dist1.keys()

# keys() is not ordered by frequency, so slicing it would just give the first
# 30 qualifying words encountered. most_common() yields (word, count) pairs in
# descending count order, which makes the slice below a real "top 30".
freqwords1 = [w for w, count in dist1.most_common() if len(w) > 5 and count > 50]

freqwords1[:30]

In [None]:
# Frequency distribution of the negative-review tokens.

dist2 = nl.FreqDist(n_words)

# Display the distribution as the cell output.
dist2

In [None]:
# Top 30 most frequent negative words, restricted to words longer than
# 5 characters that occur more than 50 times.

vocab_n = dist2.keys()

# keys() is not ordered by frequency, so slicing it would just give the first
# 30 qualifying words encountered. most_common() yields (word, count) pairs in
# descending count order, which makes the slice below a real "top 30".
freqwords2 = [w for w, count in dist2.most_common() if len(w) > 5 and count > 50]

freqwords2[:30]

In [None]:
# Partition into training and test sets: the first 3000 reviews of each
# polarity go to training, everything after that to testing.

split_at = 3000

training_set = positive_data_array[:split_at] + negative_data_array[:split_at]
test_set = positive_data_array[split_at:] + negative_data_array[split_at:]


In [None]:
# Build the classifier from the training set, then report its accuracy on
# the test set.

classifier = nl.NaiveBayesClassifier.train(training_set)

test_accuracy = nl.classify.accuracy(classifier, test_set)
print(test_accuracy)

In [None]:
# Classify a new test instance.

first_example = 'this is great movie. mindblowing aminations'
first_example_features = process_sentence(first_example)

print(classifier.classify(first_example_features))


In [None]:
# Classify a second new test instance.

second_example = 'too much boaring movie. waste of money'
second_example_features = process_sentence(second_example)

print(classifier.classify(second_example_features))