# Nltk for NLP by www.youtube.com/sentdex



#### Video 1 - Tokenization


In [None]:

import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize 

example_text = "December, it unexpectedly headed southeast toward Futuna. The system peaked at Category 3 on 28 December, with sustained winds of around 150 km/h (90 mph). It turned southwest the next day, toward Fiji and several smaller islands in the Lau group. The storm dissipated on 5 January over the north Tasman Sea. Raja caused two deaths as it impacted the island nations of Tuvalu, Wallis and Futuna, Tonga and Fiji. Gusty winds and rough seas caused extensive damage to crops, coastal installations and buildings in Tuvalu, and greater destruction in Futuna. Raja was responsible for the worst flood of the Labasa River in Fiji since 1929."

# sent_tokenize(example_text) / word_tokenize(example_text) return the full
# lists; below, each word-level token is printed on its own line instead.
for token in word_tokenize(example_text):
    print(token)
    

#### Video 2 - Stopwords 


In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

example_sentence  = "This is an example showing off stop word filtraions."

# Keep the stop words in a set so each membership test below is O(1).
stop_words = set(stopwords.words("english"))

words = word_tokenize(example_sentence)

# The English stop-word list is lowercase, so the token is lowercased before
# the lookup; a case-sensitive test lets capitalised stop words such as
# "This" slip through the filter. The original token is what gets kept.
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print(filtered_sentence)


#### Video 3 - Stemming  


In [None]:
# Stemming: reduce inflected or derived words to a common root form

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
example_words = ["Pyhton", "Pythoner", "Pythoning", "Pythonly"]

new_text = "It is very import to be pythonly while you are pythoning the python . All pyhton have pyhton at least poorly"
words = word_tokenize(new_text)

# Print the Porter stem of every token in new_text, one per line.
for stem in map(ps.stem, words):
    print(stem)



#### Video 4 - Parts of speech tagging  


In [None]:
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer



"""
POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent\'s
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3rd person	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when

"""

# Two State of the Union addresses: one to build the tokenizer from, one to analyse.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Construct the Punkt tokenizer from the 2005 text, then split the 2006 text
# into sentences with it.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Print the part-of-speech tags of the first five sentences in ``tokenized``.

    Each sentence is word-tokenized and tagged with ``nltk.pos_tag``; the
    resulting list of (token, tag) pairs is printed. Any exception raised
    along the way is caught and only its message is printed.
    """
    try:
        for sentence in tokenized[:5]:
            print(nltk.pos_tag(nltk.word_tokenize(sentence)))
    except Exception as e:
        print(str(e))


process_content()

### video - 5  Chunking  

In [None]:
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer

# Source texts: the 2005 address seeds the tokenizer, the 2006 address is chunked.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Sentence-split the sample with a Punkt tokenizer built from the training text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Chunk the first five sentences of ``tokenized`` and print each parse tree.

    A chunk is any run of adverbs, followed by any run of verbs, followed by
    one or more proper nouns and an optional singular noun. Any exception
    raised while processing is caught and only its message is printed.
    """
    # `.?` after a tag prefix matches the tag and its one-letter variants
    # (VB, VBD, VBG, VBZ, ...). The grammar previously read `<VB,?>`, which
    # can only match "VB" or "VB,", so inflected verbs never joined a chunk.
    chunkGram = r""" Chunk: {<RB.?>*<VB.?>*<NNP.?>+<NN>?} """
    try:
        # The grammar does not change between sentences: build the parser once.
        chunkParser = nltk.RegexpParser(chunkGram)
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            # chunked.draw()  # uncomment to draw the chunk tree instead
    except Exception as e:
        print(str(e))


process_content()


### Video-6 Chinking 

In [None]:
# Chinking: chunk everything, then remove (chink) the tags we do not want from the chunks
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer

# 2005 address -> tokenizer training text; 2006 address -> text to chink.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Punkt tokenizer built from train_text, applied to sample_text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Chink the first five sentences of ``tokenized`` and draw each tree.

    Every sentence is POS-tagged and chunked as a whole; runs of verbs,
    prepositions, determiners and "to" are then chinked back out of the
    chunk. Any exception is caught and only its message is printed.
    """
    try:
        for sentence in tokenized[:5]:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

            # {...} chunks every tag; }...{ removes the listed tags again.
            chunkGram = r""" Chunk: {<.*>+}
                                     }<VB.?|IN|DT|TO>+{"""
            chunked = nltk.RegexpParser(chunkGram).parse(tagged)
            # Draw the tree; use print(chunked) for a text rendering instead.
            chunked.draw()
    except Exception as e:
        print(str(e))


process_content()



### video - 7 Named entity recognition

In [None]:
# Named entity recognition: find people, places, organisations, etc. in tagged text
import nltk 
from nltk.corpus import state_union
from nltk.tokenize  import PunktSentenceTokenizer

# Texts used in this cell: 2005 address for the tokenizer, 2006 address for NER.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Split the sample into sentences with a Punkt tokenizer built from train_text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    """Print the named-entity tree of the first five sentences in ``tokenized``.

    Each sentence is word-tokenized, POS-tagged and passed to
    ``nltk.ne_chunk`` with ``binary=True``. Any exception is caught and only
    its message is printed.
    """
    try:
        for sentence in tokenized[:5]:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

            # binary=True: entities are marked as entities only, without a
            # specific type such as PERSON or GPE.
            namedEnt = nltk.ne_chunk(tagged, binary=True)

            # namedEnt.draw() would draw the tree instead of printing it.
            print(namedEnt)
    except Exception as e:
        print(str(e))


process_content()


"""
Name Entity  Type and Examples
ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian
"""



### video-8 lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize with an explicit part of speech: "a" = adjective, "v" = verb.
for word, pos in (("better", "a"), ("best", "a"), ("ran", "v")):
    print(lemmatizer.lemmatize(word, pos))

# Without a part of speech the lemmatizer falls back to its default.
print(lemmatizer.lemmatize("better"))



###  video - 9 corpora

In [None]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

# Load the raw King James Bible text and split it into sentences.
sample = gutenberg.raw("bible-kjv.txt")
tok = sent_tokenize(sample)

# Show sentences 5 through 14 as a list.
print(tok[5:15])


### video - 10  wordnet 

In [None]:
# WordNet: look up synonyms, antonyms, definitions and usage examples for a word
from nltk.corpus import wordnet 
syns = wordnet.synsets("program")
first_synset = syns[0]

# The synset itself, the name of its first lemma, and its example sentences.
print(first_synset)
print(first_synset.lemmas()[0].name())
print(first_synset.examples())

# Gather every lemma name of every "good" synset as a synonym, and the first
# antonym of each lemma that has one.
synonyms = []
antonyms = []
for syn in wordnet.synsets("good"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# Sets drop the duplicates collected across synsets.
print(set(synonyms))
print(set(antonyms))

In [None]:
#####################  Semantic similarity #################
# Print the Wu-Palmer similarity for each pair of synsets, in order.
for first_name, second_name in (
    ("ship.n.01", "boat.n.01"),
    ("ship.n.01", "car.n.01"),
    ("ship.n.01", "cat.n.01"),
    ("lion.n.01", "lioness.n.01"),
):
    w1 = wordnet.synset(first_name)
    w2 = wordnet.synset(second_name)
    print(w1.wup_similarity(w2))


### Video 11  Text Classification

In [None]:
#creating our own alogrithms 

import nltk
import random 
from nltk.corpus import movie_reviews

# One (word list, category) pair per review file; used for training and testing.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

random.shuffle(documents)
print(documents[3])

# Frequency distribution over every lowercased word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# print (all_words.most_common(15))
#print (all_words["stupid"])

### video-12 Words  as Features for learning

In [None]:
#creating our own alogrithms 

import nltk
import random 
from nltk.corpus import movie_reviews

# One (word list, category) pair per review file; used for training and testing.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

random.shuffle(documents)

# Frequency distribution over every lowercased word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as features. Slicing all_words.keys()
# instead yields the first 3000 keys in the order they were first seen,
# which is not ordered by frequency.
word_features = [word for word, _count in all_words.most_common(3000)]

def find_features(document):
    """Map every word in ``word_features`` to whether it occurs in ``document``.

    ``document`` is any iterable of words; the result is a dict of
    ``word -> bool`` with one entry per feature word.
    """
    present = set(document)
    return {feature: feature in present for feature in word_features}

# Show the feature dictionary of one negative review.
sample_review = movie_reviews.words('neg/cv000_29416.txt')
print(find_features(sample_review))

# One (feature dict, category) pair per shuffled document.
featuresets = [(find_features(review), label) for review, label in documents]


###  Video 13  Naive Bayes 

In [None]:
#creating our own alogrithms 

import nltk
import random 
from nltk.corpus import movie_reviews

# One (word list, category) pair per review file; used for training and testing.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

random.shuffle(documents)

# Frequency distribution over every lowercased word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Feature vocabulary: the 3000 most frequent words. all_words.keys() is not
# ordered by frequency, so slicing it picks whichever words were seen first.
word_features = [word for word, _count in all_words.most_common(3000)]

def find_features(document):
    """Return ``{word: bool}`` telling which feature words appear in ``document``.

    The keys are exactly the entries of the module-level ``word_features``;
    ``document`` may be any iterable of words.
    """
    seen = set(document)
    return {feature: feature in seen for feature in word_features}

# Show the feature dictionary of one negative review.
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

# One (feature dict, category) pair per shuffled document.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Train on the first 1900 feature sets and evaluate on the remainder.
# Both sets used to be featuresets[:1900], so the classifier was scored on
# the data it was trained on and the reported accuracy was inflated.
training_set = featuresets[:1900]
test_set = featuresets[1900:]

# naive bayes ----> posterior = (prior occurrences x likelihood) / evidence
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive bayes alg accuracy percent: ", (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)


### video 14 Save Classifier with pickle 

In [None]:
#creating our own alogrithms 

import nltk
import random 
from nltk.corpus import movie_reviews
import pickle 

# One (word list, category) pair per review file; used for training and testing.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

random.shuffle(documents)

# Frequency distribution over every lowercased word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# NOTE(review): keys() is presumably in first-seen order rather than frequency
# order; most_common(3000) may be what was intended, but switching would
# require re-training and re-saving the pickled classifier - confirm.
word_features = list(all_words.keys())[:3000]

def find_features(document):
    """Build the boolean bag-of-words features for ``document``.

    Returns a dict with one key per entry of ``word_features``; the value is
    True when that word occurs anywhere in ``document``.
    """
    vocabulary = set(document)
    return {feature: feature in vocabulary for feature in word_features}

# Show the feature dictionary of one negative review.
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

# One (feature dict, category) pair per shuffled document.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Train on the first 1900 feature sets and evaluate on the remainder.
# Both sets used to be featuresets[:1900], which scored the classifier on
# its own training data.
training_set = featuresets[:1900]
test_set = featuresets[1900:]

# naive bayes ----> posterior = (prior occurrences x likelihood) / evidence
# classifier = nltk.NaiveBayesClassifier.train(training_set)

# Load the previously saved classifier. `with` closes the file even if
# unpickling fails.
# NOTE: unpickling executes code from the file - only load pickles you
# created yourself.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Naive bayes alg accuracy percent: ", (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)

####### saving theclassifier in  pickle of naive bayes algo ########
# save_classifier=open("naivebayes.pickle","wb")
# pickle.dump(classifier,save_classifier)
# save_classifier.close()

### Video 15 - Scikit-learn incorporation 

In [None]:
#creating our own alogrithms 

import nltk
import random 
from nltk.corpus import movie_reviews
import pickle 

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC,LinearSVC,NuSVC

# One (word list, category) pair per review file; used for training and testing.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

random.shuffle(documents)

# Frequency distribution over every lowercased word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# NOTE(review): keys() is presumably in first-seen order rather than frequency
# order; most_common(3000) may be what was intended, but the pickled
# classifier loaded later was built with the current vocabulary - confirm.
word_features = list(all_words.keys())[:3000]

def find_features(document):
    """Return a ``word -> bool`` dict over ``word_features`` for ``document``.

    A word maps to True when it is among the words of ``document`` (any
    iterable of words), otherwise to False.
    """
    doc_words = set(document)
    return {feature: feature in doc_words for feature in word_features}

#print ((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

# One (feature dict, category) pair per shuffled document.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# First 1900 feature sets for training, the rest for evaluation.
training_set = featuresets[:1900]
test_set = featuresets[1900:]


def _train_and_report(label, estimator):
    """Train a scikit-learn estimator through NLTK's wrapper and report it.

    Wraps ``estimator`` in ``SklearnClassifier``, trains it on
    ``training_set``, prints ``label`` followed by the accuracy on
    ``test_set`` as a percentage, and returns the trained wrapper.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(label, (nltk.classify.accuracy(wrapped, test_set)) * 100)
    return wrapped


#### naive bayes ----> posterior = (prior occurrences x likelihood) / evidence
# classifier = nltk.NaiveBayesClassifier.train(training_set)

# Load the previously saved NLTK classifier. `with` closes the file even if
# unpickling fails.
# NOTE: unpickling executes code from the file - only load pickles you
# created yourself.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original Naive bayes alg accuracy percent: ", (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)

# Train each scikit-learn model on the same split and print its accuracy.
# (GaussianNB was left disabled in the original notebook.)
MNB_classifier = _train_and_report("MNB classifier accuracy percent: ", MultinomialNB())
BernoulliNB_classifier = _train_and_report("BernoulliNB classifier accuracy percent: ", BernoulliNB())
LogisticRegression_classifier = _train_and_report("LogisticRegression_classifier accuracy percent: ", LogisticRegression())
SGDClassifier_classifier = _train_and_report("SGDClassifier classifier accuracy percent: ", SGDClassifier())
SVC_classifier = _train_and_report("SVC_classifier accuracy percent:", SVC())
LinearSVC_classifier = _train_and_report("LinearSVC classifier accuracy percent: ", LinearSVC())
NuSVC_classifier = _train_and_report("NuSVC classifier accuracy percent: ", NuSVC())


### video - 16  Combining algo with vote

In [None]:
#creating our own alogrithms 

import nltk
import random 
from nltk.corpus import movie_reviews
import pickle 
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC,LinearSVC,NuSVC


from nltk.classify import ClassifierI
from statistics import mode


####### defining class for adding voting classifier here ##################3
class VoteClassifier(ClassifierI):
    """Ensemble that labels a sample by majority vote of its member classifiers."""

    def __init__(self, *classifiers):
        # Members only need to provide ``classify(features)``.
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label that most member classifiers assign to ``features``."""
        ballots = [member.classify(features) for member in self._classifiers]
        return mode(ballots)

    def confidence(self, features):
        """Return the share of members (0 to 1) that voted for the winning label."""
        ballots = [member.classify(features) for member in self._classifiers]
        winner = mode(ballots)
        return ballots.count(winner) / len(ballots)


# One (word list, category) pair per review file; used for training and testing.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

random.shuffle(documents)

# Frequency distribution over every lowercased word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# NOTE(review): keys() is presumably in first-seen order rather than frequency
# order; most_common(3000) may be what was intended, but the pickled
# classifier loaded later was built with the current vocabulary - confirm.
word_features = list(all_words.keys())[:3000]

def find_features(document):
    """Mark which of the ``word_features`` occur in ``document``.

    Returns a dict keyed by every feature word, with True for words found in
    ``document`` (any iterable of words) and False for the rest.
    """
    contained = set(document)
    return {feature: feature in contained for feature in word_features}


#print ((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
# One (feature dict, category) pair per shuffled document.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# First 1900 feature sets for training, the rest for evaluation.
training_set = featuresets[:1900]
test_set = featuresets[1900:]


def _train_and_report(label, estimator):
    """Train a scikit-learn estimator through NLTK's wrapper and report it.

    Wraps ``estimator`` in ``SklearnClassifier``, trains it on
    ``training_set``, prints ``label`` followed by the accuracy on
    ``test_set`` as a percentage, and returns the trained wrapper.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(label, (nltk.classify.accuracy(wrapped, test_set)) * 100)
    return wrapped


#### naive bayes ----> posterior = (prior occurrences x likelihood) / evidence
# classifier = nltk.NaiveBayesClassifier.train(training_set)

# Load the previously saved NLTK classifier. `with` closes the file even if
# unpickling fails.
# NOTE: unpickling executes code from the file - only load pickles you
# created yourself.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original Naive bayes alg accuracy percent: ", (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)

################## training here using sklearn ##############################
# (GaussianNB and SVC were left disabled in the original notebook.)
MNB_classifier = _train_and_report("MNB classifier accuracy percent: ", MultinomialNB())
BernoulliNB_classifier = _train_and_report("BernoulliNB classifier accuracy percent: ", BernoulliNB())
LogisticRegression_classifier = _train_and_report("LogisticRegression_classifier accuracy percent: ", LogisticRegression())
SGDClassifier_classifier = _train_and_report("SGDClassifier classifier accuracy percent: ", SGDClassifier())
LinearSVC_classifier = _train_and_report("LinearSVC classifier accuracy percent: ", LinearSVC())
NuSVC_classifier = _train_and_report("NuSVC classifier accuracy percent: ", NuSVC())

########### printing the voting classifier accuracy here ###########
# Seven voters over two categories, so a vote cannot end in a tie.
voted_classifier = VoteClassifier(
    classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
    LinearSVC_classifier,
    NuSVC_classifier,
    MNB_classifier,
    BernoulliNB_classifier,
)

print("Voted_classifier accuracy percent: ", (nltk.classify.accuracy(voted_classifier, test_set)) * 100)

# Show the ensemble's label and vote share for the first six test documents.
for index in range(6):
    sample_features = test_set[index][0]
    print("Classification: ", voted_classifier.classify(sample_features), "Confidence %: ", voted_classifier.confidence(sample_features) * 100)




###  Video - 17  Investigating bayes 

In [13]:
# import numpy as np

# #generating the random number size=100 
# random_number = np.random.randint(10, size=(1, 100))
# #for i in random_number: 
#     #print(i, end="")


# for i in random_number: 
#     #print(i, end="")
#     for num in i: 
#         # checking condition 
#         if num % 2 == 0: 
#            print( "0", end = "")
#         else:
#             print("1",end = "")



In [15]:
import string
import random

#Taking the input for the pattern of length 4 of where, odd = 1 and even = 0  
val = input("enter the 4 digit pattern 0 & 1 only  = ") 



#random string generating with size = 100
def Random_Generator(size=100, chars=string.digits):
    """Return a string of ``size`` characters, each picked at random from ``chars``.

    Characters are drawn independently with ``random.choice``, so repeats are
    possible. With the defaults the result is 100 decimal digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return "".join(picked)
s = Random_Generator()


# Replace every digit of the random string by its parity:
# "1" for an odd digit, "0" for an even one.
input_number = [str(int(digit) % 2) for digit in s]
output = "".join(input_number)

############# Output for analysis #################################
# NOTE(review): the 4-digit pattern read into `val` is not used anywhere in
# this cell - confirm whether a pattern search was meant to follow.
# The random 100-digit string, then its odd/even encoding.
print("Random  " + s)
print("Output  " + output)





 

            



enter the 4 digit pattern 0 & 1 only  = 1001
Random  9109162638538293828692270740214472503745984043032273593807005882684341408949657695003517577378586228
Output  1101100010110011000010010100010010101101100001010011111001001000000101000101011011001111111110100000
