In [1]:
import nltk
# nltk.download()

In [2]:
from nltk import sent_tokenize,word_tokenize

In [3]:
example_text='''
Hello mr. Smith, how are you? The weather is great and python is awesome. sky is clear blue.'''

# using simple split command won't help here.for sentence tokenisation if you split using '.' it would create problem at 'mr.'
# also spliting sentence in word is less effective and creates garbage data.

In [4]:
print(sent_tokenize(example_text))

['\nHello mr. Smith, how are you?', 'The weather is great and python is awesome.', 'sky is clear blue.']


In [5]:
print(word_tokenize(example_text))

['Hello', 'mr.', 'Smith', ',', 'how', 'are', 'you', '?', 'The', 'weather', 'is', 'great', 'and', 'python', 'is', 'awesome', '.', 'sky', 'is', 'clear', 'blue', '.']


In [6]:
from nltk.corpus import stopwords

In [7]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>


False

In [8]:
stop_words=set(stopwords.words('english'))
print(stop_words)

{"aren't", 'him', 'our', 'being', 'below', 'the', 'own', 'this', 'out', 'after', 'her', 'too', 'above', 'yourselves', 'to', 'does', 'why', 'couldn', "couldn't", 'that', 'or', 'weren', 'further', "you'll", 'was', 'ours', 'more', 'all', 'because', 'such', 'wouldn', 'isn', 'how', 'haven', 'once', 'when', 'hers', "don't", 'ain', 'doesn', "you're", 'itself', 'these', 'until', 'what', 'my', 'before', "that'll", 'ourselves', 'we', 've', 'between', "isn't", 'during', 'most', 'under', 'same', 'for', 'but', 'where', 'an', 'be', 'than', 'll', "mustn't", 'm', 'having', "shouldn't", 'your', 'at', 'there', 'as', 'wasn', 'doing', 'd', 'just', 'hasn', 'shan', 'themselves', 'will', 'have', 'its', "you'd", 'now', 'ma', 'herself', 'so', 'aren', 'did', 'by', 'any', "shan't", 'down', "wasn't", 'over', 'up', 'few', 'should', "doesn't", 'mustn', "wouldn't", 't', 'their', "you've", 'himself', 'some', 'theirs', 'they', 'mightn', "weren't", "haven't", 'with', 'myself', 'needn', 'them', 'yourself', 'who', 'in', 

In [9]:
words=word_tokenize(example_text)

In [10]:
filtered_words=[]
for w in words:
    if w not in stop_words:
        filtered_words.append(w)
print(filtered_words)

['Hello', 'mr.', 'Smith', ',', '?', 'The', 'weather', 'great', 'python', 'awesome', '.', 'sky', 'clear', 'blue', '.']


In [11]:
filtered_words=[w for w in words if w.lower() not in stop_words] # in one line
print(filtered_words)

['Hello', 'mr.', 'Smith', ',', '?', 'weather', 'great', 'python', 'awesome', '.', 'sky', 'clear', 'blue', '.']


In [12]:
from nltk.stem import PorterStemmer,SnowballStemmer

In [13]:
sample_words=['play','playing','played','player','playful','playfully']

In [14]:
ps=PorterStemmer()
for w in sample_words:
    print(ps.stem(w))

play
play
play
player
play
play


In [15]:
ss=SnowballStemmer('english')
for w in sample_words:
    print(ss.stem(w))

play
play
play
player
play
play


In [16]:
# nltk.download('wordnet')

In [17]:
# nltk.download('omw-1.4')

## Lemmatizer

In [18]:
from nltk.stem import WordNetLemmatizer

In [19]:
wnl=WordNetLemmatizer()

In [20]:
for w in sample_words:
    print(wnl.lemmatize(w))

play
playing
played
player
playful
playfully


It does not changed the words beacuse during lemmatization we have to pass Part of Speech(pos) in the argument.noun is a default pos in lemmatization. Here is a example.

In [21]:
for w in sample_words:
    print(wnl.lemmatize(w,pos="v"))

play
play
play
player
playful
playfully


In [22]:
print(wnl.lemmatize('cacti'))
print(wnl.lemmatize('cats'))
print(wnl.lemmatize('geese'))
print(wnl.lemmatize('better',"a"))
# unlike stemming it replaces the old word with new meaningful related word 

cactus
cat
goose
good


## Part of Speech tagging

In [23]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [24]:
train_text=state_union.raw("2005-GWBush.txt")
sample_text=state_union.raw("2006-GWBush.txt")

In [25]:
print(train_text)

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq. (Applause.) 

Two weeks ago, I stood on the steps of this Capitol and renewed the commitment of our nation to the guiding ideal of liberty for all. This evening I will set forth policies to advance that ideal at home and around the world. 

Tonight, with a healthy, growing economy, with more Americans going back to work, with our nation an active force for good in the world -- the state of our union is confident and strong. (Applause.) 

Our generati

In [26]:
custom_sent_tokenizer=PunktSentenceTokenizer(train_text)
tokenized=custom_sent_tokenizer.tokenize(sample_text)

In [27]:
def process_content():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

In [28]:
process_content()[1:3]

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

[('The', 'DT'), ('road', 'NN'), ('of', 'IN'), ('victory', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('road', 'NN'), ('that', 'WDT'), ('will', 'MD'), ('take', 'VB'), ('our', 'PRP$'), ('troops', 'NNS'), ('home', 'NN'), ('.', '.')]
[('As', 'IN'), ('we', 'PRP'), ('make', 'VBP'), ('progress', 'NN'), ('on', 'IN'), ('the', 'DT'), ('ground', 'NN'), (',', ','), ('and', 'CC'), ('Iraqi', 'NNP'), ('forces', 'NNS'), ('increasingly', 'RB'), ('take', 'VBP'), ('the', 'DT'), ('lead', 'NN'), (',', ','), ('we', 'PRP'), ('should', 'MD'), ('be', 'VB'), ('able', 'JJ'), ('to', 'TO'), ('further', 'JJ'), ('decrease', 'VB'), ('our', 'PRP$'), ('troop', 'NN'), ('levels', 'NNS'), ('--', ':'), ('but', 'CC'), ('those', 'DT'), ('decisions', 'NNS'), ('will', 'MD'), ('be', 'VB'), ('made', 'VBN'), ('by', 'IN'), ('our', 'PRP$'), ('military', 'JJ'), ('commanders', 'NNS'), (',', ','), ('not', 'RB'), ('by', 'IN'), ('politicians', 'NNS'), ('in', 'IN'), ('Washington', 'NNP'), (',', ','), ('D.C', 'NNP'), ('.', '.')]
[('(', '('), ('

[('It', 'PRP'), ('is', 'VBZ'), ('said', 'VBD'), ('that', 'IN'), ('prior', 'JJ'), ('to', 'TO'), ('the', 'DT'), ('attacks', 'NNS'), ('of', 'IN'), ('September', 'NNP'), ('the', 'DT'), ('11th', 'CD'), (',', ','), ('our', 'PRP$'), ('government', 'NN'), ('failed', 'VBD'), ('to', 'TO'), ('connect', 'VB'), ('the', 'DT'), ('dots', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('conspiracy', 'NN'), ('.', '.')]
[('We', 'PRP'), ('now', 'RB'), ('know', 'VBP'), ('that', 'IN'), ('two', 'CD'), ('of', 'IN'), ('the', 'DT'), ('hijackers', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('placed', 'VBD'), ('telephone', 'NN'), ('calls', 'NNS'), ('to', 'TO'), ('al', 'VB'), ('Qaeda', 'NNP'), ('operatives', 'VBZ'), ('overseas', 'RB'), ('.', '.')]
[('But', 'CC'), ('we', 'PRP'), ('did', 'VBD'), ('not', 'RB'), ('know', 'VB'), ('about', 'IN'), ('their', 'PRP$'), ('plans', 'NNS'), ('until', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('too', 'RB'), ('late', 'JJ'), ('.', '.')]
[('So', 'RB'), ('to', 'TO

[('The', 'DT'), ('retirement', 'NN'), ('of', 'IN'), ('the', 'DT'), ('baby', 'NN'), ('boom', 'NN'), ('generation', 'NN'), ('will', 'MD'), ('put', 'VB'), ('unprecedented', 'JJ'), ('strains', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('federal', 'JJ'), ('government', 'NN'), ('.', '.')]
[('By', 'IN'), ('2030', 'CD'), (',', ','), ('spending', 'VBG'), ('for', 'IN'), ('Social', 'NNP'), ('Security', 'NNP'), (',', ','), ('Medicare', 'NNP'), ('and', 'CC'), ('Medicaid', 'NNP'), ('alone', 'RB'), ('will', 'MD'), ('be', 'VB'), ('almost', 'RB'), ('60', 'CD'), ('percent', 'NN'), ('of', 'IN'), ('the', 'DT'), ('entire', 'JJ'), ('federal', 'JJ'), ('budget', 'NN'), ('.', '.')]
[('And', 'CC'), ('that', 'DT'), ('will', 'MD'), ('present', 'VB'), ('future', 'JJ'), ('Congresses', 'NNS'), ('with', 'IN'), ('impossible', 'JJ'), ('choices', 'NNS'), ('--', ':'), ('staggering', 'VBG'), ('tax', 'NN'), ('increases', 'NNS'), (',', ','), ('immense', 'JJ'), ('deficits', 'NNS'), (',', ','), ('or', 'CC'), ('deep', 'JJ'), ('cuts

[('There', 'EX'), ('are', 'VBP'), ('fewer', 'JJR'), ('abortions', 'NNS'), ('in', 'IN'), ('America', 'NNP'), ('than', 'IN'), ('at', 'IN'), ('any', 'DT'), ('point', 'NN'), ('in', 'IN'), ('the', 'DT'), ('last', 'JJ'), ('three', 'CD'), ('decades', 'NNS'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('number', 'NN'), ('of', 'IN'), ('children', 'NNS'), ('born', 'VBN'), ('to', 'TO'), ('teenage', 'VB'), ('mothers', 'NNS'), ('has', 'VBZ'), ('been', 'VBN'), ('falling', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('dozen', 'NN'), ('years', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('row', 'NN'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]
[('These', 'DT'), ('gains', 'NNS'), ('are', 'VBP'), ('evidence', 'NN'), ('of', 'IN'), ('a', 'DT'), ('quiet', 'JJ'), ('transformation', 'NN'), ('--', ':'), ('a', 'DT'), ('revolution', 'NN'), ('of', 'IN'), ('conscience', 'NN'), (',', ','), ('in', 'IN'), ('which', 'WDT'), ('a', 'DT'), ('rising', 'VBG'), ('generation', 'NN'), ('is', 'VBZ'), ('finding', 'VBG

TypeError: 'NoneType' object is not subscriptable

## chunking¶

In [None]:
def chunk_it():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            
            chunkgram=r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} """ 
            ChunkParser=nltk.RegexpParser(chunkgram)
            chunked=ChunkParser.parse(tagged)
            chunked.draw()
            
    except Exception as e:
        print(str(e))

In [None]:
# chunk_it()

## chinking

In [None]:
''' chinking is opposite of chunking in which we exclude the part we don't want from the chunks. '<.*>+' chunks everything 
chinking expression is defined by } { braces'''

In [None]:
def chink_it():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            # in chunkgram expression you can use any meaningful word in place og chunk
            chunkgram=r"""Chunk: {<.*>+} 
                        }<VB.?|IN|DT>+{""" 
            ChunkParser=nltk.RegexpParser(chunkgram)
            chunked=ChunkParser.parse(tagged)
            chunked.draw()
            
    except Exception as e:
        print(str(e))

In [None]:
# chink_it()

## named entity recognition

In [None]:
def named_entity():
    try:
        for i in tokenized:
            words=nltk.word_tokenize(i)
            tagged=nltk.pos_tag(words)
            
            namedEnt=nltk.ne_chunk(tagged)
            namedEnt.draw()
            
    except Exception as e:
        print(str(e))

In [None]:
# named_entity()

## accessing the corpora

In [29]:
nltk.__file__

'C:\\Users\\DELL\\Anaconda3\\lib\\site-packages\\nltk\\__init__.py'

In [30]:
from nltk.corpus import indian

In [31]:
print(indian.raw("README"))

Indian Language POS-Tagged Corpus
Collected by A Kumaran, Microsoft Research, India

Distributed with permission

Contents:
- Bangla: IIT Kharagpur
- Hindi: Microsoft Research India
- Marathi: IIT Bombay
- Telugu: IIIT Hyderabad


// Tagset developed at IIIT - Hyderabad after consultations with      
// several institutions through two workshops. 
 
 
A Part of Speech Tagger for Indian 
Languages (POS tagger) 
 
 
Introduction: 
 
The significance of large annotated corpora in the present day NLP is 
widely known. Annotated corpora serve as an important tool for 
investigators of natural language processing, speech recognition and 
other related areas. It proves to be a basic building block for 
constructing statistical models for automatic processing of natural 
languages.  
 
Many such corpora are available for languages across the world and have 
proved to be a useful step towards natural language processing. 
 
Looking at the scenario for Indian languages, not much work has been 
c

In [32]:
print(indian.raw("marathi.pos"))

<Corpora type="Monolingual-POS-TAGGED" Language="MARATHI">
<Sentence id=1>
''_SYM सनातनवाद्यांनी_NN व_CC प्रतिगाम्यांनी_NN समाज_NN रसातळाला_NN नेला_VM असताना_VAUX या_DEM अंधारात_NN बाळशास्त्री_NNPC जांभेकर_NNP यांनी_PRP 'दर्पण'च्या_NNP माध्यमातून_NN पहिली_QO ज्ञानज्योत_NN तेववली_VM ,_SYM ''_SYM असे_DEM प्रतिपादन_NN नटसम्राट_NNPC प्रभाकर_NNPC पणशीकर_NNP यांनी_PRP केले_VM ._SYM 
</Sentence>
<Sentence id=2>
दर्पणकार_JJ बाळशास्त्री_NNPC जांभेकर_NNP यांच्या_PRP १९५व्या_QC जयंतीनिमित्त_NN महाराष्ट्र_NNPC संपादक_NNPC परिषद_NNP व_CC सिंधुदुर्ग_NNPC जिल्हा_NNPC मराठी_NNPC पत्रकार_NNPC संघाच्या_NNP वतीने_NN तसेच_PRP महाराष्ट्र_NNPC जर्नलिस्ट_NNPC फाउंडेशन_NNP व_CC महाराष्ट्र_NNPC ग्रामीण_NNPC पत्रकार_NNPC संघाच्या_NNP सहभागाने_NN अभिवादन_NN कार्यक्रम_NN आयोजित_JJ केला_VM होता_VAUX ._SYM 
</Sentence>
<Sentence id=3>
महाराष्ट्र_NNPC संपादक_NNPC परिषदेचे_NNP कार्याध्यक्ष_NN यशवंत_NNPC पाध्ये_NNP ,_SYM जिल्हा_NNPC पत्रकार_NNPC संघाचे_NNP अध्यक्ष_NN शशी_NNPC सावंत_NNP यावेळी_PRP उपस्थित_JJ हो

## wordnet

In [33]:
from nltk.corpus import wordnet

In [34]:
syns =wordnet.synsets("program")

In [35]:
print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [36]:
#synset
print(syns[0].name())

plan.n.01


In [37]:
#just a word
print(syns[5].lemmas())
print(syns[0].lemmas()[0].name())

[Lemma('course_of_study.n.01.course_of_study'), Lemma('course_of_study.n.01.program'), Lemma('course_of_study.n.01.programme'), Lemma('course_of_study.n.01.curriculum'), Lemma('course_of_study.n.01.syllabus')]
plan


In [38]:
# definition
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [39]:
# example
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [40]:
synonyms=[]
antonyms=[]
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(set(synonyms),set(antonyms),sep='\n\n')

{'good', 'honorable', 'practiced', 'dear', 'dependable', 'adept', 'proficient', 'trade_good', 'safe', 'well', 'unspoilt', 'secure', 'skillful', 'salutary', 'ripe', 'serious', 'upright', 'in_effect', 'goodness', 'effective', 'near', 'sound', 'honest', 'skilful', 'thoroughly', 'soundly', 'commodity', 'just', 'estimable', 'full', 'respectable', 'in_force', 'beneficial', 'undecomposed', 'right', 'expert', 'unspoiled'}

{'evilness', 'badness', 'evil', 'ill', 'bad'}


In [41]:
# similarity check
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")
print(w1.wup_similarity(w2))

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w2))

0.9090909090909091
0.6956521739130435
0.32


## text classification

In [42]:
import random
from nltk.corpus import movie_reviews

In [43]:
documents=[(list(movie_reviews.words(fileid)), category)
           for category in movie_reviews.categories()
          for fileid in movie_reviews.fileids(category)]

In [44]:
random.shuffle(documents)

In [45]:
print(documents[1])

(['synopsis', ':', 'a', 'novelist', 'struggling', 'with', 'his', 'latest', 'work', 'buys', 'a', 'weird', 'brain', 'with', 'a', 'protruding', 'eyeball', 'encased', 'in', 'a', 'jar', '.', 'the', 'brain', 'exerts', 'its', 'evil', 'influence', 'upon', 'the', 'novelist', 'and', 'his', 'secretary', ',', 'while', 'his', 'wife', 'disapproves', '.', 'meanwhile', ',', 'a', 'loan', 'shark', 'in', 'need', 'of', 'a', 'shave', 'tries', 'to', 'leave', 'his', 'profession', '.', 'comments', ':', 'why', 'is', 'this', 'movie', 'called', 'possessed', 'by', 'the', 'night', '?', 'it', "'", 's', 'hard', 'to', 'speculate', '.', 'most', 'of', 'this', 'film', 'takes', 'place', 'during', 'the', 'day', ',', 'and', 'the', 'only', 'thing', 'possessing', 'anyone', 'is', 'an', 'icky', ',', 'pulsating', ',', 'bubbling', 'brain', 'thingie', 'in', 'a', 'jar', '.', 'in', 'case', 'you', 'haven', "'", 't', 'picked', 'up', 'on', 'this', 'yet', ',', 'possessed', 'by', 'the', 'night', 'is', 'a', 'bottom', '-', 'of', '-', 'the

In [46]:
all_words=[]
for w in movie_reviews.words():
    all_words.append(w.lower())

In [47]:
freq=nltk.FreqDist(all_words)
print(freq)

<FreqDist with 39768 samples and 1583820 outcomes>


In [48]:
print(freq.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [49]:
print(freq["stupid"])

253


In [50]:
word_features=list(freq.keys())[:3000]

In [51]:
def find_features(document):
    words=set(document)
    features={}
    for w in word_features:
        features[w]= (w in words)
    return features

In [52]:
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))



In [53]:
featuresets=[(find_features(rev),category) for (rev,category) in documents]

In [54]:
len(featuresets)

2000

## modelling

In [55]:
training_set=featuresets[:1800]
testing_set=featuresets[1800:]

In [56]:
classifier=nltk.NaiveBayesClassifier.train(training_set)
print("Naive bayes classifier accuracy: ", nltk.classify.accuracy(classifier, testing_set)*100)
classifier.show_most_informative_features(15)

Naive bayes classifier accuracy:  84.0
Most Informative Features
                  annual = True              pos : neg    =      9.2 : 1.0
                 frances = True              pos : neg    =      9.2 : 1.0
                   sucks = True              neg : pos    =      8.8 : 1.0
               unlikable = True              neg : pos    =      8.2 : 1.0
                  welles = True              neg : pos    =      8.2 : 1.0
             silverstone = True              neg : pos    =      7.5 : 1.0
                  shoddy = True              neg : pos    =      6.9 : 1.0
           unimaginative = True              neg : pos    =      6.9 : 1.0
                 cunning = True              pos : neg    =      6.4 : 1.0
                 idiotic = True              neg : pos    =      6.4 : 1.0
                   kudos = True              pos : neg    =      6.3 : 1.0
                 kidding = True              neg : pos    =      6.2 : 1.0
                    mena = True    

## pickling

In [57]:
# import pickle

In [58]:
# save_classifier=open("naivebayes.pickle","wb")
# pickle.dump(classifier,save_classifier)
# save_classifier.close()

In [59]:
# classifier_f = open("naivebayes.pickle","rb")
# classifier=pickle.load(classifier_f)
# classifier_f.close()
# classifier=nltk.NaiveBayesClassifier.train(training_set)
# print("Naive bayes classifier accuracy: ", nltk.classify.accuracy(classifier, testing_set)*100)
# classifier.show_most_informative_features(15)

NameError: name 'pickle' is not defined

## Using Sklearn algorithms

In [60]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [61]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC,NuSVC,LinearSVC

In [62]:
MNB_classifier=SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percentage : ", nltk.classify.accuracy(MNB_classifier, testing_set)*100)

MultinomialNB accuracy percentage :  83.5


In [63]:
BNB_classifier=SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percentage : ", nltk.classify.accuracy(BNB_classifier, testing_set)*100)

BernoulliNB accuracy percentage :  84.0


In [64]:
LogisticRegression_classifier=SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression accuracy percentage : ", nltk.classify.accuracy(LogisticRegression_classifier, testing_set)*100)

SGDClassifier_classifier=SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier accuracy percentage : ", nltk.classify.accuracy(SGDClassifier_classifier, testing_set)*100)

SVC_classifier=SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC accuracy percentage : ", nltk.classify.accuracy(SVC_classifier, testing_set)*100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression accuracy percentage :  79.0
SGDClassifier accuracy percentage :  76.5
SVC accuracy percentage :  84.0


In [65]:
NuSVC,LinearSVC
NuSVC_classifier=SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC accuracy percentage : ", nltk.classify.accuracy(NuSVC_classifier, testing_set)*100)

LinearSVC_classifier=SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC accuracy percentage : ", nltk.classify.accuracy(LinearSVC_classifier, testing_set)*100)

NuSVC accuracy percentage :  82.5
LinearSVC accuracy percentage :  77.0


## voting system for best accuracy and reliablity

In [75]:
from nltk.classify import ClassifierI
from statistics import mode

ImportError: cannot import name 'multimode' from 'statistics' (C:\Users\DELL\Anaconda3\lib\statistics.py)

In [69]:
class Voteclassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers=classifiers
        
    def classify(self, features):
        votes=[]
        for c in self._classifiers:
            v=c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidance(self, features):
        votes=[]
        for c in self._classifiers:
            v=c.classify(features)
            votes.append(v)
        confi=(votes.count(mode(votes))/len(votes))
        return confi

In [76]:
voted_classifier= Voteclassifier(classifier,
                                 MNB_classifier,
                                 BNB_classifier,
                                 LogisticRegression_classifier,
                                 SGDClassifier_classifier,
                                 NuSVC_classifier,LinearSVC_classifier)

In [77]:
print("voted_classifier accuracy percentage : ", nltk.classify.accuracy(voted_classifier, testing_set)*100)

voted_classifier accuracy percentage :  83.5


In [82]:
print('classification: ', voted_classifier.classify(testing_set[0][0]) , 'confidance % : ', voted_classifier.confidance(testing_set[0][0]))

classification:  neg confidance % :  1.0


In [83]:
print('classification: ', voted_classifier.classify(testing_set[1][0]) , 'confidance % : ', voted_classifier.confidance(testing_set[1][0]))
print('classification: ', voted_classifier.classify(testing_set[2][0]) , 'confidance % : ', voted_classifier.confidance(testing_set[2][0]))
print('classification: ', voted_classifier.classify(testing_set[3][0]) , 'confidance % : ', voted_classifier.confidance(testing_set[3][0]))
print('classification: ', voted_classifier.classify(testing_set[4][0]) , 'confidance % : ', voted_classifier.confidance(testing_set[4][0]))
print('classification: ', voted_classifier.classify(testing_set[5][0]) , 'confidance % : ', voted_classifier.confidance(testing_set[5][0]))

classification:  pos confidance % :  1.0
classification:  pos confidance % :  1.0
classification:  pos confidance % :  1.0
classification:  pos confidance % :  1.0
classification:  neg confidance % :  0.5714285714285714


## check for bias by not shuffling data during text classification step

In [86]:
# first 1000 data is negative review and next 1000 is positive test data for 100 positive and negative data