<a href="https://colab.research.google.com/github/rahulkhankar/SelfProject/blob/master/Sentiment_analysis_using_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk

In [5]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [6]:
from nltk.corpus import twitter_samples

This will import three datasets from NLTK that contain various tweets to train and test the model:

negative_tweets.json: 5000 tweets with negative sentiments 

positive_tweets.json: 5000 tweets with positive sentiments 

tweets.20150430-223406.json: 20000 tweets with no sentiments

In [7]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [8]:
#positive_tweets[0]

In [9]:
#negative_tweets[0]

In [10]:
#nltk.download('punkt')

The punkt module is a pre-trained model that will help tokenize words and sentences.

# Tokenization

In [11]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [12]:
print(tweet_tokens[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


# Normalization

In [13]:
#nltk.download('wordnet')

In [14]:
#nltk.download('averaged_perceptron_tagger')

wordnet is a lexical database for the English language that helps the script determine the base word

averaged_perceptron_tagger resource to determine the context of a word in a sentence.

In [15]:
from nltk.tag import pos_tag

In [33]:
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


pos_tag function provide a list of tokens as an argument to get the tags

CC	Coordinating conjunction

CD	Cardinal number

DT	Determiner

EX	Existential there

FW	Foreign word

IN	Preposition or subordinating conjunction

JJ	Adjective

JJR	Adjective, comparative

JJS	Adjective, superlative

1LS	List item marker

MD	Modal

NN	Noun, singular or mass

NNS	Noun, plural

NNP	Proper noun, singular

NNPS	Proper noun, plural

PDT	Predeterminer

POS	Possessive ending

PRP	Personal pronoun

PRP$	Possessive pronoun

RB	Adverb

RBR	Adverb, comparative

RBS	Adverb, superlative

RP	Particle

SYM	Symbol

TO	to

UH	Interjection

VB	Verb, base form

VBD	Verb, past tense

VBG	Verb, gerund or present participle

VBN	Verb, past participle

VBP	Verb, non-3rd person singular present

VBZ	Verb, 3rd person singular present

WDT	Wh-determiner

WP	Wh-pronoun

WP$	Possessive wh-pronoun

WRB	Wh-adverb

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

Sometimes, the same word can have a multiple lemmas based on the meaning / context. hence we will also incorporate part of speech when lemmatizing sentence

In [None]:
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return (lemmatized_sentence)

print(lemmatize_sentence(tweet_tokens[0]))

# Removing Noise from the Data

Hyperlinks, Twitter usernames are preceded by a @ symbol, Punctuation and special characters
        

In [17]:
import re, string

In [35]:
from nltk.stem import WordNetLemmatizer 

In [36]:
def remove_noise(tweet_tokens, stop_words = ()):
    
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)  #This is for Hyperlinks
        token = re.sub("(@[A-Za-z0-9_]+)","", token)               #This is for usernames staring with @
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:  #remove punctuation using the library string
            cleaned_tokens.append(token.lower())
        
    return (cleaned_tokens)

In [37]:
#nltk.download('stopwords')

In [38]:
#from nltk.corpus import stopwords
#stop_words = stopwords.words('english')

In [39]:
#print(stop_words)

In [43]:
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [44]:
stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before','to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than','s', 't', 'can', 'will', 'just', 'don', "don't", 'should','now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'ma', "weren't", 'won']

In [45]:
#nltk.download('averaged_perceptron_tagger')

In [46]:
print(remove_noise(tweet_tokens[0], stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [47]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

In [48]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

In [49]:
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [50]:
print('Original:',positive_tweet_tokens[500])
print('\n')
print('Cleaned:',positive_cleaned_tokens_list[500])

Original: ['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']


Cleaned: ['dang', 'rad', '#fanart', ':d']


In [51]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token



In [52]:
all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [53]:
all_neg_words = get_all_words(negative_cleaned_tokens_list)

In [54]:
positive_words=[]
for i in all_pos_words:
    positive_words.append(i)

In [55]:
negative_words=[]
for i in all_neg_words:
    negative_words.append(i)

In [56]:
positive_words

['#followfriday',
 'top',
 'engage',
 'member',
 'community',
 'week',
 ':)',
 'hey',
 'james',
 'odd',
 ':/',
 'please',
 'call',
 'contact',
 'centre',
 '02392441234',
 'able',
 'assist',
 ':)',
 'many',
 'thanks',
 'listen',
 'last',
 'night',
 ':)',
 'bleed',
 'amazing',
 'track',
 'scotland',
 'congrats',
 ':)',
 'yeaaaah',
 'yippppy',
 'accnt',
 'verify',
 'rqst',
 'succeed',
 'get',
 'blue',
 'tick',
 'mark',
 'fb',
 'profile',
 ':)',
 '15',
 'day',
 'one',
 'irresistible',
 ':)',
 '#flipkartfashionfriday',
 'like',
 'keep',
 'lovely',
 'customer',
 'wait',
 'long',
 'hope',
 'enjoy',
 'happy',
 'friday',
 'lwwf',
 ':)',
 'second',
 'thought',
 '’',
 'not',
 'enough',
 'time',
 'dd',
 ':)',
 'new',
 'short',
 'enter',
 'system',
 'sheep',
 'must',
 'buy',
 'jgh',
 'go',
 'bayan',
 ':d',
 'bye',
 'act',
 'mischievousness',
 'call',
 'etl',
 'layer',
 'in-house',
 'warehouse',
 'app',
 'katamari',
 'well',
 '…',
 'name',
 'imply',
 ':p',
 '#followfriday',
 'top',
 'influencers',
 

In [57]:
negative_words

['hopeless',
 'tmr',
 ':(',
 'everything',
 'kid',
 'section',
 'ikea',
 'cute',
 'shame',
 "i'm",
 'nearly',
 '19',
 '2',
 'month',
 ':(',
 'heart',
 'slide',
 'waste',
 'basket',
 ':(',
 '“',
 'hate',
 'japanese',
 'call',
 'ban',
 ':(',
 ':(',
 '”',
 'too',
 'dang',
 'start',
 'next',
 'week',
 'work',
 ':(',
 'oh',
 'god',
 'baby',
 'face',
 ':(',
 'make',
 'smile',
 ':(',
 'work',
 'neighbour',
 'motor',
 'asked',
 'say',
 'hat',
 'update',
 'search',
 ':(',
 ':(',
 'sialan',
 ':(',
 'athabasca',
 'glacier',
 '#1948',
 ':-(',
 '#athabasca',
 '#glacier',
 '#jasper',
 '#jaspernationalpark',
 '#alberta',
 '#explorealberta',
 '…',
 'really',
 'good',
 'g',
 'idea',
 "i'm",
 'never',
 'go',
 'meet',
 ':(',
 'mare',
 'ivan',
 ':(',
 'happy',
 'trip',
 'keep',
 'safe',
 'see',
 'soon',
 ':(',
 "i'm",
 'tire',
 'hahahah',
 ':(',
 'knee',
 'replacement',
 'get',
 'day',
 ':-(',
 'ouch',
 'relate',
 'sweet',
 'n',
 'sour',
 'kind',
 'bi-polar',
 'people',
 'life',
 '...',
 'cuz',
 'life',
 

which are the most common words

In [58]:
from nltk import FreqDist

In [59]:
freq_dist_pos = FreqDist(positive_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [60]:
freq_dist_pos = FreqDist(negative_words)
print(freq_dist_pos.most_common(10))

[(':(', 4585), (':-(', 501), ("i'm", 343), ('...', 332), ('get', 325), ('not', 300), ('miss', 291), ('go', 275), ('please', 275), ('want', 246)]


# Preparing Data for the Model

In [61]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token,True] for token in tweet_tokens)

In [62]:
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)

In [63]:
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

 prepare the data for training the NaiveBayesClassifier class

In [64]:
import random

In [65]:
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]

In [68]:
#positive_dataset

In [67]:
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

In [None]:
#negative_dataset

In [69]:
dataset = positive_dataset + negative_dataset

In [70]:
random.shuffle(dataset)

In [71]:
train_data = dataset[:7000]
test_data = dataset[7000:]

# Building and Testing the Model

In [72]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [73]:
classifier = NaiveBayesClassifier.train(train_data)

In [74]:
print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9953333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2047.3 : 1.0
                      :) = True           Positi : Negati =   1671.5 : 1.0
                  arrive = True           Positi : Negati =     33.4 : 1.0
                     sad = True           Negati : Positi =     33.4 : 1.0
                follower = True           Positi : Negati =     22.9 : 1.0
              bestfriend = True           Positi : Negati =     19.9 : 1.0
                     bam = True           Positi : Negati =     18.6 : 1.0
                     x15 = True           Negati : Positi =     17.4 : 1.0
                    cool = True           Positi : Negati =     17.2 : 1.0
                     ugh = True           Negati : Positi =     16.1 : 1.0
None


# Check

In [75]:
from nltk.tokenize import word_tokenize

In [76]:
custom_tweet = "This is worst movie"

In [81]:
custom_tokens = word_tokenize(custom_tweet)
custom_tokens

['This', 'is', 'worst', 'movie']

In [82]:
lemmatizer = WordNetLemmatizer()
lemmatized_predict = []
for word, tag in pos_tag(custom_tokens):
    word=word.lower()
    if tag.startswith('NN'):
        pos = 'n'
    elif tag.startswith('VB'):
        pos = 'v'
    else:
        pos = 'a'
    lemmatized_predict.append(lemmatizer.lemmatize(word, pos))


In [83]:
lemmatized_predict

['this', 'be', 'bad', 'movie']

In [84]:
print(classifier.classify(dict([token, True] for token in lemmatized_predict)))

Negative
