In [6]:
#importing modules
import pandas as pd
import numpy as np
import regex as re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
#importing dataset
df = pd.read_csv("/content/data.csv", encoding="ISO-8859-1")
print(df.columns)

Index(['sentiment', 'text', 'user'], dtype='object')


In [9]:
#dropping useless columns
df = df.drop(["user"], axis=1)

In [10]:
#get the unique values of sentiment column
print(df["sentiment"].unique())

['positive' 'neutral' 'negative']


In [11]:
#get the probability of a tweet being positive, negative or neutral
prob_positive = len([tweet for tweet in df["sentiment"] if tweet == 'positive']) / len(df)
prob_neutral = len([tweet for tweet in df["sentiment"] if tweet == 'neutral']) / len(df)
prob_negative = len([tweet for tweet in df["sentiment"] if tweet == 'negative']) / len(df)
print(prob_positive, prob_negative, prob_neutral)

0.4069767441860465 0.3488372093023256 0.2441860465116279


In [12]:
#clean the tweets by removing the hashtags, mentions, non-english alphabets and other links
texts = []
for text in df["text"]:
  text = re.sub(r"(^RT\s+@.*:\s)", "", text)
  text = re.sub(r"https:.*", "", text)
  text = re.sub(r"(\s)[#@]+\w+", "", text) #removing all mentions and hashtags that occur in middle of tweets
  text = re.sub(r"([#@]+\w+\s)", "", text) #removing all mentions and hashtags that occur at the starting of tweets
  text = re.sub(r"[,.?!]*", "", text)
  text = re.sub(r"[^a-zA-Z\s']+", "", text) #removing words that contain characters other than english alphabets and spaces.
  texts.append(text)
df["text"] = texts
print(texts)

['The incredible reactions as Brandon Moreno became the first Mexicanborn champion in UFC history  ', 'CAN ANYBODY HEAR ME ', 'Guys how do u know she is a good rider ', 'We want to justice ', 'Farmers in Tsp Sagaing Division staged a sesame harvest strike to overthrow the military dictatorship  ', 'We are not Hindus Separate vellalars from hindu nWe are the real Vellalars ', "Saturn's icy moon Enceladus captured by the Cassini spacecraft The moon's surface is covered with fractures folds", 'And here is some more local news on power reconnections ', 'MT Studio said that back in LA to prepare for his new songcontent ', 'i wish one day you could see yourself with my eyes  ', 'Comedian Atif Muhammad Aziz jokes on Indian Army Ram Lala Cow meatnnNothing hilarious to make anyone laugh Audience', '', 'Someone in the product team is in a long distance relationship', 'what if I made you heart shaped gorditas aha unless ', 'Classic Tim ', "Today is The Queen's Official Birthday A military parade 

In [25]:
#merge the positive tweets together. Do the same for negative and neutral tweets also.
positive_tweets_list = []
negative_tweets_list = []
neutral_tweets_list = []
for (label, tweet) in zip(df["sentiment"], df["text"]):
  if label == 'positive':
    positive_tweets_list.append(tweet)
  elif label == 'negative':
    negative_tweets_list.append(tweet)
  else:
    neutral_tweets_list.append(tweet)

positive_tweets = " ".join(positive_tweets_list)
negative_tweets = " ".join(negative_tweets_list)
neutral_tweets = " ".join(neutral_tweets_list)

#Create a string that contains all the tweets too.
all_tweets = positive_tweets + " " + negative_tweets + " " + neutral_tweets

In [32]:
#define a function that returns the bag of words while removing stopwords and stemming the words as well.
#bag of words is dictionary where each word is stored with the no of times it appears in the text.
def bag_of_words(sentence):
  bag = {}
  ps = PorterStemmer()
  #sw = stopwords.words('english')
  for word in sentence.lower().split():
    #word = ps.stem(word)
    #if word in sw:
      #pass
    if word in bag:
      bag[word] += 1
    else:
      bag[word] = 1
  return bag

In [33]:
#create a bag of words for the negative, positive and neutral tweets separately.
negative_words = bag_of_words(negative_tweets)
positive_words = bag_of_words(positive_tweets)
neutral_words = bag_of_words(neutral_tweets)

#create a bag of words for all tweets combined too.
all_words = bag_of_words(all_tweets)

In [34]:
print("positive_words: ", positive_words)
print("negative_words: ", negative_words)
print("neutral_words: ", neutral_words)

positive_words:  {'the': 22, 'incredible': 1, 'reactions': 1, 'as': 1, 'brandon': 1, 'moreno': 1, 'became': 1, 'first': 1, 'mexicanborn': 1, 'champion': 1, 'in': 5, 'ufc': 1, 'history': 1, 'guys': 1, 'how': 1, 'do': 1, 'u': 1, 'know': 1, 'she': 2, 'is': 10, 'a': 11, 'good': 1, 'rider': 1, 'we': 2, 'are': 4, 'not': 1, 'hindus': 1, 'separate': 1, 'vellalars': 2, 'from': 2, 'hindu': 1, 'nwe': 1, 'real': 1, 'and': 8, 'here': 1, 'some': 1, 'more': 1, 'local': 2, 'news': 1, 'on': 2, 'power': 1, 'reconnections': 1, 'mt': 1, 'studio': 1, 'said': 1, 'that': 1, 'back': 1, 'la': 1, 'to': 5, 'prepare': 1, 'for': 4, 'his': 1, 'new': 2, 'songcontent': 1, 'classic': 1, 'tim': 1, 'today': 3, "queen's": 1, 'official': 1, 'birthday': 1, 'military': 1, 'parade': 1, 'held': 1, 'by': 4, 'taking': 1, 'place': 1, 'quadrangle': 1, 'of': 3, 'found': 1, 'puppies': 1, 'side': 1, 'road': 2, 'they': 2, 'need': 1, 'loving': 2, 'home': 2, 'simple': 1, 'retweet': 1, 'can': 1, 'find': 1, 'these': 1, 'babies': 1, 'veru

In [35]:
#predict the sentiment by calculating probabilities for each sentiment using naive bayes formula
def predict(sentence):
  positive_prob = 1
  negative_prob = 1
  neutral_prob = 1
  for word in sent.split():

    if word in positive_words:
      prob_word_is_positive = (positive_words[word] + 1)/ (sum(positive_words.values()) + 1) 
    else:
      prob_word_is_positive = 1 / (sum(positive_words.values()) +1)

    if word in all_words:
      prob_word = (all_words[word] + 1) / (sum(all_words.values()) +1) 
    else:
      prob_word = 1 / (sum(all_words.values()) +1)

    if word in negative_words:
      prob_word_is_negative = (negative_words[word] + 1) / (sum(negative_words.values()) +1)
    else:
      prob_word_is_negative = 1 / (sum(negative_words.values()) +1)

    if word in negative_words:
      prob_word_is_negative = (negative_words[word] + 1) / (sum(negative_words.values()) +1)
    else:
      prob_word_is_negative = 1 / (sum(negative_words.values()) +1)

    if word in neutral_words:
      prob_word_is_neutral = (neutral_words[word] + 1) / (sum(neutral_words.values()) +1)
    else:
      prob_word_is_neutral = 1 / (sum(neutral_words.values()) +1)

    positive_prob *= prob_word_is_positive * prob_positive / prob_word
    negative_prob *= prob_word_is_negative * prob_negative / prob_word
    neutral_prob *= prob_word_is_neutral * prob_neutral / prob_word
    print("positive: {0:.2f}%".format(positive_prob*100))
    print("negative: {0:.2f}%".format(negative_prob*100))
    print("neutral: {0:.2f}%".format(neutral_prob*100))
    if neutral_prob > 1:
      return "neutral"
    elif positive_prob >= negative_prob:
      return "positive"
    else:
      return "negative"

In [None]:
#predicting probabilities of tweet being positive or negative
while True:
  sent = input("Enter a sentence: ")
  if sent.lower()=="bye":
    break
  result = predict(sent)
  print("Sentence is: ", result)

Enter a sentence: hello
positive: 43.72%
negative: 54.77%
neutral: 111.49%
Sentence is:  neutral
Enter a sentence: this game is annoying
positive: 29.15%
negative: 62.60%
neutral: 21.24%
Sentence is:  negative
Enter a sentence: sachin is a legendary cricketer
positive: 87.44%
negative: 54.77%
neutral: 55.75%
Sentence is:  positive
