In [40]:
# Labelled training reviews as (text, label) pairs; label 1 = positive, 0 = negative.
# zip regroups the pairs into one sequence of texts and one sequence of labels,
# and map(list, ...) turns each of them into a plain list.
X_train, y_train = map(list, zip(
  ("This was awesome an awesome movie", 1),
  ("Great movie! I liked it a lot", 1),
  ("Happy Ending! awesome acting by the hero", 1),
  ("loved it! truly great", 1),
  ("bad not up to the mark", 0),
  ("could have been better", 0),
  ("Surely a Disappointing movie", 0),
))

In [42]:
# Display the raw training sentences before any cleaning
X_train

['This was awesome an awesome movie',
 'Great movie! I liked it a lot',
 'Happy Ending! awesome acting by the hero',
 'loved it! truly great',
 'bad not up to the mark',
 'could have been better',
 'Surely a Disappointing movie']

# Data cleaning

In [43]:
from nltk.tokenize import RegexpTokenizer

In [45]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [47]:
import nltk
# Fetch the NLTK stopword corpus read by stopwords.words();
# NLTK reports "already up-to-date" when the package is present.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
# Shared text-cleaning helpers, built once and reused by getCleanedText.
ps = PorterStemmer()  # stems each surviving token
en_stopwords = set(stopwords.words("english"))  # a set, so membership tests are cheap
tokenizer = RegexpTokenizer(r"\w+")  # tokens are runs of word characters, so punctuation is dropped

In [54]:
def getCleanedText(text):
  """Turn a raw sentence into a space-separated string of stemmed keywords.

  Steps, in order:
    1. lower-case the input;
    2. split it into tokens with the module-level `tokenizer`;
    3. discard every token found in `en_stopwords`;
    4. stem each remaining token with `ps.stem`;
    5. join the stems with single spaces.

  Args:
    text: the sentence to clean (a string).

  Returns:
    The cleaned string; empty if no token survives the stopword filter.
  """
  words = tokenizer.tokenize(text.lower())

  stems = []
  for word in words:
    # stopwords are skipped before stemming
    if word in en_stopwords:
      continue
    stems.append(ps.stem(word))

  return " ".join(stems)

In [77]:
# Unseen sentences used to try out the trained classifier
X_test = [
  "I was happy & happy and I loved the acting in the movie",
  "The movie I saw was bad"
]

# Apply the same cleaning function to the training and the test sentences
X_clean = list(map(getCleanedText, X_train))
Xt_clean = list(map(getCleanedText, X_test))

In [78]:
# Display the training sentences after lower-casing, stopword removal and stemming
X_clean

['awesom awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad mark',
 'could better',
 'sure disappoint movi']

# Vectorization

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [80]:
# ngram_range=(1,2): features are single tokens and pairs of adjacent tokens
cv = CountVectorizer(ngram_range=(1,2))

In [81]:
# Learn the vocabulary from the cleaned training text and build the count matrix,
# converted to a dense array (one row per sentence, one column per feature)
X_vec = cv.fit_transform(X_clean).toarray()

In [82]:
# Display the full training count matrix (small enough here to show in full)
X_vec

array([[0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0]])

In [83]:
# Show the unigram and bigram feature names learned by the vectorizer
print(cv.get_feature_names_out())

['act' 'act hero' 'awesom' 'awesom act' 'awesom awesom' 'awesom movi'
 'bad' 'bad mark' 'better' 'could' 'could better' 'disappoint'
 'disappoint movi' 'end' 'end awesom' 'great' 'great movi' 'happi'
 'happi end' 'hero' 'like' 'like lot' 'lot' 'love' 'love truli' 'mark'
 'movi' 'movi like' 'sure' 'sure disappoint' 'truli' 'truli great']


In [84]:
# Encode the cleaned test text with the vocabulary fitted on the training data
# (transform only, so the vectorizer is not refitted on the test sentences)
Xt_vect = cv.transform(Xt_clean).toarray()

# Multinomial Naive Bayes

In [85]:
# importing Multinomial Naive Bayes Model

from sklearn.naive_bayes import MultinomialNB

In [86]:
# Create the Multinomial Naive Bayes classifier with its default settings

mn = MultinomialNB()

In [87]:
# Fit the classifier on the training count matrix and its labels (1 = positive, 0 = negative)

mn.fit(X_vec, y_train)

MultinomialNB()

In [88]:
# Predict the sentiment label of each vectorized test sentence

y_pred = mn.predict(Xt_vect)

In [89]:
# Viewing the predictions (one label per test sentence, in X_test order)

y_pred

array([1, 0])