# Natural Language Processing

## Importing the libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [23]:
# The reviews file is tab-separated, so the separator must be overridden
# (pandas defaults to ","). quoting=3 corresponds to csv.QUOTE_NONE: quote
# characters inside a review are kept as ordinary text, not field delimiters.
dataset = pd.read_csv(
    "https://raw.githubusercontent.com/space-debris/natural_language_processing/main/Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

## Cleaning the texts

In [24]:
import re
import nltk
# The stop-word list is shipped separately from nltk itself; fetch it
# (nltk reports "already up-to-date" when it is present).
nltk.download('stopwords')
# Stop words are words such as "a", "an", "the" that do not help decide
# whether a review is positive or negative.
from nltk.corpus import stopwords
# PorterStemmer reduces a word to its root (e.g. "loved" -> "love"), which
# keeps the number of columns in the bag-of-words matrix down.
from nltk.stem.porter import PorterStemmer

# The stemmer and the stop-word list do not depend on the review being
# processed, so they are built once here instead of on every iteration.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' carries negative sentiment, so it is taken out of the stop-word list
# and therefore survives the filtering below.
all_stopwords.remove('not')
# A set gives constant-time membership tests in the filter.
stopword_set = set(all_stopwords)

# Holds one cleaned review string per row of the dataset.
corpus = []
# Iterate over every review in the column rather than a hard-coded count, so
# the corpus always has the same number of rows as the labels.
for raw_review in dataset['Review']:
  # Replace everything that is not a letter (a-z, A-Z) with a space,
  # which strips punctuation and digits and keeps only the words.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  # Lower-case, then split into individual words so each can be stemmed.
  review = review.lower().split()
  # Drop stop words and stem whatever remains.
  review = [ps.stem(word) for word in review if word not in stopword_set]
  # Re-assemble the cleaned words into a single space-separated string.
  review = ' '.join(review)
  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# First pass with no cap on vocabulary size.
cv = CountVectorizer()
# fit learns the vocabulary from the corpus; transform builds one count
# column per vocabulary word. toarray() turns the sparse result into a
# dense array.
word_counts = cv.fit_transform(corpus)
X = word_counts.toarray()

In [27]:
# Number of columns in X, i.e. the vocabulary size.
print(X.shape[1])

1566


In [28]:
# The label is the last column of the dataset.
y = dataset.iloc[:, -1].values
# Without a cap, X has 1566 columns (words). Some of them, like "steve",
# occur very rarely and say little about whether a review is positive or
# negative, so the vectorizer is rebuilt to keep only the 1500 most frequent
# words.
cv = CountVectorizer(max_features=1500)
capped_counts = cv.fit_transform(corpus)
X = capped_counts.toarray()

In [29]:
# Number of columns in X after capping the vocabulary.
print(X.shape[1])

1500


## Splitting the dataset into the Training set and Test set

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Training the Naive Bayes model on the Training set

In [31]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes model on the training split only.
# NOTE(review): the features are word counts; MultinomialNB is designed for
# count data and may be worth comparing against - TODO confirm.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [32]:
# Predict a label for every held-out review.
y_pred = classifier.predict(X_test)
# Show predictions (left column) next to the true labels (right column).
print(np.column_stack((y_pred, y_test)))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Rows of the confusion matrix are the true labels, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test reviews classified correctly; left as the last expression
# so the notebook displays it.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73