# Examining Text Processing Techniques with NLTK

## Process

1) Downloading NLTK data packages
2) Filtering out commonly used words
3) Stop Words
4) Text Lemmatizer
5) Text Joining
6) Tokenizer
7) Sequencing Text





In [1]:
import nltk
# Fetch the NLTK data packages this notebook relies on:
#   - 'stopwords': the word lists read through nltk.corpus.stopwords
#   - 'wordnet':   the data behind WordNetLemmatizer
#   - 'punkt':     Punkt tokenizer models (presumably what nltk.word_tokenize
#                  needs; newer NLTK releases may ask for 'punkt_tab' instead
#                  -- TODO confirm against the installed version)
# The last call is left as a bare expression so the notebook echoes its result.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ksrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ksrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ksrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Filtering out commonly used words

import pandas as pd

data = pd.read_csv("Hotel-rating-sentiments.csv")

# Frequent terms to strip from every review.
# (Despite the name, the list currently holds four terms, not five.)
top5 = ["room", "negative", "hotel", "staff"]

# Cast to str once, outside the loop: the conversion does not depend on the
# word being removed, so repeating it per word is wasted work.
reviews = data["Customer Review"].astype(str)
for x in top5:
    # regex=False pins literal matching; the default of `regex` differs
    # between pandas major versions, so state it explicitly.
    # NOTE(review): this removes the term wherever it occurs as a substring
    # (e.g. "bathroom" -> "bath", "rooms" -> "s") and is case-sensitive.
    # Confirm whether whole-word removal was intended.
    reviews = reviews.str.replace(x, "", regex=False)
data["Customer Review"] = reviews

data.head(2)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Independent (deep) copy, so later edits to data2 leave `data` untouched.
data2 = data.copy(deep=True)

In [None]:
# Stop words

import nltk
from nltk.corpus import stopwords

# Make sure the corpora used in this cell are available locally.
for _resource in ('wordnet', 'stopwords'):
    nltk.download(_resource)

# Lazily-filled cache of the English stopword list, so the corpus is read
# once instead of once per call (the function is invoked for every review).
_ENGLISH_STOP_WORDS = None


def remove_stopwords(words, stop_words=None):
    """Return the items of *words* that are not stopwords.

    Args:
        words: Iterable of string tokens.
        stop_words: Optional collection of lowercase stopwords to filter
            against. When omitted, NLTK's English stopword list is used.

    Returns:
        A new list with the surviving tokens in their original order and
        original casing.

    Matching is case-insensitive: each token is lowercased before the lookup,
    so capitalised tokens such as "The" are removed as well.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    return [word for word in words if word.lower() not in stop_words]

In [None]:
# Text Lemmatizer
import nltk
from nltk.stem import WordNetLemmatizer

# Make sure the corpora used in this cell are available locally.
for _resource in ('wordnet', 'stopwords'):
    nltk.download(_resource)

# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize *text*, drop stopwords, lemmatize, and re-join with spaces.

    The text is split with ``nltk.word_tokenize``, filtered through
    ``remove_stopwords``, each remaining token is passed to
    ``lemmatizer.lemmatize``, and the results are joined by single spaces.
    """
    kept_tokens = remove_stopwords(nltk.word_tokenize(text))
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

# Run every review through lemmatize_text; X ends up as a list of strings.
raw_reviews = data2["Customer Review"].values
X = list(map(lemmatize_text, raw_reviews))

In [None]:
# Text Joining
def _join_tokens(tokens):
    """Join a sequence of tokens with single spaces; pass strings through.

    A plain string is returned unchanged. Iterating over a string yields its
    characters, so joining it would turn "good" into "g o o d".
    """
    if isinstance(tokens, str):
        return tokens
    return " ".join(tokens)


data2["Customer Review"] = data2["Customer Review"].apply(_join_tokens)
data2.head(2)

In [None]:
# Tokenizer
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the tokenizer and fit its word index on the lemmatized reviews in X.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X)

In [None]:
# Sentence length distribution of the processed reviews

import matplotlib.pyplot as plt
import seaborn as sns

# Count of space-separated pieces in each processed review
length_dist = [len(review.split(" ")) for review in X]

# Global seaborn settings: figure size first, then the colour palette
sns.set(rc={"figure.figsize": (12, 9)})
sns.set_palette("husl", 9)

# Fresh figure holding the histogram of the lengths
plt.figure()
sns.histplot(length_dist)

# Hide the axes background patch and fix the visible ranges
current_axes = plt.gca()
current_axes.patch.set_visible(False)
plt.xlim((0, 600))
plt.ylim((0, 1200))

# Title, then render
plt.title("Sentence length distribution", fontsize=20)
plt.show()

## Sequencing Text

In [None]:
# Import from tensorflow.keras, like the other cells in this notebook, so the
# same Keras implementation is used throughout (the standalone `keras`
# package may be a different version or lack this legacy module).
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a fresh tokenizer on the processed reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Transform the texts to sequences of integers
sequences = tokenizer.texts_to_sequences(X)

# Find the maximum length of a sequence in the list.
# default=0 keeps this from raising ValueError when there are no sequences.
max_length = max((len(seq) for seq in sequences), default=0)

# Find the vocabulary size: number of entries in the word index, plus one.
vocab_size = len(tokenizer.word_index) + 1

print("Vocabulary size: {}".format(vocab_size))
print("Max length of sentence: {}".format(max_length))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length that every integer sequence is brought to.
MAX_SEQUENCE_LENGTH = 600

# Pad at the end of each sequence (padding='post') up to the fixed length.
# NOTE(review): the max_length computed in the previous cell is not used here;
# sequences longer than 600 tokens will be cut by pad_sequences -- confirm
# that this is intended.
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

## Remapping Ratings

In [None]:
import numpy as np

labels = ['1', '2', '3', '4', '5']

# Raw ratings as a NumPy array; expected to be the whole numbers 1..5.
ratings = data['Rating'].values

# Map ratings to range [0, 4] using a look-up table
lookup_table = np.array([0, 1, 2, 3, 4])

# Validate before indexing. Without this check a rating of 0 would index
# position -1 and silently become class 4, while NaN or out-of-range values
# would fail with an unhelpful IndexError.
valid_ratings = np.arange(1, len(lookup_table) + 1)
is_valid = np.isin(ratings, valid_ratings)
if not is_valid.all():
    raise ValueError(
        "Rating column contains values outside 1..{}: {}".format(
            len(lookup_table), list(ratings[~is_valid][:10])
        )
    )

# Safe to cast: every value is now known to be one of the integers 1..5
# (this also accepts whole-number floats such as 5.0).
y = lookup_table[ratings.astype(np.intp) - 1]