## Implementation of NLP text processing using **pandas**

In [1]:
import re
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/priyesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv("../../data/hotel-reviews.csv")

In [4]:
data.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38932 entries, 0 to 38931
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User_ID       38932 non-null  object
 1   Description   38932 non-null  object
 2   Browser_Used  38932 non-null  object
 3   Device_Used   38932 non-null  object
 4   Is_Response   38932 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


In [6]:
data.describe()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
count,38932,38932,38932,38932,38932
unique,38932,38932,11,3,2
top,id49257,The Westin is a wonderfully restored grande da...,Firefox,Desktop,happy
freq,1,1,7367,15026,26521


In [7]:
data["Description"][0]

"The room was kind of clean but had a VERY strong smell of dogs. Generally below average but ok for a overnight stay if you're not too fussy. Would consider staying again if the price was right. Breakfast was free and just about better than nothing."

In [8]:
# creating a dataframe
df = pd.DataFrame({"reviews": data["Description"]})

In [9]:
df.head()

Unnamed: 0,reviews
0,The room was kind of clean but had a VERY stro...
1,I stayed at the Crown Plaza April -- - April -...
2,I booked this hotel through Hotwire at the low...
3,Stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...


In [10]:
df.describe()

Unnamed: 0,reviews
count,38932
unique,38932
top,The Westin is a wonderfully restored grande da...
freq,1


In [11]:
df.dtypes

reviews    object
dtype: object

In [12]:
# lowercase
df["lowercase_reviews"] = df["reviews"].str.lower()

In [13]:
df

Unnamed: 0,reviews,lowercase_reviews
0,The room was kind of clean but had a VERY stro...,the room was kind of clean but had a very stro...
1,I stayed at the Crown Plaza April -- - April -...,i stayed at the crown plaza april -- - april -...
2,I booked this hotel through Hotwire at the low...,i booked this hotel through hotwire at the low...
3,Stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...
...,...,...
38927,We arrived late at night and walked in to a ch...,we arrived late at night and walked in to a ch...
38928,The only positive impression is location and p...,the only positive impression is location and p...
38929,Traveling with friends for shopping and a show...,traveling with friends for shopping and a show...
38930,The experience was just ok. We paid extra for ...,the experience was just ok. we paid extra for ...


### Remove Stopwords

In [14]:
en_stopwords = stopwords.words("english")

In [15]:
# remove stopwords
df["remove_stopwords_reviews"] = df["lowercase_reviews"].apply(lambda x: ' '.join(word for word in x.split() if word not in en_stopwords))

In [16]:
df

Unnamed: 0,reviews,lowercase_reviews,remove_stopwords_reviews
0,The room was kind of clean but had a VERY stro...,the room was kind of clean but had a very stro...,room kind clean strong smell dogs. generally a...
1,I stayed at the Crown Plaza April -- - April -...,i stayed at the crown plaza april -- - april -...,"stayed crown plaza april -- - april --, ----. ..."
2,I booked this hotel through Hotwire at the low...,i booked this hotel through hotwire at the low...,booked hotel hotwire lowest price could find. ...
3,Stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...,stayed husband sons way alaska cruise. loved h...
4,My girlfriends and I stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...,girlfriends stayed celebrate --th birthdays. p...
...,...,...,...
38927,We arrived late at night and walked in to a ch...,we arrived late at night and walked in to a ch...,arrived late night walked check-in area comple...
38928,The only positive impression is location and p...,the only positive impression is location and p...,positive impression location public parking op...
38929,Traveling with friends for shopping and a show...,traveling with friends for shopping and a show...,traveling friends shopping show. location grea...
38930,The experience was just ok. We paid extra for ...,the experience was just ok. we paid extra for ...,"experience ok. paid extra view pool, got view ..."


### Remove Punctuations

In [17]:
# remove punctuations
df["remove_punc_reviews"] = df["remove_stopwords_reviews"].apply(
    lambda x: re.sub(r"[^\w\s]", "", x)) 

In [18]:
df

Unnamed: 0,reviews,lowercase_reviews,remove_stopwords_reviews,remove_punc_reviews
0,The room was kind of clean but had a VERY stro...,the room was kind of clean but had a very stro...,room kind clean strong smell dogs. generally a...,room kind clean strong smell dogs generally av...
1,I stayed at the Crown Plaza April -- - April -...,i stayed at the crown plaza april -- - april -...,"stayed crown plaza april -- - april --, ----. ...",stayed crown plaza april april staff frien...
2,I booked this hotel through Hotwire at the low...,i booked this hotel through hotwire at the low...,booked hotel hotwire lowest price could find. ...,booked hotel hotwire lowest price could find g...
3,Stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...,stayed husband sons way alaska cruise. loved h...,stayed husband sons way alaska cruise loved ho...
4,My girlfriends and I stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...,girlfriends stayed celebrate --th birthdays. p...,girlfriends stayed celebrate th birthdays plan...
...,...,...,...,...
38927,We arrived late at night and walked in to a ch...,we arrived late at night and walked in to a ch...,arrived late night walked check-in area comple...,arrived late night walked checkin area complet...
38928,The only positive impression is location and p...,the only positive impression is location and p...,positive impression location public parking op...,positive impression location public parking op...
38929,Traveling with friends for shopping and a show...,traveling with friends for shopping and a show...,traveling friends shopping show. location grea...,traveling friends shopping show location great...
38930,The experience was just ok. We paid extra for ...,the experience was just ok. we paid extra for ...,"experience ok. paid extra view pool, got view ...",experience ok paid extra view pool got view pa...


### Tokenization

In [19]:
# tokenize
df["tokenized_reviews"] = df["remove_punc_reviews"].apply(lambda x: word_tokenize(x))

In [20]:
df

Unnamed: 0,reviews,lowercase_reviews,remove_stopwords_reviews,remove_punc_reviews,tokenized_reviews
0,The room was kind of clean but had a VERY stro...,the room was kind of clean but had a very stro...,room kind clean strong smell dogs. generally a...,room kind clean strong smell dogs generally av...,"[room, kind, clean, strong, smell, dogs, gener..."
1,I stayed at the Crown Plaza April -- - April -...,i stayed at the crown plaza april -- - april -...,"stayed crown plaza april -- - april --, ----. ...",stayed crown plaza april april staff frien...,"[stayed, crown, plaza, april, april, staff, fr..."
2,I booked this hotel through Hotwire at the low...,i booked this hotel through hotwire at the low...,booked hotel hotwire lowest price could find. ...,booked hotel hotwire lowest price could find g...,"[booked, hotel, hotwire, lowest, price, could,..."
3,Stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...,stayed husband sons way alaska cruise. loved h...,stayed husband sons way alaska cruise loved ho...,"[stayed, husband, sons, way, alaska, cruise, l..."
4,My girlfriends and I stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...,girlfriends stayed celebrate --th birthdays. p...,girlfriends stayed celebrate th birthdays plan...,"[girlfriends, stayed, celebrate, th, birthdays..."
...,...,...,...,...,...
38927,We arrived late at night and walked in to a ch...,we arrived late at night and walked in to a ch...,arrived late night walked check-in area comple...,arrived late night walked checkin area complet...,"[arrived, late, night, walked, checkin, area, ..."
38928,The only positive impression is location and p...,the only positive impression is location and p...,positive impression location public parking op...,positive impression location public parking op...,"[positive, impression, location, public, parki..."
38929,Traveling with friends for shopping and a show...,traveling with friends for shopping and a show...,traveling friends shopping show. location grea...,traveling friends shopping show location great...,"[traveling, friends, shopping, show, location,..."
38930,The experience was just ok. We paid extra for ...,the experience was just ok. we paid extra for ...,"experience ok. paid extra view pool, got view ...",experience ok paid extra view pool got view pa...,"[experience, ok, paid, extra, view, pool, got,..."


### Stemming

In [21]:
# stemming
pps = PorterStemmer()

df["stemmed_reviews"] = df["tokenized_reviews"].apply(lambda x: [pps.stem(word) for word in x])

In [22]:
df

Unnamed: 0,reviews,lowercase_reviews,remove_stopwords_reviews,remove_punc_reviews,tokenized_reviews,stemmed_reviews
0,The room was kind of clean but had a VERY stro...,the room was kind of clean but had a very stro...,room kind clean strong smell dogs. generally a...,room kind clean strong smell dogs generally av...,"[room, kind, clean, strong, smell, dogs, gener...","[room, kind, clean, strong, smell, dog, gener,..."
1,I stayed at the Crown Plaza April -- - April -...,i stayed at the crown plaza april -- - april -...,"stayed crown plaza april -- - april --, ----. ...",stayed crown plaza april april staff frien...,"[stayed, crown, plaza, april, april, staff, fr...","[stay, crown, plaza, april, april, staff, frie..."
2,I booked this hotel through Hotwire at the low...,i booked this hotel through hotwire at the low...,booked hotel hotwire lowest price could find. ...,booked hotel hotwire lowest price could find g...,"[booked, hotel, hotwire, lowest, price, could,...","[book, hotel, hotwir, lowest, price, could, fi..."
3,Stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...,stayed husband sons way alaska cruise. loved h...,stayed husband sons way alaska cruise loved ho...,"[stayed, husband, sons, way, alaska, cruise, l...","[stay, husband, son, way, alaska, cruis, love,..."
4,My girlfriends and I stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...,girlfriends stayed celebrate --th birthdays. p...,girlfriends stayed celebrate th birthdays plan...,"[girlfriends, stayed, celebrate, th, birthdays...","[girlfriend, stay, celebr, th, birthday, plan,..."
...,...,...,...,...,...,...
38927,We arrived late at night and walked in to a ch...,we arrived late at night and walked in to a ch...,arrived late night walked check-in area comple...,arrived late night walked checkin area complet...,"[arrived, late, night, walked, checkin, area, ...","[arriv, late, night, walk, checkin, area, comp..."
38928,The only positive impression is location and p...,the only positive impression is location and p...,positive impression location public parking op...,positive impression location public parking op...,"[positive, impression, location, public, parki...","[posit, impress, locat, public, park, opposit,..."
38929,Traveling with friends for shopping and a show...,traveling with friends for shopping and a show...,traveling friends shopping show. location grea...,traveling friends shopping show location great...,"[traveling, friends, shopping, show, location,...","[travel, friend, shop, show, locat, great, bes..."
38930,The experience was just ok. We paid extra for ...,the experience was just ok. we paid extra for ...,"experience ok. paid extra view pool, got view ...",experience ok paid extra view pool got view pa...,"[experience, ok, paid, extra, view, pool, got,...","[experi, ok, paid, extra, view, pool, got, vie..."


### Lemmatization

In [23]:
# lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

df["lemmatized_reviews"] = df["stemmed_reviews"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to /home/priyesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/priyesh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [24]:
df

Unnamed: 0,reviews,lowercase_reviews,remove_stopwords_reviews,remove_punc_reviews,tokenized_reviews,stemmed_reviews,lemmatized_reviews
0,The room was kind of clean but had a VERY stro...,the room was kind of clean but had a very stro...,room kind clean strong smell dogs. generally a...,room kind clean strong smell dogs generally av...,"[room, kind, clean, strong, smell, dogs, gener...","[room, kind, clean, strong, smell, dog, gener,...","[room, kind, clean, strong, smell, dog, gener,..."
1,I stayed at the Crown Plaza April -- - April -...,i stayed at the crown plaza april -- - april -...,"stayed crown plaza april -- - april --, ----. ...",stayed crown plaza april april staff frien...,"[stayed, crown, plaza, april, april, staff, fr...","[stay, crown, plaza, april, april, staff, frie...","[stay, crown, plaza, april, april, staff, frie..."
2,I booked this hotel through Hotwire at the low...,i booked this hotel through hotwire at the low...,booked hotel hotwire lowest price could find. ...,booked hotel hotwire lowest price could find g...,"[booked, hotel, hotwire, lowest, price, could,...","[book, hotel, hotwir, lowest, price, could, fi...","[book, hotel, hotwir, lowest, price, could, fi..."
3,Stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...,stayed husband sons way alaska cruise. loved h...,stayed husband sons way alaska cruise loved ho...,"[stayed, husband, sons, way, alaska, cruise, l...","[stay, husband, son, way, alaska, cruis, love,...","[stay, husband, son, way, alaska, cruis, love,..."
4,My girlfriends and I stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...,girlfriends stayed celebrate --th birthdays. p...,girlfriends stayed celebrate th birthdays plan...,"[girlfriends, stayed, celebrate, th, birthdays...","[girlfriend, stay, celebr, th, birthday, plan,...","[girlfriend, stay, celebr, th, birthday, plan,..."
...,...,...,...,...,...,...,...
38927,We arrived late at night and walked in to a ch...,we arrived late at night and walked in to a ch...,arrived late night walked check-in area comple...,arrived late night walked checkin area complet...,"[arrived, late, night, walked, checkin, area, ...","[arriv, late, night, walk, checkin, area, comp...","[arriv, late, night, walk, checkin, area, comp..."
38928,The only positive impression is location and p...,the only positive impression is location and p...,positive impression location public parking op...,positive impression location public parking op...,"[positive, impression, location, public, parki...","[posit, impress, locat, public, park, opposit,...","[posit, impress, locat, public, park, opposit,..."
38929,Traveling with friends for shopping and a show...,traveling with friends for shopping and a show...,traveling friends shopping show. location grea...,traveling friends shopping show location great...,"[traveling, friends, shopping, show, location,...","[travel, friend, shop, show, locat, great, bes...","[travel, friend, shop, show, locat, great, bes..."
38930,The experience was just ok. We paid extra for ...,the experience was just ok. we paid extra for ...,"experience ok. paid extra view pool, got view ...",experience ok paid extra view pool got view pa...,"[experience, ok, paid, extra, view, pool, got,...","[experi, ok, paid, extra, view, pool, got, vie...","[experi, ok, paid, extra, view, pool, got, vie..."


### N-grams

In [25]:
# clean_tokens = sum(df["lemmatized_reviews"], [])
clean_tokens = [token for row in df["lemmatized_reviews"] for token in row]

In [26]:
len(clean_tokens)

3108274

In [27]:
# unigram
unigram = (pd.Series(nltk.ngrams(clean_tokens, 1))).value_counts()

In [28]:
unigram

(room,)          81792
(hotel,)         78785
(stay,)          48124
(great,)         25552
(staff,)         24446
                 ...  
(dismalnot,)         1
(onether,)           1
(johnna,)            1
(inoraround,)        1
(unmusti,)           1
Name: count, Length: 51704, dtype: int64

In [29]:
# bigram
bigram = (pd.Series(nltk.ngrams(clean_tokens, 2))).value_counts()

In [30]:
bigram

(front, desk)             7741
(stay, hotel)             3982
(room, clean)             3946
(stay, again)             3401
(staff, friendli)         3285
                          ... 
(servicegiven, mistak)       1
(view, palm)                 1
(pool, oh)                   1
(lot, dumpster)              1
(you, push)                  1
Name: count, Length: 1017007, dtype: int64

In [31]:
# trigram
trigram = (pd.Series(nltk.ngrams(clean_tokens, 3))).value_counts()

In [32]:
trigram

(within, walk, distanc)     1500
(front, desk, staff)        1330
(staff, friendli, help)     1111
(would, definit, stay)       993
(would, stay, again)         937
                            ... 
(compens, misfunct, one)       1
(misfunct, one, bedroom)       1
(live, room, froze)            1
(room, froze, blanket)         1
(sinc, run, live)              1
Name: count, Length: 2510292, dtype: int64

In [33]:
ngrams_4 = (pd.Series(nltk.ngrams(clean_tokens, 4))).value_counts()

In [34]:
ngrams_4

(would, definit, stay, again)        605
(would, highli, recommend, hotel)    209
(would, recommend, hotel, anyon)     198
(within, easi, walk, distanc)        185
(room, clean, bed, comfort)          160
                                    ... 
(would, probabl, make, star)           1
(probabl, make, star, could)           1
(make, star, could, one)               1
(star, could, one, reason)             1
(room, great, locat, that)             1
Name: count, Length: 2997463, dtype: int64