# Data Scraping

We will use requests and BeautifulSoup for data scraping

In [2]:
import requests
from bs4 import BeautifulSoup

In [422]:
'''Import the website for page of any business or service provider from yelp.com here'''
#r = requests.get("")
#making request to get the data



'''Samples'''
#r = requests.get("https://www.yelp.com/biz/current-solutions-electric-kansas-city?override_cta=Request+a+Quote")
#r = requests.get("https://www.yelp.com/biz/topgolf-overland-park-2")
#r = requests.get("https://www.yelp.com/biz/the-ceiling-fan-man-electrical-service-edgerton?override_cta=Request+a+Quote")

r = requests.get("https://www.yelp.com/biz/cicis-pizza-liberty-4")

In [423]:
r.status_code #checking the status of our request. Status of 200 means the request was successful

200

In [1]:
##But it doesn't give text to read the data. So, converting the requested data to text
r.text

NameError: name 'r' is not defined

In [425]:
#now parsing through the html page

soup = BeautifulSoup(r.text, 'html.parser')

In [426]:
#lets find the class of the section with reviews in html elements
divs = soup.find_all(class_= "comment__09f24__D0cxf css-qgunke")


In [427]:
#lets look at the filtered html elements

for div in divs:
   print(div.prettify())

In [428]:
#Lets only get the texts of reviews
for div in divs:
    print(div.text, '\n')

In [429]:
#creating an array of reviews
reviews = []
for div in divs:
    reviews.append(div.text)

In [430]:
reviews

["It's a thursday afternoon and this Cici's is clean, well-stocked; and being run solo by one lady. The country is a little loud but hey, it's missouri. Got lunch and got out and had a chill time doing it.Edit: For Management / Owners: The women's restroom needs some TLC in order for it to feel like a safe, comfortable space. The seats should be replaced and one of the walls needs to be re-affixed to the door before it hurts someone, there are screws sticking out and easily accessible in a space frequented by children.",
 'What a joke of a business surprise to see them still in business every time you go there they closing early and pizza lame',
 'love this place, i totally recommend to all pizza lovers i had to come here for my school trip but i still love it',
 "Everyone knows that this place has subpar pizza but sometimes as parents we look past that because of the cheap price.  My biggest gripes are that they don't take responsibility for everything in their building.  Something as

Lets Start calculating metrices

.

.

.

.

.

.

.

.

.

.

.

# Data Analysis with key metrices

First, lets get this data in a dataframe

In [431]:
import pandas as pd
import numpy as np

In [432]:
##converting the array into a numpy array (we can skip this if we want to but numpy arrays consume less memory and convenient to use) 
# and then into a dataframe

df = pd.DataFrame(np.array(reviews), columns = ['reviews'])
df

Unnamed: 0,reviews
0,It's a thursday afternoon and this Cici's is c...
1,What a joke of a business surprise to see them...
2,"love this place, i totally recommend to all pi..."
3,Everyone knows that this place has subpar pizz...
4,I have not been to a cc pizza since I was in c...
5,I am so impressed with the management of this ...
6,"Rediculous, spend $9 on buffet without drink a..."
7,Way overpriced for a very small buffet with me...
8,Yummy pizza. This location is new and clean. T...
9,This place is discussing and nothing it clean ...


In [433]:
#looking at how many reiews are there
len(df)

10

### 1st metric we will use is word count of a review

In [434]:
#lets find the word count to get a better gaze at how much detail there might be in a review
df["reviews"].apply(lambda x: len(x.split()))

0     95
1     24
2     24
3    222
4    102
5    131
6     35
7     48
8     33
9     53
Name: reviews, dtype: int64

In [435]:
#Lets add the word count column in the dataframe
df["word_count"] = df["reviews"].apply(lambda x: len(x.split()))
df

Unnamed: 0,reviews,word_count
0,It's a thursday afternoon and this Cici's is c...,95
1,What a joke of a business surprise to see them...,24
2,"love this place, i totally recommend to all pi...",24
3,Everyone knows that this place has subpar pizz...,222
4,I have not been to a cc pizza since I was in c...,102
5,I am so impressed with the management of this ...,131
6,"Rediculous, spend $9 on buffet without drink a...",35
7,Way overpriced for a very small buffet with me...,48
8,Yummy pizza. This location is new and clean. T...,33
9,This place is discussing and nothing it clean ...,53


### 2nd metric is character count of review

In [436]:
#using lambda function to count the number of characters in each row of reviews column
df["char_count"] = df["reviews"].apply(lambda x: len(x))
df

Unnamed: 0,reviews,word_count,char_count
0,It's a thursday afternoon and this Cici's is c...,95,522
1,What a joke of a business surprise to see them...,24,122
2,"love this place, i totally recommend to all pi...",24,114
3,Everyone knows that this place has subpar pizz...,222,1203
4,I have not been to a cc pizza since I was in c...,102,535
5,I am so impressed with the management of this ...,131,673
6,"Rediculous, spend $9 on buffet without drink a...",35,191
7,Way overpriced for a very small buffet with me...,48,282
8,Yummy pizza. This location is new and clean. T...,33,187
9,This place is discussing and nothing it clean ...,53,283


### Average length of each word in a review

In [437]:
#creating an average word function
def average_words(x):
    words = x.split() #splitting the words 
    return sum(len(word) for word in words) / len(words) #what it does is add the length of each word divided by the total number of words

#it took me sometime as why we need this function? why can't we just do char_count / word_count. If we do that, 
# we will also include " " in between words which is included in char_count

In [438]:
df["average_word_length"] = df["reviews"].apply(lambda x: average_words(x))

### 3rd metric is number of stop words

Stop words are words which add liitle to no meaning in review. Examples I, us, are etc. 
We need to strip these out later.

For this, we will import a stop word list using a Natural Language Toolkit

In [439]:
##I didn't have nltk in my environment. So, I installed it in the anaconda environment from termial


In [440]:
import nltk
nltk.download('stopwords')
#I had to download the stopwords  as txt file because I got error of size limit.

[nltk_data] Downloading package stopwords to /Users/supg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [441]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english') #stopwords for english
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [442]:
len(stop_words)

179

In [443]:
df['stopword_count'] = df['reviews'].apply(lambda x: len([word for word in x.split() if word.lower() in stop_words]))
#what we did is applied lambda function to get the length of words (lower cased) in the reviews who exist in the stop_words
df

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46
1,What a joke of a business surprise to see them...,24,122,4.125,11
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56
5,I am so impressed with the management of this ...,131,673,4.145038,70
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12
9,This place is discussing and nothing it clean ...,53,283,4.358491,26


### 4th metric is % of stop word

In [444]:
df['stopword_rate'] = df['stopword_count'] / df['word_count'] #calculating percentage (in decimal)
df

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566


In [445]:
df.sort_values(by = 'stopword_rate') #sorting by the stopword rate

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902


In [446]:
df.describe() #looking at statistical parameters

Unnamed: 0,word_count,char_count,average_word_length,stopword_count,stopword_rate
count,10.0,10.0,10.0,10.0,10.0
mean,76.7,411.2,4.359517,38.3,0.471607
std,62.914492,337.79277,0.310324,34.660416,0.071522
min,24.0,114.0,3.791667,11.0,0.363636
25%,33.5,188.0,4.172504,13.0,0.411458
50%,50.5,282.5,4.361678,22.5,0.487388
75%,100.25,531.75,4.493233,53.5,0.53252
max,222.0,1203.0,4.895833,117.0,0.54902


.

.

.

.

.

.

.

.

.

.

# Data Cleaning/ Prepping

### Lowercasing words

In [447]:
df['reviews'].apply(lambda x: " ".join(word.lower() for word in x.split()))

#what we did is we first split the words in reviews, then lowered them and then joined them back with " " in between to get the whole sentences

0    it's a thursday afternoon and this cici's is c...
1    what a joke of a business surprise to see them...
2    love this place, i totally recommend to all pi...
3    everyone knows that this place has subpar pizz...
4    i have not been to a cc pizza since i was in c...
5    i am so impressed with the management of this ...
6    rediculous, spend $9 on buffet without drink a...
7    way overpriced for a very small buffet with me...
8    yummy pizza. this location is new and clean. t...
9    this place is discussing and nothing it clean ...
Name: reviews, dtype: object

In [448]:
#lets save the above change into a new column
df['lowercase'] = df['reviews'].apply(lambda x: " ".join(word.lower() for word in x.split()))
df

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,it's a thursday afternoon and this cici's is c...
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,what a joke of a business surprise to see them...
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,"love this place, i totally recommend to all pi..."
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone knows that this place has subpar pizz...
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,i have not been to a cc pizza since i was in c...
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,i am so impressed with the management of this ...
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,"rediculous, spend $9 on buffet without drink a..."
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced for a very small buffet with me...
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza. this location is new and clean. t...
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,this place is discussing and nothing it clean ...


### Removing punctuations
we dont need them to find sentiments

In [449]:
df['punctuation'] = df['lowercase'].str.replace('[^\w\s]', '')
df

#The regular expression [^\w\s] is a character class that matches a single character that is neither a word character (
# alphanumeric or underscore) nor a whitespace character.

# [ ]: Denotes a character class, meaning "match any one of the characters inside the brackets."
# ^: When used as the first character inside a character class, it negates the character class. 
# So, [^...] means "match any character that is not in the list."
# \w: Represents a word character (alphanumeric or underscore).
# \s: Represents a whitespace character (space, tab, newline).

  df['punctuation'] = df['lowercase'].str.replace('[^\w\s]', '')


Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,it's a thursday afternoon and this cici's is c...,its a thursday afternoon and this cicis is cle...
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,what a joke of a business surprise to see them...,what a joke of a business surprise to see them...
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,"love this place, i totally recommend to all pi...",love this place i totally recommend to all piz...
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone knows that this place has subpar pizz...,everyone knows that this place has subpar pizz...
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,i have not been to a cc pizza since i was in c...,i have not been to a cc pizza since i was in c...
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,i am so impressed with the management of this ...,i am so impressed with the management of this ...
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,"rediculous, spend $9 on buffet without drink a...",rediculous spend 9 on buffet without drink and...
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced for a very small buffet with me...,way overpriced for a very small buffet with me...
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza. this location is new and clean. t...,yummy pizza this location is new and clean the...
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,this place is discussing and nothing it clean ...,this place is discussing and nothing it clean ...


### Removing stop words

In [450]:
df["no_stopwords"] = df['punctuation'].apply(lambda x: ' '.join (word for word in x.split() if word not in stop_words))
df
#first split the words
##then check if the word is in stop_words
#if the word is not a stop word, join them back

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,no_stopwords
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,it's a thursday afternoon and this cici's is c...,its a thursday afternoon and this cicis is cle...,thursday afternoon cicis clean wellstocked run...
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,what a joke of a business surprise to see them...,what a joke of a business surprise to see them...,joke business surprise see still business ever...
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,"love this place, i totally recommend to all pi...",love this place i totally recommend to all piz...,love place totally recommend pizza lovers come...
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone knows that this place has subpar pizz...,everyone knows that this place has subpar pizz...,everyone knows place subpar pizza sometimes pa...
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,i have not been to a cc pizza since i was in c...,i have not been to a cc pizza since i was in c...,cc pizza since college springfield missouri tw...
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,i am so impressed with the management of this ...,i am so impressed with the management of this ...,impressed management shop greeted walked paid ...
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,"rediculous, spend $9 on buffet without drink a...",rediculous spend 9 on buffet without drink and...,rediculous spend 9 buffet without drink cant g...
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced for a very small buffet with me...,way overpriced for a very small buffet with me...,way overpriced small buffet mediocre pizza pas...
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza. this location is new and clean. t...,yummy pizza this location is new and clean the...,yummy pizza location new clean let request piz...
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,this place is discussing and nothing it clean ...,this place is discussing and nothing it clean ...,place discussing nothing clean dont wash trys ...


In [451]:
#lets look at the number of words in the updated reviews

df['no_stopwords'].apply(lambda x: len(x.split()))


0     48
1     13
2     11
3    106
4     45
5     59
6     21
7     28
8     20
9     28
Name: no_stopwords, dtype: int64

### Removing more manual words

--this step shall be ignored if used for live data

we will remove more words that add no value to the sentiment manually

In [452]:
#Lets join all the reviews into a one giant sentence and convert that into an array
word_array = " ".join(df["no_stopwords"]).split()
word_array

#This is the array of every single word in our reviews

['thursday',
 'afternoon',
 'cicis',
 'clean',
 'wellstocked',
 'run',
 'solo',
 'one',
 'lady',
 'country',
 'little',
 'loud',
 'hey',
 'missouri',
 'got',
 'lunch',
 'got',
 'chill',
 'time',
 'itedit',
 'management',
 'owners',
 'womens',
 'restroom',
 'needs',
 'tlc',
 'order',
 'feel',
 'like',
 'safe',
 'comfortable',
 'space',
 'seats',
 'replaced',
 'one',
 'walls',
 'needs',
 'reaffixed',
 'door',
 'hurts',
 'someone',
 'screws',
 'sticking',
 'easily',
 'accessible',
 'space',
 'frequented',
 'children',
 'joke',
 'business',
 'surprise',
 'see',
 'still',
 'business',
 'every',
 'time',
 'go',
 'closing',
 'early',
 'pizza',
 'lame',
 'love',
 'place',
 'totally',
 'recommend',
 'pizza',
 'lovers',
 'come',
 'school',
 'trip',
 'still',
 'love',
 'everyone',
 'knows',
 'place',
 'subpar',
 'pizza',
 'sometimes',
 'parents',
 'look',
 'past',
 'cheap',
 'price',
 'biggest',
 'gripes',
 'dont',
 'take',
 'responsibility',
 'everything',
 'building',
 'something',
 'simple',
 

In [453]:
#creating a frequency count of strings
pd.Series(word_array).value_counts()[:30]


#this gives us count of how many times a word is occuring in all the reviews
#looking at the top 30 words

pizza        12
would         7
video         6
games         5
clean         5
one           5
place         4
dont          4
salad         4
said          3
like          3
buy           3
give          3
take          3
manager       3
business      3
tokens        3
since         3
time          3
buffet        3
got           3
refund        3
run           3
really        3
still         2
hot           2
customers     2
fresh         2
work          2
price         2
dtype: int64

In [454]:
#lets create a list of words that are repeated but not adding value to the sentiment. This is very subjective
other_stop_words = []
'''for special situations, we may ignore our manually chosen words'''

'for special situations, we may ignore our manually chosen words'

In [455]:
#removing the words in the list above from review sentences

df['clean_review'] = df['no_stopwords'].apply(lambda x: " ".join(word for word in x.split() if word not in other_stop_words))
df

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,no_stopwords,clean_review
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,it's a thursday afternoon and this cici's is c...,its a thursday afternoon and this cicis is cle...,thursday afternoon cicis clean wellstocked run...,thursday afternoon cicis clean wellstocked run...
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,what a joke of a business surprise to see them...,what a joke of a business surprise to see them...,joke business surprise see still business ever...,joke business surprise see still business ever...
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,"love this place, i totally recommend to all pi...",love this place i totally recommend to all piz...,love place totally recommend pizza lovers come...,love place totally recommend pizza lovers come...
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone knows that this place has subpar pizz...,everyone knows that this place has subpar pizz...,everyone knows place subpar pizza sometimes pa...,everyone knows place subpar pizza sometimes pa...
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,i have not been to a cc pizza since i was in c...,i have not been to a cc pizza since i was in c...,cc pizza since college springfield missouri tw...,cc pizza since college springfield missouri tw...
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,i am so impressed with the management of this ...,i am so impressed with the management of this ...,impressed management shop greeted walked paid ...,impressed management shop greeted walked paid ...
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,"rediculous, spend $9 on buffet without drink a...",rediculous spend 9 on buffet without drink and...,rediculous spend 9 buffet without drink cant g...,rediculous spend 9 buffet without drink cant g...
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced for a very small buffet with me...,way overpriced for a very small buffet with me...,way overpriced small buffet mediocre pizza pas...,way overpriced small buffet mediocre pizza pas...
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza. this location is new and clean. t...,yummy pizza this location is new and clean the...,yummy pizza location new clean let request piz...,yummy pizza location new clean let request piz...
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,this place is discussing and nothing it clean ...,this place is discussing and nothing it clean ...,place discussing nothing clean dont wash trys ...,place discussing nothing clean dont wash trys ...


In [456]:
 #lets look at the number of words in the updated reviews

df['no_stopwords'].apply(lambda x: len(x.split()))

0     48
1     13
2     11
3    106
4     45
5     59
6     21
7     28
8     20
9     28
Name: no_stopwords, dtype: int64

at this point the data is ready

.


.

.

.

.

.

.

.

.

.


# Lemmatization


Lemmatization is a natural language processing (NLP) technique that involves reducing words to their base or root form. 
The base form is called a "lemma."

Lemmatization is often used in text processing and analysis to normalize words. It helps in reducing the dimensionality of the feature space and can improve the performance of tasks like text classification, sentiment analysis, and information retrieval.

In contrast to lemmatization, stemming is another technique that involves reducing words to their root or base form, but it may not always result in a valid word. Lemmatization tends to be more linguistically accurate as it considers the context and aims to produce valid words.

"running" → lemma: "run"
"better" → lemma: "good"
"mice" → lemma: "mouse"


We will use textblob to lemmatize

In [457]:
#again I didn't have the textblob package in my environment so had to install it from conda
from textblob import Word

TextBlob is a Python library for processing textual data. It provides a simple API for common natural language processing (NLP) tasks, such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. TextBlob is built on top of NLTK (Natural Language Toolkit) and Pattern libraries.

In [458]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/supg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [459]:
#we need to loop through each word and lemmatize them
df['lemmatized'] = df['clean_review'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))
df

#first we need to change each word into textblob word object

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,no_stopwords,clean_review,lemmatized
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,it's a thursday afternoon and this cici's is c...,its a thursday afternoon and this cicis is cle...,thursday afternoon cicis clean wellstocked run...,thursday afternoon cicis clean wellstocked run...,thursday afternoon cicis clean wellstocked run...
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,what a joke of a business surprise to see them...,what a joke of a business surprise to see them...,joke business surprise see still business ever...,joke business surprise see still business ever...,joke business surprise see still business ever...
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,"love this place, i totally recommend to all pi...",love this place i totally recommend to all piz...,love place totally recommend pizza lovers come...,love place totally recommend pizza lovers come...,love place totally recommend pizza lover come ...
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone knows that this place has subpar pizz...,everyone knows that this place has subpar pizz...,everyone knows place subpar pizza sometimes pa...,everyone knows place subpar pizza sometimes pa...,everyone know place subpar pizza sometimes par...
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,i have not been to a cc pizza since i was in c...,i have not been to a cc pizza since i was in c...,cc pizza since college springfield missouri tw...,cc pizza since college springfield missouri tw...,cc pizza since college springfield missouri tw...
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,i am so impressed with the management of this ...,i am so impressed with the management of this ...,impressed management shop greeted walked paid ...,impressed management shop greeted walked paid ...,impressed management shop greeted walked paid ...
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,"rediculous, spend $9 on buffet without drink a...",rediculous spend 9 on buffet without drink and...,rediculous spend 9 buffet without drink cant g...,rediculous spend 9 buffet without drink cant g...,rediculous spend 9 buffet without drink cant g...
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced for a very small buffet with me...,way overpriced for a very small buffet with me...,way overpriced small buffet mediocre pizza pas...,way overpriced small buffet mediocre pizza pas...,way overpriced small buffet mediocre pizza pas...
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza. this location is new and clean. t...,yummy pizza this location is new and clean the...,yummy pizza location new clean let request piz...,yummy pizza location new clean let request piz...,yummy pizza location new clean let request piz...
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,this place is discussing and nothing it clean ...,this place is discussing and nothing it clean ...,place discussing nothing clean dont wash trys ...,place discussing nothing clean dont wash trys ...,place discussing nothing clean dont wash try c...


In [460]:
df.drop(['lowercase','punctuation','no_stopwords','clean_review'], axis = 1, inplace = True)

Now its ready for sentiment analysis

In [461]:
df

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lemmatized
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,thursday afternoon cicis clean wellstocked run...
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,joke business surprise see still business ever...
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,love place totally recommend pizza lover come ...
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone know place subpar pizza sometimes par...
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,cc pizza since college springfield missouri tw...
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,impressed management shop greeted walked paid ...
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,rediculous spend 9 buffet without drink cant g...
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced small buffet mediocre pizza pas...
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza location new clean let request piz...
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,place discussing nothing clean dont wash try c...


.


.

.

.

.

.

.

.

.

# Sentiment Analysis

We will use TextBlob package from textblob to calculate sentiment analysis

TextBlob gives use 2 metrics:
1) Polarity : is a numerical value indicating the sentiment of the text. It ranges from -1 to 1, where -1 represents a negative sentiment, 1 represents a positive sentiment, and 0 represents a neutral sentiment.

2) Subjectivity: Subjectivity is a numerical value indicating the subjectiveness of the text. It ranges from 0 to 1, where 0 is very objective (factual) and 1 is very subjective (opinionated or emotional).

In [462]:
from textblob import TextBlob

In [463]:
df['lemmatized'].apply(lambda x: TextBlob(x).sentiment)
#the first value is polarity and later one is subjectivity

0       (0.2590277777777778, 0.6124999999999999)
1                                  (-0.2, 0.525)
2                     (0.3333333333333333, 0.65)
3    (-0.21153846153846154, 0.48131868131868133)
4       (0.4242424242424242, 0.6848484848484848)
5        (0.3070616883116883, 0.586323051948052)
6                     (-0.4, 0.8500000000000001)
7      (0.12916666666666665, 0.6041666666666666)
8       (0.2606060606060606, 0.5109090909090909)
9                   (-0.08666666666666666, 0.48)
Name: lemmatized, dtype: object

In [464]:
df['polarity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment[0])
df

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lemmatized,polarity
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,thursday afternoon cicis clean wellstocked run...,0.259028
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,joke business surprise see still business ever...,-0.2
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,love place totally recommend pizza lover come ...,0.333333
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone know place subpar pizza sometimes par...,-0.211538
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,cc pizza since college springfield missouri tw...,0.424242
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,impressed management shop greeted walked paid ...,0.307062
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,rediculous spend 9 buffet without drink cant g...,-0.4
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced small buffet mediocre pizza pas...,0.129167
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza location new clean let request piz...,0.260606
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,place discussing nothing clean dont wash try c...,-0.086667


In [465]:
df['subjectivity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment[1])
df

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lemmatized,polarity,subjectivity
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,thursday afternoon cicis clean wellstocked run...,0.259028,0.6125
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,joke business surprise see still business ever...,-0.2,0.525
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,love place totally recommend pizza lover come ...,0.333333,0.65
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone know place subpar pizza sometimes par...,-0.211538,0.481319
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,cc pizza since college springfield missouri tw...,0.424242,0.684848
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,impressed management shop greeted walked paid ...,0.307062,0.586323
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,rediculous spend 9 buffet without drink cant g...,-0.4,0.85
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced small buffet mediocre pizza pas...,0.129167,0.604167
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza location new clean let request piz...,0.260606,0.510909
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,place discussing nothing clean dont wash try c...,-0.086667,0.48


In [466]:
df.sort_values(by = 'polarity', ascending = False)

Unnamed: 0,reviews,word_count,char_count,average_word_length,stopword_count,stopword_rate,lemmatized,polarity,subjectivity
4,I have not been to a cc pizza since I was in c...,102,535,4.254902,56,0.54902,cc pizza since college springfield missouri tw...,0.424242,0.684848
2,"love this place, i totally recommend to all pi...",24,114,3.791667,13,0.541667,love place totally recommend pizza lover come ...,0.333333,0.65
5,I am so impressed with the management of this ...,131,673,4.145038,70,0.534351,impressed management shop greeted walked paid ...,0.307062,0.586323
8,Yummy pizza. This location is new and clean. T...,33,187,4.69697,12,0.363636,yummy pizza location new clean let request piz...,0.260606,0.510909
0,It's a thursday afternoon and this Cici's is c...,95,522,4.505263,46,0.484211,thursday afternoon cicis clean wellstocked run...,0.259028,0.6125
7,Way overpriced for a very small buffet with me...,48,282,4.895833,19,0.395833,way overpriced small buffet mediocre pizza pas...,0.129167,0.604167
9,This place is discussing and nothing it clean ...,53,283,4.358491,26,0.490566,place discussing nothing clean dont wash try c...,-0.086667,0.48
1,What a joke of a business surprise to see them...,24,122,4.125,11,0.458333,joke business surprise see still business ever...,-0.2,0.525
3,Everyone knows that this place has subpar pizz...,222,1203,4.364865,117,0.527027,everyone know place subpar pizza sometimes par...,-0.211538,0.481319
6,"Rediculous, spend $9 on buffet without drink a...",35,191,4.457143,13,0.371429,rediculous spend 9 buffet without drink cant g...,-0.4,0.85


# Based on this, we can identify sentiments of reviews