In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import ast

In [2]:
#Downloading using nltk packages
nltk.download('punkt')
nltk.download('average_perceptron_tagger')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\divit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading average_perceptron_tagger: Package
[nltk_data]     'average_perceptron_tagger' not found in index
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\divit\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Reading the data file
df = pd.read_csv(r'D:/Sentiment Analysis/reviews.csv')

In [4]:
#Converting datatype to string
df['content']=df['content'].astype(str)

**#Lower Case**

In [5]:
df['content']=df['content'].apply(lambda x:" ".join(x.lower() for x in x.split()))

**#Remove Links**

In [6]:
import re
url_pattern = re.compile(r'https?://\S+|www\.\S+')

In [7]:
def remove_urls(text):
    return url_pattern.sub(r'',text)

In [8]:
df['content']=df['content'].apply(remove_urls)

**#Remove Next Lines**

In [9]:
df['content']=df['content'].replace('\n',' ',regex=True)

**#Words Containing Numbers**

In [10]:
def remove_num(text):
    return re.sub('\S*\d\S*','',text).strip()

In [11]:
df['content']=df['content'].apply(remove_num)

**#Extra Spaces**

In [12]:
def extra_spaces(text):
    return re.sub(' +',' ',text)

In [13]:
df['content']=df['content'].apply(extra_spaces)

**#Special Characters**

In [14]:
df['content']=df['content'].replace(r'[^A-Za-z0-9]+',' ',regex=True)

**#Removal of Stop Words**

In [15]:
stop_words = set(stopwords.words('english'))
def remove_stop(x):
    return ' '.join([word for word in str(x).split() if word not in stop_words])

In [16]:
df['content']=df['content'].apply(lambda x: remove_stop(x))

**#Stemming**

In [17]:
from nltk.stem.porter import PorterStemmer

In [18]:
stemmer=PorterStemmer()
def stem_words(text):
    word_tokens=text.split()
    stems=' '.join([stemmer.stem(word) for word in word_tokens])
    return stems

In [19]:
df['content']=df['content'].apply(stem_words)

**#Lemmatization**

In [20]:
from nltk.stem import WordNetLemmatizer

In [21]:
lemmatizer=WordNetLemmatizer()
def lemmatize_word(text):
    word_tokens=text.split()
    lemmas=' '.join([lemmatizer.lemmatize(word,pos='v') for word in word_tokens]) #v for verb
    return lemmas

In [22]:
df['content']=df['content'].apply(lemmatize_word)

In [23]:
df.head(10)

Unnamed: 0,reviewId,userName,userImage,content,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId
0,0197c118-5c6f-4a7b-894c-970023d1a350,Mar Zur,https://play-lh.googleusercontent.com/a/ACg8oc...,recur task everi day need list remind buzz tim...,11,4.16.6.2,22-07-2020 13:13,Our team will be happy to look into it for you...,23-07-2020 16:32,4.16.6.2,most_relevant,com.anydo
1,94868fb5-a21d-4ef9-ab85-81b2ed3d0785,Devin Rivera,https://play-lh.googleusercontent.com/a-/ALV-U...,instead shop around download wide use set one ...,8,,08-12-2020 06:24,We are not aware of any issues with randomized...,10-12-2020 09:38,,most_relevant,com.anydo
2,825da34e-f65d-4ef3-991d-02d5291820d6,Heidi Kinsley,https://play-lh.googleusercontent.com/a/ACg8oc...,everi blue app ask updat acct email everyth ta...,6,5.11.1.2,09-07-2021 13:51,Sorry to hear that! It sounds like you might h...,11-07-2021 11:16,5.11.1.2,most_relevant,com.anydo
3,a49c2875-651a-4c33-b79c-5813780d659e,Daniel Keller,https://play-lh.googleusercontent.com/a/ACg8oc...,terribl updat app use perfect plan certain tas...,5,,16-11-2020 01:50,Please note that the tasks in your tasks view ...,17-11-2020 09:31,,most_relevant,com.anydo
4,9482c75e-2e63-46ab-8c94-47273dd6a829,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,app deceivingli terribl realli nice design fea...,20,4.14.0.4,31-01-2019 16:19,"Hi Ryan, it sounds like you are describing our...",05-02-2019 11:52,4.14.0.4,most_relevant,com.anydo
5,6446c87d-e045-4325-b154-042cde863d74,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,smart groceri list hell might handi sort list ...,19,4.12.0.5,17-01-2019 00:09,Please note that you can turn off the smart li...,30-01-2019 15:18,4.12.0.5,most_relevant,com.anydo
6,afe06e54-1014-49fa-80a6-f2b74880ef7c,rainrunner13,https://play-lh.googleusercontent.com/a-/ALV-U...,wunderlist everyth need issu app replac tri ke...,6,,30-04-2020 06:46,"Any.do is a task management app, which has no ...",03-05-2020 10:27,,most_relevant,com.anydo
7,8256ea78-31ef-4eb4-abb0-1411d11a7d9e,Syd Stoll,https://play-lh.googleusercontent.com/a/ACg8oc...,app almost year realli enjoy mostli use grocer...,84,5.15.2.1,07-11-2021 15:58,This could happen if a user pressed on an adve...,08-11-2021 14:36,5.15.2.1,most_relevant,com.anydo
8,775c3273-36cc-4787-a126-06ec46834507,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,almost perfect develop choos ignor one import ...,3,,07-07-2019 15:10,Popups have been proven to increase productivi...,09-07-2019 11:09,,most_relevant,com.anydo
9,9f47e332-2c56-426c-8430-ce8ffde55706,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,annoy keep sign delet list ive work master sho...,29,4.15.8.11,29-09-2019 16:03,"Hi, that's odd, please send us a bug report in...",02-10-2019 19:51,4.15.8.11,most_relevant,com.anydo
