# Importing the required modules

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

# Getting the file

In [2]:
df = pd.read_csv(r'F:\anu\infosys_internship\reviews.csv')

df.head(2)

Unnamed: 0,reviewId,userName,userImage,content,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId
0,0197c118-5c6f-4a7b-894c-970023d1a350,Mar Zur,https://play-lh.googleusercontent.com/a/ACg8oc...,I have the same recurring tasks to do every da...,11,4.16.6.2,22-07-2020 13:13,Our team will be happy to look into it for you...,23-07-2020 16:32,4.16.6.2,most_relevant,com.anydo
1,94868fb5-a21d-4ef9-ab85-81b2ed3d0785,Devin Rivera,https://play-lh.googleusercontent.com/a-/ALV-U...,"Instead of shopping around, I downloaded Any.d...",8,,08-12-2020 06:24,We are not aware of any issues with randomized...,10-12-2020 09:38,,most_relevant,com.anydo


In [3]:
print(df.shape)

print(df.info())

(16787, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16787 entries, 0 to 16786
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              16787 non-null  object
 1   userName              16787 non-null  object
 2   userImage             16787 non-null  object
 3   content               16787 non-null  object
 4   thumbsUpCount         16787 non-null  int64 
 5   reviewCreatedVersion  14430 non-null  object
 6   at                    16787 non-null  object
 7   replyContent          9168 non-null   object
 8   repliedAt             9168 non-null   object
 9   appVersion            14430 non-null  object
 10  sortOrder             16787 non-null  object
 11  appId                 16787 non-null  object
dtypes: int64(1), object(11)
memory usage: 1.5+ MB
None


# Converting into lower case

In [4]:
df['content']=df['content'].str.lower()

# Removing links

In [5]:
df['content'] = df['content'].fillna('')

df['content'] = df['content'].apply(lambda x: re.sub(r"http\S+", "", x))

# Remove next lines

In [6]:
 df['content'] = df['content'].str.replace('\n',' ')

# Removing words containing numbers

In [7]:
pattern = r'\b\w*\d\w*\b'

def remove_words_containing_numbers(text):
    return re.sub(pattern, '', text)

df['content'] = df['content'].apply(remove_words_containing_numbers)

# Removing extra spaces

In [8]:
df['content'] = df['content'].apply(lambda x: ' '.join(x.split()))

# Removing special characters 

In [9]:
df['content'] = df['content'].apply(lambda x: re.sub('[^A-Za-z0-9]+', '', x))

# Removing stop words

In [10]:
# applying the process of removing the stop words
stop_words = set(stopwords.words('english'))

def removing_stopwords(text):
    tokens = word_tokenize(text)
    text = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(text)

df['content'] = df['content'].apply(removing_stopwords)

# Stemming process

In [11]:
stemmer = PorterStemmer()
# applying the process

def stemming_process(word):
    return stemmer.stem(word)

#calling the function
df['content'] = df['content'].apply(lambda x: ' '.join(stemming_process(word) for word in x.split()))

# Lemmatization

In [12]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
lemmatizer = WordNetLemmatizer()
df['content'] = df['content'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

In [14]:
df.head(3)

Unnamed: 0,reviewId,userName,userImage,content,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId
0,0197c118-5c6f-4a7b-894c-970023d1a350,Mar Zur,https://play-lh.googleusercontent.com/a/ACg8oc...,ihavethesamerecurringtaskstodoeverydayineedato...,11,4.16.6.2,22-07-2020 13:13,Our team will be happy to look into it for you...,23-07-2020 16:32,4.16.6.2,most_relevant,com.anydo
1,94868fb5-a21d-4ef9-ab85-81b2ed3d0785,Devin Rivera,https://play-lh.googleusercontent.com/a-/ALV-U...,insteadofshoppingaroundidownloadedanydobecause...,8,,08-12-2020 06:24,We are not aware of any issues with randomized...,10-12-2020 09:38,,most_relevant,com.anydo
2,825da34e-f65d-4ef3-991d-02d5291820d6,Heidi Kinsley,https://play-lh.googleusercontent.com/a/ACg8oc...,whydoeseveryonceinawhileoutofthebluetheappasks...,6,5.11.1.2,09-07-2021 13:51,Sorry to hear that! It sounds like you might h...,11-07-2021 11:16,5.11.1.2,most_relevant,com.anydo
