# Imported required libraries

In [7]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

## Downloaded NLTK resources

In [8]:
# Download the NLTK resources this notebook relies on: tokenizer models,
# the stop-word corpus and WordNet (used by WordNetLemmatizer).
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on
# NLTK >= 3.8.2; 'punkt' is kept for older NLTK releases. Requesting a
# resource that a given NLTK version does not know about only reports an
# error and returns False, so listing both is safe.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loaded the dataset and printed the first rows

In [9]:
# Read the review export into a DataFrame, then show its first five rows.
data = pd.read_csv(filepath_or_buffer="reviews.csv")

print(data.head(n=5))

                               reviewId       userName  \
0  0197c118-5c6f-4a7b-894c-970023d1a350        Mar Zur   
1  94868fb5-a21d-4ef9-ab85-81b2ed3d0785   Devin Rivera   
2  825da34e-f65d-4ef3-991d-02d5291820d6  Heidi Kinsley   
3  a49c2875-651a-4c33-b79c-5813780d659e  Daniel Keller   
4  9482c75e-2e63-46ab-8c94-47273dd6a829  A Google user   

                                           userImage  \
0  https://play-lh.googleusercontent.com/a/ACg8oc...   
1  https://play-lh.googleusercontent.com/a-/ALV-U...   
2  https://play-lh.googleusercontent.com/a/ACg8oc...   
3  https://play-lh.googleusercontent.com/a/ACg8oc...   
4  https://play-lh.googleusercontent.com/EGemoI2N...   

                                             content  thumbsUpCount  \
0  I have the same recurring tasks to do every da...             11   
1  Instead of shopping around, I downloaded Any.d...              8   
2  Why does every once in a while... out of the b...              6   
3  Terrible Update! This app u

## Converted The Content Into Lower Case

In [13]:
# Lower-case every review.
# Missing reviews are replaced with an empty string and every value is cast
# to str *before* lower-casing: otherwise `.str.lower()` leaves NaN in place
# (and yields NaN for non-string cells), and the later `astype(str)` call
# would turn those into the literal word "nan" in the cleaned text.
data['content'] = data['content'].fillna('').astype(str).str.lower()

##  Removed the Links

In [15]:
# Strip links: delete every run of non-whitespace characters starting with "http".
data['content'] = data['content'].astype(str).map(
    lambda review: re.sub(r'http\S+', '', review)
)

## Removed line breaks (\n)

In [16]:
# Replace line breaks with a space instead of deleting them: deleting would
# glue the last word of one line to the first word of the next
# ("great app\nlove it" -> "great applove it"). Any surplus spaces are
# collapsed by the whitespace-normalisation step further down.
# regex=False makes the literal match explicit across pandas versions.
data['content'] = data['content'].str.replace('\n', ' ', regex=False)

## Removed the words that contain numbers

In [17]:
# Delete every word (run of \w characters between word boundaries) that
# contains at least one digit.
data['content'] = data['content'].map(
    lambda review: re.sub(r'\b\w*\d\w*\b', '', review)
)

## Removed the extra spaces

In [18]:
# Normalise whitespace: split on any whitespace run and re-join with single
# spaces, which also trims leading/trailing whitespace.
data['content'] = data['content'].map(lambda review: ' '.join(review.split()))

## Removed the special characters

In [20]:
def remove_special_characters(text):
    """Return ``text`` with every character that is not an ASCII letter,
    a digit or whitespace removed (characters are deleted, not replaced)."""
    unwanted_chars = re.compile(r'[^a-zA-Z0-9\s]')
    return unwanted_chars.sub('', text)

# Run the special-character filter over every review.
data['content'] = data['content'].map(remove_special_characters)

## Removed stop words

In [21]:
# Drop stop words from every review with gensim's `remove_stopwords`
# (imported with the other libraries in the notebook's import cell).
data['content'] = data['content'].apply(remove_stopwords)

## Stemming

In [25]:
# Single shared stemmer instance used by stem_text.
porter = PorterStemmer()


def stem_text(text):
    """Tokenise ``text`` with ``word_tokenize``, stem each token with the
    Porter stemmer and return the stems joined by single spaces."""
    return ' '.join(map(porter.stem, word_tokenize(text)))

# Stem every review.
data['content'] = data['content'].map(stem_text)

## Lemmatization

In [26]:
# Single shared lemmatizer instance used by lemmatize_text.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenise ``text`` with ``word_tokenize``, lemmatize each token with
    ``lemmatizer.lemmatize`` (default arguments) and return the results
    joined by single spaces."""
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Lemmatize every review.
# NOTE(review): at this point the text has already been Porter-stemmed, so the
# lemmatizer receives stems rather than dictionary words; confirm that running
# both steps in this order is intended.
data['content'] = data['content'].map(lemmatize_text)

## Printed the "content" data after implementing data preprocessing techniques

In [28]:
# Show the cleaned review text (pandas truncates the printed Series).
print(data.loc[:, 'content'])

0        recur task day need todolist remind buzz time ...
1        instead shop download anydo wide set day sched...
2        blue app ask updat acct email task list lost t...
3        terribl updat app perfect plan certain task co...
4        app deceivingli terribl nice design featur lik...
                               ...                        
16782                                            excel app
16783    love easi use life organ love way photo locat ...
16784                    love plan check app everyday love
16785                                         exactli need
16786                                                 good
Name: content, Length: 16787, dtype: object
