In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Let's import the dataset

In [3]:
# Read the raw reviews from 'reviews.csv' (path is relative to the working directory)
# and display the first rows as a quick sanity check.
reviews = pd.read_csv('reviews.csv')
reviews.head()

Unnamed: 0,reviewId,userName,userImage,content,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId
0,0197c118-5c6f-4a7b-894c-970023d1a350,Mar Zur,https://play-lh.googleusercontent.com/a/ACg8oc...,I have the same recurring tasks to do every da...,11,4.16.6.2,22-07-2020 13:13,Our team will be happy to look into it for you...,23-07-2020 16:32,4.16.6.2,most_relevant,com.anydo
1,94868fb5-a21d-4ef9-ab85-81b2ed3d0785,Devin Rivera,https://play-lh.googleusercontent.com/a-/ALV-U...,"Instead of shopping around, I downloaded Any.d...",8,,08-12-2020 06:24,We are not aware of any issues with randomized...,10-12-2020 09:38,,most_relevant,com.anydo
2,825da34e-f65d-4ef3-991d-02d5291820d6,Heidi Kinsley,https://play-lh.googleusercontent.com/a/ACg8oc...,Why does every once in a while... out of the b...,6,5.11.1.2,09-07-2021 13:51,Sorry to hear that! It sounds like you might h...,11-07-2021 11:16,5.11.1.2,most_relevant,com.anydo
3,a49c2875-651a-4c33-b79c-5813780d659e,Daniel Keller,https://play-lh.googleusercontent.com/a/ACg8oc...,Terrible Update! This app used to be perfect f...,5,,16-11-2020 01:50,Please note that the tasks in your tasks view ...,17-11-2020 09:31,,most_relevant,com.anydo
4,9482c75e-2e63-46ab-8c94-47273dd6a829,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,This app is deceivingly terrible. There are so...,20,4.14.0.4,31-01-2019 16:19,"Hi Ryan, it sounds like you are describing our...",05-02-2019 11:52,4.14.0.4,most_relevant,com.anydo


### Overview of the dataset

In [4]:
# Drop rows whose 'content' value is missing; the filtered frame replaces `reviews`.
reviews = reviews.dropna(subset=['content'], how='all')
# Print the column dtypes and non-null counts of the filtered frame.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16786 entries, 0 to 16786
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              16786 non-null  object
 1   userName              16786 non-null  object
 2   userImage             16786 non-null  object
 3   content               16786 non-null  object
 4   thumbsUpCount         16786 non-null  int64 
 5   reviewCreatedVersion  14429 non-null  object
 6   at                    16786 non-null  object
 7   replyContent          9167 non-null   object
 8   repliedAt             9167 non-null   object
 9   appVersion            14429 non-null  object
 10  sortOrder             16786 non-null  object
 11  appId                 16786 non-null  object
dtypes: int64(1), object(11)
memory usage: 1.7+ MB


## Starting the preprocessing pipeline for the data

### 1. Lowercasing

In [5]:
# Build a lowercased Series from the 'content' column; every later cleaning step
# works on `contents`, not on the `reviews['content']` column.
contents = reviews['content'].str.lower()
contents.head()

0    i have the same recurring tasks to do every da...
1    instead of shopping around, i downloaded any.d...
2    why does every once in a while... out of the b...
3    terrible update! this app used to be perfect f...
4    this app is deceivingly terrible. there are so...
Name: content, dtype: object

### 2. Removing Hyperlinks

In [6]:
import re
def remove_hyperlinks(text):
    """Return ``text`` with http(s):// and www. links stripped out.

    Every match is replaced by the empty string; whitespace around the
    link is left in place.
    """
    url_regex = re.compile(r'\bhttps?://\S+\b|\bwww\.\S+\b')
    return url_regex.sub('', text)

In [7]:
# Run remove_hyperlinks on every review; the result replaces `contents`.
contents = contents.apply(remove_hyperlinks)

### 3. Removing newline characters ('\n')

In [8]:
# regex=True makes this a substring replacement inside each string, so every
# '\n' within a review becomes a single space.
contents = contents.replace('\n', ' ', regex=True)

### 4. Removing words containing numbers

In [9]:
def remove_words_with_numbers(text):
    """Delete every run of word characters that contains at least one digit.

    Surrounding whitespace and punctuation are not touched, so gaps may
    remain where a word was removed.
    """
    digit_word = re.compile(r'\b\w*\d\w*\b')
    cleaned = digit_word.sub('', text)
    return cleaned

In [10]:
# Run remove_words_with_numbers on every review; the result replaces `contents`.
contents = contents.apply(remove_words_with_numbers)

### 5. Removing extra spaces

In [11]:
def remove_extra_spaces(text):
    """Collapse each run of whitespace in ``text`` into one space character.

    Leading and trailing whitespace is collapsed as well, not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [12]:
# Run remove_extra_spaces on every review; the result replaces `contents`.
contents = contents.apply(remove_extra_spaces)

### 6. Removing special characters

In [13]:
def remove_special_characters(text):
    """Keep only ASCII letters, digits and whitespace; drop everything else.

    Note that apostrophes are dropped too, so "don't" becomes "dont".
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    stripped = disallowed.sub('', text)
    return stripped

In [14]:
# Run remove_special_characters on every review; the result replaces `contents`.
contents = contents.apply(remove_special_characters)

## First Part of our pipeline is complete.

##### However, our data still contains many redundant words and inconsistent word forms. They will be cleaned in the second part of our pipeline.

##### We need to tokenize the contents in order to proceed with the process.

In [15]:
def create_tokens(text):
    """Split ``text`` on whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

In [16]:
# From this cell on, each entry of `contents` is a list of tokens instead of a string.
contents = contents.apply(create_tokens)

### 7. Removal of Stop words

In [17]:
from nltk.corpus import stopwords

In [18]:
# Cache of the English stop-word set, filled on the first call to remove_stop_words.
_ENGLISH_STOP_WORDS = None


def remove_stop_words(text):
    """Return the tokens of ``text`` that are not English stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of one review.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in
        their original order.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Load the word list once. The original evaluated
        # stopwords.words('english') for every token, which re-reads the
        # list and scans it linearly each time.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return [word for word in text if word not in _ENGLISH_STOP_WORDS]

In [19]:
# Filter stop words out of every token list; stored under a new name, so
# `contents` keeps the unfiltered tokens.
no_stops = contents.apply(remove_stop_words)

### 8. Stemming

In [20]:
from nltk.stem import PorterStemmer
# Single Porter stemmer instance, used by stem_text.
stemmer = PorterStemmer()

In [21]:
def stem_text(tokens):
    """Return a list with the stem of every token, in the original order.

    Stemming is delegated to the notebook-level ``stemmer`` object.
    """
    return list(map(stemmer.stem, tokens))

In [22]:
# Overwrites `no_stops` with the stemmed tokens, so re-running this cell alone
# would stem tokens that are already stemmed.
no_stops = no_stops.apply(stem_text)

### 9. Lemmatization

In [23]:
from nltk.stem import WordNetLemmatizer
# Single WordNet lemmatizer instance, used by lemmatize_text.
lemmatizer = WordNetLemmatizer()

In [24]:
def lemmatize_text(words):
    """Return a list with the lemma of every word, in the original order.

    Lemmatization is delegated to the notebook-level ``lemmatizer`` object,
    called with its default arguments.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [25]:
# Lemmatize the token lists held in `no_stops` (already stemmed by the previous step).
lemmatized = no_stops.apply(lemmatize_text)

##### Now our pipeline is complete. Let's rejoin the contents.

In [27]:
def rejoin(words):
    """Concatenate the tokens into one string, separated by single spaces."""
    separator = ' '
    return separator.join(words)

In [29]:
# Turn each token list back into a single string.
clean_texts = lemmatized.apply(rejoin)
# Store the cleaned text as a new column; the original 'content' column is kept.
reviews['clean_content'] = clean_texts
reviews.head()

Unnamed: 0,reviewId,userName,userImage,content,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId,clean_content
0,0197c118-5c6f-4a7b-894c-970023d1a350,Mar Zur,https://play-lh.googleusercontent.com/a/ACg8oc...,I have the same recurring tasks to do every da...,11,4.16.6.2,22-07-2020 13:13,Our team will be happy to look into it for you...,23-07-2020 16:32,4.16.6.2,most_relevant,com.anydo,recur task everi day need todolist remind buzz...
1,94868fb5-a21d-4ef9-ab85-81b2ed3d0785,Devin Rivera,https://play-lh.googleusercontent.com/a-/ALV-U...,"Instead of shopping around, I downloaded Any.d...",8,,08-12-2020 06:24,We are not aware of any issues with randomized...,10-12-2020 09:38,,most_relevant,com.anydo,instead shop around download anydo wide use se...
2,825da34e-f65d-4ef3-991d-02d5291820d6,Heidi Kinsley,https://play-lh.googleusercontent.com/a/ACg8oc...,Why does every once in a while... out of the b...,6,5.11.1.2,09-07-2021 13:51,Sorry to hear that! It sounds like you might h...,11-07-2021 11:16,5.11.1.2,most_relevant,com.anydo,everi blue app ask updat acct email everyth ta...
3,a49c2875-651a-4c33-b79c-5813780d659e,Daniel Keller,https://play-lh.googleusercontent.com/a/ACg8oc...,Terrible Update! This app used to be perfect f...,5,,16-11-2020 01:50,Please note that the tasks in your tasks view ...,17-11-2020 09:31,,most_relevant,com.anydo,terribl updat app use perfect plan certain tas...
4,9482c75e-2e63-46ab-8c94-47273dd6a829,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,This app is deceivingly terrible. There are so...,20,4.14.0.4,31-01-2019 16:19,"Hi Ryan, it sounds like you are describing our...",05-02-2019 11:52,4.14.0.4,most_relevant,com.anydo,app deceivingli terribl realli nice design fea...


##### Now, Exporting to a new CSV file

In [30]:
# Write the frame, including the new 'clean_content' column, to 'reviews_cleaned.csv';
# index=False keeps the DataFrame index out of the file.
reviews.to_csv('reviews_cleaned.csv', index=False)