In [25]:
import pandas as pd

In [26]:
# Raw Data
reviews = [
    'Wow!!! 😍 This product exceeded my expectations—amazing quality, super fast delivery 🚚💨, and the packaging was so thoughtful 🎁✨',
    'The product quality is excellent 😊, and the delivery was super fast!!! 🚚✨',
    'I had a terrible experience 😡... the item arrived damaged 😢, and customer service was unhelpful. 😤',
    'Not as described—very disappointed with the size and color.',
    'The app is user-friendly and intuitive 😊, but it crashes occasionally... 😕💻',
]

# Cover to DataFrame.
df = pd.DataFrame(reviews, columns = ['Review'])

In [27]:
# Convert into lower-case.
df['Review_Lowercase'] =  df['Review'].str.lower()

In [28]:
df

Unnamed: 0,Review,Review_Lowercase
0,Wow!!! 😍 This product exceeded my expectations...,wow!!! 😍 this product exceeded my expectations...
1,"The product quality is excellent 😊, and the de...","the product quality is excellent 😊, and the de..."
2,I had a terrible experience 😡... the item arri...,i had a terrible experience 😡... the item arri...
3,Not as described—very disappointed with the si...,not as described—very disappointed with the si...
4,"The app is user-friendly and intuitive 😊, but ...","the app is user-friendly and intuitive 😊, but ..."


#### Removing Punctuation and Emojis

##### Goal: Clean out unnecessary characters like punctuation and emojis to simplify analysis

In [29]:
import re
df['Review_NO_Punct_Emoji'] = df['Review_Lowercase'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [30]:
df

Unnamed: 0,Review,Review_Lowercase,Review_NO_Punct_Emoji
0,Wow!!! 😍 This product exceeded my expectations...,wow!!! 😍 this product exceeded my expectations...,wow this product exceeded my expectationsamaz...
1,"The product quality is excellent 😊, and the de...","the product quality is excellent 😊, and the de...",the product quality is excellent and the deli...
2,I had a terrible experience 😡... the item arri...,i had a terrible experience 😡... the item arri...,i had a terrible experience the item arrived ...
3,Not as described—very disappointed with the si...,not as described—very disappointed with the si...,not as describedvery disappointed with the siz...
4,"The app is user-friendly and intuitive 😊, but ...","the app is user-friendly and intuitive 😊, but ...",the app is userfriendly and intuitive but it ...


In [31]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load stopwords list
stop_words = set(stopwords.words('english'))

# Function to remove stop_words.
def remove_stop_words(text):
    # Tokenize the text.
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    return ' '.join(filtered_tokens) # filtered_tokens is a list, we are
    
# Apply 'remove_stop_words' function to the column
df['NO_Stopwords'] = df['Review_NO_Punct_Emoji'].apply(remove_stop_words) # apply(): Applies function to each row of a column.

[nltk_data] Downloading package stopwords to C:\Users\Sohail
[nltk_data]     Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
df

Unnamed: 0,Review,Review_Lowercase,Review_NO_Punct_Emoji,NO_Stopwords
0,Wow!!! 😍 This product exceeded my expectations...,wow!!! 😍 this product exceeded my expectations...,wow this product exceeded my expectationsamaz...,wow product exceeded expectationsamazing quali...
1,"The product quality is excellent 😊, and the de...","the product quality is excellent 😊, and the de...",the product quality is excellent and the deli...,product quality excellent delivery super fast
2,I had a terrible experience 😡... the item arri...,i had a terrible experience 😡... the item arri...,i had a terrible experience the item arrived ...,terrible experience item arrived damaged custo...
3,Not as described—very disappointed with the si...,not as described—very disappointed with the si...,not as describedvery disappointed with the siz...,describedvery disappointed size color
4,"The app is user-friendly and intuitive 😊, but ...","the app is user-friendly and intuitive 😊, but ...",the app is userfriendly and intuitive but it ...,app userfriendly intuitive crashes occasionally


#### Stemming

In [33]:
from nltk.stem import PorterStemmer
def text_stemmer(text):
    stemmer = PorterStemmer()
    # Tokenize Text.
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(stemmed_tokens)

# Apply stem_text to the column
df['Stemmed_Text'] = df['NO_Stopwords'].apply(text_stemmer)

In [34]:
df

Unnamed: 0,Review,Review_Lowercase,Review_NO_Punct_Emoji,NO_Stopwords,Stemmed_Text
0,Wow!!! 😍 This product exceeded my expectations...,wow!!! 😍 this product exceeded my expectations...,wow this product exceeded my expectationsamaz...,wow product exceeded expectationsamazing quali...,wow product exceed expectationsamaz qualiti su...
1,"The product quality is excellent 😊, and the de...","the product quality is excellent 😊, and the de...",the product quality is excellent and the deli...,product quality excellent delivery super fast,product qualiti excel deliveri super fast
2,I had a terrible experience 😡... the item arri...,i had a terrible experience 😡... the item arri...,i had a terrible experience the item arrived ...,terrible experience item arrived damaged custo...,terribl experi item arriv damag custom servic ...
3,Not as described—very disappointed with the si...,not as described—very disappointed with the si...,not as describedvery disappointed with the siz...,describedvery disappointed size color,describedveri disappoint size color
4,"The app is user-friendly and intuitive 😊, but ...","the app is user-friendly and intuitive 😊, but ...",the app is userfriendly and intuitive but it ...,app userfriendly intuitive crashes occasionally,app userfriendli intuit crash occasion
