<a href="https://colab.research.google.com/github/rnabilahusna/MP_ProductReview/blob/main/MP2_sentiment_analysis_skincare_product_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **1.0 Mount Drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## **2.0 Import Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
%matplotlib inline
sns.set_style("whitegrid")

#set warning
import warnings
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public `pd.set_option` entry point rather than the incidental
# `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

## **3.0 Load Data**

In [9]:
# Location of the concatenated raw review export on the mounted Google Drive
concatenated_file_path = '/content/drive/MyDrive/Colab Notebooks/MP1_ProductReview/concatenated_data_new.csv'

# Read the raw reviews; `amazon_reviews` is kept as the untouched original
amazon_reviews = pd.read_csv(concatenated_file_path)

# Later cells work on `df`, an independent copy, so the raw frame is never modified
df = amazon_reviews.copy()

In [10]:
# Output data
df.head(5)

Unnamed: 0,product,date,title,rating,body,helpful_count
0,Amazon.com: Customer reviews: CeraVe Foaming F...,28/07/2021,1.0 out of 5 stars\nthere’s no way these revie...,1.0,I bought this because I’ve had really oily ski...,2
1,Amazon.com: Customer reviews: CeraVe Foaming F...,23/07/2023,1.0 out of 5 stars\nNot paraben free,1.0,I don’t understand how these brand name compan...,3
2,Amazon.com: Customer reviews: CeraVe Foaming F...,03/03/2022,1.0 out of 5 stars\nCaused reaction- proceed w...,1.0,Bought this for my child to use as a face wash...,10
3,Amazon.com: Customer reviews: CeraVe Foaming F...,28/06/2020,1.0 out of 5 stars\nSlimey,1.0,"So, you know that slime you make when you mix ...",6
4,Amazon.com: Customer reviews: CeraVe Foaming F...,10/12/2023,1.0 out of 5 stars\nProducto abierto,1.0,El producto venía abierto y quedó todo el líqu...,0


## **4.0 Data Exploration**

### **4.1 Dataset Shape**

In [13]:
# Dataset shape (row, column)
df.shape

(15641, 6)

### **4.2 Missing Values**

In [12]:
# Check for missing values
df.isna().sum()

product           0
date              0
title             0
rating            0
body             60
helpful_count     0
dtype: int64

### **4.3 Duplicated Rows**

In [None]:
# Check for duplicate rows
num_duplicate_rows = df.duplicated().sum()
num_duplicate_rows

## **5.0 Data Preparation**

### **5.1 Rename Column**

In [None]:
# Give every raw column a reader-friendly name in a single pass
column_labels = {
    'product': 'Product',
    'date': 'Date',
    'title': 'Title',
    'rating': 'Rating',
    'body': 'Review',
    'helpful_count': 'Helpful Count',
}
df.rename(columns=column_labels, inplace=True)

### **5.2 Add New Columns - Skin Concern, Brand, Product Type**

In [None]:
# Product metadata keyed by the scraped Amazon page title. Each entry is
# (skin concern, brand, product type); replace these values with your actual
# mapping. The three lookup dictionaries used by the mapping step are derived
# from this one table, so their key sets are always identical.
product_catalog = {
    # acne-oily-skin
    'Amazon.com: Customer reviews: CeraVe Foaming Facial Cleanser | Daily Face Wash for Oily Skin with Hyaluronic Acid, Ceramides, and Niacinamide| Fragrance Free Paraben Free | 19 Fluid Ounce': ('acne and oily', 'CeraVe', 'Cleanser'),
    'Amazon.com: Customer reviews: La Roche-Posay Effaclar Purifying Foaming Gel Cleanser for Oily Skin, pH Balancing Daily Face Wash, Oil Free and Soap Free, 13.52 Fl Oz (Pack of 1)': ('acne and oily', 'La Roche-Posay', 'Cleanser'),
    'Amazon.com: Customer reviews: CeraVe Salicylic Acid Acne Treatment with Glycolic Acid and Lactic Acid | AHA/BHA Acne Gel for Face to Control and Clear Breakouts |1.35 Ounce': ('acne and oily', 'CeraVe', 'Cream'),
    'Amazon.com: Customer reviews: La Roche-Posay Effaclar Duo Dual Action Acne Spot Treatment Cream with Benzoyl Peroxide Acne Treatment for Acne and Blackheads, Lightweight Sheerness, Safe For Sensitive Skin ,0.7 Fl Oz': ('acne and oily', 'La Roche-Posay', 'Cream'),
    'Amazon.com: Customer reviews: CeraVe AM Facial Moisturizing Lotion SPF 30 | Oil-Free Face Moisturizer with Sunscreen | Non-Comedogenic | 3 Ounce': ('acne and oily', 'CeraVe', 'Moisturizer'),
    'Amazon.com: Customer reviews: La Roche-Posay Mat Oil-Free Mattifying Moisturizer': ('acne and oily', 'La Roche-Posay', 'Moisturizer'),
    'Amazon.com: Customer reviews: CeraVe Retinol Serum for Post-Acne Marks and Skin Texture | Pore Refining, Resurfacing, Brightening Facial Serum with Retinol and Niacinamide | Fragrance Free, Paraben Free &amp; Non-Comedogenic| 1 Oz': ('acne and oily', 'CeraVe', 'Serum'),
    'Amazon.com: Customer reviews: La Roche-Posay Effaclar Salicylic Acid Acne Treatment to Minimize Pores, Clear Acne Blemishes and Post Acne Marks': ('acne and oily', 'La Roche-Posay', 'Serum'),

    # anti-aging skin
    'Amazon.com: Customer reviews: CeraVe Anti Aging Face Cream with SPF 30 Sunscreen | Anti Wrinkle Cream for Face with Retinol, SPF 30 Sunscreen, Hyaluronic Acid, and Ceramides | White,1.76 Ounce (Pack of 1)': ('anti-aging', 'CeraVe', 'Cream'),
    'Amazon.com: Customer reviews: La Roche-Posay Redermic R Anti Aging Retinol Cream, Reduces Wrinkles, Fine Lines, and Age Spots with Pure Retinol Face Cream, 1 Fl Oz': ('anti-aging', 'La Roche-Posay', 'Cream'),
    'Amazon.com: Customer reviews: CeraVe Eye Cream for Wrinkles | Under Eye Cream with Caffeine, Peptides, Hyaluronic Acid, Niacinamide, and Ceramides for Fine Lines | Fragrance Free &amp; Ophthalmologist Tested |0.5 Ounces': ('anti-aging', 'CeraVe', 'Eye Cream'),
    'Amazon.com: Customer reviews: La Roche-Posay Pigmentclar Dark Circles Eye Cream with Caffeine, Brightens Under Eye Area and Targets Dark Circles': ('anti-aging', 'La Roche-Posay', 'Eye Cream'),
    'Amazon.com: Customer reviews: CeraVe Anti Aging Retinol Serum | Cream Serum for Smoothing Fine Lines and Skin Brightening | With Retinol, Hyaluronic Acid, Niacinamide, and Ceramides | 1 Ounce': ('anti-aging', 'CeraVe', 'Serum'),
    'Amazon.com: Customer reviews: La Roche-Posay Pure Retinol Face Serum with Vitamin B3. Anti Aging Face Serum for Lines, Wrinkles &amp; Premature Sun Damage to Resurface &amp; Hydrate. Suitable for Sensitive Skin, 1.0 Fl. Oz': ('anti-aging', 'La Roche-Posay', 'Serum'),

    # dry skin
    'Amazon.com: Customer reviews: CeraVe Hydrating Facial Cleanser | Moisturizing Non-Foaming Face Wash with Hyaluronic Acid, Ceramides and Glycerin | Fragrance Free Paraben Free | 16 Fluid Ounce': ('dry', 'CeraVe', 'Cleanser'),
    'Amazon.com: Customer reviews: La Roche-Posay Toleriane Hydrating Gentle Face Cleanser, Daily Facial Cleanser with Niacinamide and Ceramides for Sensitive Skin, Moisturizing Face Wash for Normal to Dry Skin, Fragrance Free': ('dry', 'La Roche-Posay', 'Cleanser'),
    'Amazon.com: Customer reviews: CeraVe Daily Moisturizing Lotion for Dry Skin | Body Lotion &amp; Facial Moisturizer with Hyaluronic Acid and Ceramides | Fragrance Free | 19 Ounce': ('dry', 'CeraVe', 'Moisturizer'),
    'Amazon.com: Customer reviews: La Roche-Posay Toleriane Double Repair Face Moisturizer, Daily Moisturizer Face Cream with Ceramide and Niacinamide for All Skin Types, Oil Free, Fragrance Free': ('dry', 'La Roche-Posay', 'Moisturizer'),
    'Amazon.com: Customer reviews: Cerave Hyaluronic Acid Serum for Face with Vitamin B5 and Ceramides | Hydrating Face Serum for Dry Skin | Fragrance Free | 1 Ounce': ('dry', 'CeraVe', 'Serum'),
    'Amazon.com: Customer reviews: La Roche-Posay Hyalu B5 Pure Hyaluronic Acid Serum for Face, with Vitamin B5, Anti-Aging Serum for Fine Lines and Wrinkles, Hydrating Serum to Plump and Repair Dry Skin, Safe on Sensitive Skin': ('dry', 'La Roche-Posay', 'Serum'),
}

# Per-attribute lookups, in the same key order as the catalog
product_to_skin_concern = {title: info[0] for title, info in product_catalog.items()}
product_to_brand = {title: info[1] for title, info in product_catalog.items()}
product_to_product_type = {title: info[2] for title, info in product_catalog.items()}

# Derive the three descriptive columns from the scraped product title
df['Skin Concern'] = df['Product'].map(product_to_skin_concern)
df['Brand'] = df['Product'].map(product_to_brand)
df['Product Type'] = df['Product'].map(product_to_product_type)

# A title missing from the dictionaries maps to NaN, and rows containing NaN are
# removed by the dropna() in section 6.1. Report such titles here so reviews are
# not lost without anyone noticing.
unmapped_products = df.loc[df['Brand'].isna(), 'Product'].value_counts()
if not unmapped_products.empty:
    print(f"WARNING: {unmapped_products.sum()} review(s) belong to {len(unmapped_products)} unmapped product title(s):")
    print(unmapped_products)


### **5.3 Drop Unnecessary Columns**

In [None]:
# Remove the columns that play no part in the analysis, in one call
df.drop(columns=['Product', 'Helpful Count'], inplace=True)

### **5.4 Split Date Column**

In [None]:
# Break the day/month/year string on '/' into three columns, then discard the original
date_parts = df['Date'].str.split('/', expand=True)
df[['Day', 'Month', 'Year']] = date_parts
df = df.drop(columns='Date')

### **5.5 Concatenate Title with Review**

In [None]:
# Titles look like "<rating> out of 5 stars\n<headline>": keep only the text
# after the first line break (empty string when there is no second line).
headline = df['Title'].apply(lambda x: x.split('\n')[1] if '\n' in x else '')

# Join body and headline with a space so the last word of the body does not
# fuse with the first word of the headline, which would create a bogus token.
# strip() removes the dangling space left when the headline is empty.
# Rows whose body is NaN stay NaN, as before.
df['Review'] = (df['Review'] + ' ' + headline).str.strip()
df = df.drop(['Title'], axis=1)

### **5.6 Translate Non-English Reviews**

In [None]:
!pip install langdetect
!pip install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator
import pandas as pd
from langdetect import detect

# `df` is the DataFrame prepared in the preceding cells; it is not reloaded here.

# Single googletrans client shared by every translate_to_english() call
translator = Translator()

def translate_to_english(review):
    """Return `review` in English, translating it when another language is detected.

    The language is guessed with langdetect's `detect`; text not detected as
    'en' is sent through the module-level googletrans `translator`.

    Translation is best-effort: if detection or translation raises an ordinary
    exception, the original text is returned unchanged so that one bad review
    cannot abort processing of the whole column.
    """
    try:
        language = detect(review)
        if language == 'en':
            return review
        return translator.translate(review, src=language, dest='en').text
    except Exception:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate and the run can be interrupted.
        return review

# Translate only genuine strings; anything else (e.g. a missing review) passes through untouched
df['Review'] = df['Review'].map(
    lambda text: translate_to_english(text) if isinstance(text, str) else text
)

Export Translated Dataset

In [None]:
# OUTPUT FOR CSV
# Destination path for the translated DataFrame
output_path = '/content/drive/MyDrive/Colab Notebooks/MP1_ProductReview/MP2_translated_reviews.csv'

# Export the DataFrame to a CSV file
df.to_csv(output_path, index=False)

# Report where the file went. This is the translated dataset, not the cleaned
# one, so the message says "Translated".
print(f"Translated csv data has been exported to: {output_path}")

# FOR XLSX
# Destination path for the Excel copy of the translated DataFrame.
# NOTE(review): the file name is spelled "translared"; it is left unchanged so
# anything already reading this file keeps working. Rename once confirmed safe.
output_path = '/content/drive/MyDrive/Colab Notebooks/MP1_ProductReview/MP2_translared_data.xlsx'

# Export the DataFrame to an Excel file
df.to_excel(output_path, index=False)

# Report where the file went
print(f"Translated xlsx data has been exported to: {output_path}")

## **6.0 Data Pre-Processing**

### **6.1 Data Cleaning**

In [None]:
# Discard rows with any empty cell first, then exact duplicate rows
df = df.dropna().drop_duplicates()

In [None]:
df.shape

### **6.2 Case Folding**

In [None]:
# Lower-case every review so later token comparisons are case-insensitive
df['Review'] = df['Review'].str.lower()

# Preview a few rows instead of displaying the whole DataFrame
df.head()

Add additional features: review length, word count, and the percentage of punctuation characters in the text

In [None]:
def count_punct(text):
    """Return the percentage of non-space characters in `text` that are punctuation.

    Punctuation means the characters in `string.punctuation`. The result is on
    a 0-100 scale, rounded to one decimal place. Text with no non-space
    characters (empty or all spaces) yields 0.0 instead of raising
    ZeroDivisionError.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        return 0.0
    punct = sum(1 for char in text if char in string.punctuation)
    # Scale to a percentage before rounding; rounding the ratio first and then
    # multiplying by 100 can leave binary floating-point noise in the result.
    return round(100 * punct / non_space, 1)

reviews = df['Review']

# Characters excluding spaces
df['Review Len'] = reviews.str.len() - reviews.str.count(" ")
# Whitespace-separated words
df['Word Count'] = reviews.str.split().str.len()
# Share of punctuation among the non-space characters
df['Punct'] = reviews.map(count_punct)

### **6.3 Tokenization**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it from
# the 'punkt_tab' resource rather than 'punkt', so request both. On a version
# that does not know one of the names, nltk.download reports it and returns
# False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Split each review into NLTK word tokens
df['Tokens'] = df['Review'].map(word_tokenize)

### **6.4 Punctuation and Stopword Removal**

In [None]:
import string
import nltk
from nltk.corpus import stopwords
import requests

In [None]:
# Download NLTK stopwords
nltk.download('stopwords')

# Community English stopword list (stopwords-iso).
# The timeout stops the cell hanging on a stalled connection. raise_for_status()
# stops an HTTP error page from being split into bogus "stopwords".
stopwords_url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt"
response = requests.get(stopwords_url, timeout=30)
response.raise_for_status()
stopwords_text = response.text

# Project-specific stopword list from the notebook's repository
custom_stopwords_url = "https://raw.githubusercontent.com/rnabilahusna/MP_ProductReview/main/stopwords-custom.txt"
custom_response = requests.get(custom_stopwords_url, timeout=30)
custom_response.raise_for_status()
custom_stopwords_text = custom_response.text

# Extra domain words to drop. "not" is intentionally absent: it was previously
# listed here and then removed again from the final set.
additional_stopwords = ["product", "leaf", "feel"]

# Union of all sources
all_stopwords = set(stopwords.words('english') + stopwords_text.splitlines() + custom_stopwords_text.splitlines() + additional_stopwords)

# Keep "not" as a regular token so it survives stopword removal. discard() does
# not raise if none of the source lists happens to contain it.
all_stopwords.discard('not')

def remove_punctuation(text):
    """Strip punctuation from every token in `text` and drop tokens left empty.

    `text` is an iterable of string tokens. The characters removed are those in
    `string.punctuation` plus the typographic apostrophe (’).
    """
    strip_table = str.maketrans('', '', string.punctuation + '’')
    stripped_tokens = (token.translate(strip_table) for token in text)
    # Tokens made only of punctuation become '' and are filtered out
    return [token for token in stripped_tokens if token]

def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in `all_stopwords`.

    Matching is case-insensitive; surviving tokens keep their original casing
    and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in all_stopwords:
            continue
        kept.append(token)
    return kept

# Strip punctuation from the tokens first, then filter out stopwords
df['NoStopW Tokens'] = df['Tokens'].apply(remove_punctuation).apply(remove_stopwords)

### **6.5 Lemmatization and Stemming**

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

In [None]:
# One shared WordNet lemmatizer and one Porter stemmer, created once and reused
# by lemmatize_tokens() / stem_tokens() for every row
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def lemmatize_tokens(tokens):
    """Return a list with each token passed through the shared `lemmatizer`."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def stem_tokens(tokens):
    """Return a list with each token passed through the shared `stemmer`."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Apply lemmatization and stemming to each row of the 'NoStopW Tokens' column.
# Both outputs are computed from the same stopword-free tokens, not from each other.
df['Lemmatized Tokens'] = df['NoStopW Tokens'].apply(lemmatize_tokens)
df['Stemmed Tokens'] = df['NoStopW Tokens'].apply(stem_tokens)

In [None]:
df.head(5)

Export Cleaned Data

In [None]:
# FOR CSV
# Destination path for the cleaned DataFrame
output_path = '/content/drive/MyDrive/Colab Notebooks/MP1_ProductReview/MP2_cleaned_data.csv'

# Export the DataFrame to a CSV file (index=False leaves the row index out of the file)
df.to_csv(output_path, index=False)

# Display the exported path
print(f"Cleaned csv data has been exported to: {output_path}")

# FOR XLSX
# Destination path for the Excel copy of the cleaned DataFrame (`output_path` is reused)
output_path = '/content/drive/MyDrive/Colab Notebooks/MP1_ProductReview/MP2_cleaned_data.xlsx'

# Export the DataFrame to an Excel file
df.to_excel(output_path, index=False)

# Display the exported path
print(f"Cleaned xlsx data has been exported to: {output_path}")

Import cleaned data

In [None]:
# Path of the cleaned CSV written by the "Export Cleaned Data" cell. It must be
# the same location that cell writes to (inside MP1_ProductReview/). The earlier
# value omitted that directory, so a fresh run could not find the file.
concatenated_file_path = '/content/drive/MyDrive/Colab Notebooks/MP1_ProductReview/MP2_cleaned_data.csv'

# Load the cleaned dataset into a DataFrame.
# NOTE(review): token-list columns presumably come back from CSV as plain
# strings (e.g. "['a', 'b']"), not lists. Confirm before treating them as lists.
cleaned_amazon_reviews = pd.read_csv(concatenated_file_path)

# Work on an independent copy so the loaded frame stays untouched
df = cleaned_amazon_reviews.copy()

## **7.0 Exploratory Data Analysis**

### **7.1 Data Overview**

In [None]:
# Data Overview
print("Data Overview:")
df.head()

In [None]:
print("\nInformation about the data:")
df.info()

### **7.2 Statistical Summary**

### **7.2.1 Descriptive Statistics**

In [None]:
ratings = df['Rating']

# How often each rating value occurs
rating_counts = ratings.value_counts()

# Central tendency and spread of the ratings
mode_rating = ratings.mode().iloc[0]
median_rating = ratings.median()
rating_range = ratings.max() - ratings.min()

# Report the summary figures, then the full frequency table
for label, value in (("Mode", mode_rating), ("Median", median_rating), ("Range", rating_range)):
    print(f"{label}: {value}")
print("\nRating Descriptive Statistics:")
print("Frequency Distribution:")
print(rating_counts)

In [None]:
# Numeric text-feature columns to summarise (review length and word count)
numerical_columns = ['Review Len', 'Word Count']

# describe() is the cell's last expression, so its summary table is what gets displayed
print("\nStatistical Summary for Numerical Columns:")
df[numerical_columns].describe()

### **7.2.2 Non-Parametric Test**

In [None]:
from scipy.stats import mannwhitneyu

# Question: is the distribution of ratings different between the two brands?
# The two groups are independent samples, hence the Mann-Whitney U test.

# Ratings per brand
cerave_ratings = df.loc[df['Brand'] == 'CeraVe', 'Rating']
larocheposay_ratings = df.loc[df['Brand'] == 'La Roche-Posay', 'Rating']

# State the alternative explicitly. The interpretation below ("a difference" in
# either direction) is a two-sided hypothesis, and the default for this parameter
# has not been the same across SciPy versions. Deprecation warnings are also
# silenced globally in this notebook.
statistic, p_value = mannwhitneyu(cerave_ratings, larocheposay_ratings, alternative='two-sided')

# Display the results
print("\nMann-Whitney U Test:")
print(f"Test Statistic: {statistic}")
print(f"P-Value: {p_value}")

# Interpret the results at the 5% significance level
alpha = 0.05
if p_value < alpha:
    print("The difference in ratings between CeraVe and La Roche-Posay is statistically significant.")
else:
    print("There is no statistically significant difference in ratings between CeraVe and La Roche-Posay.")

### **7.3 Review Text Length Distribution**

### **7.3.1 CeraVe - Review Text Length**

In [None]:
# CeraVe

import matplotlib.pyplot as plt

# Keep only the CeraVe reviews
df_cerave = df.loc[df['Brand'] == 'CeraVe']

# Discard rows that have no recorded review length
df_cerave_filtered = df_cerave.dropna(subset=['Review Len'])

# Histogram of review lengths, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_cerave_filtered['Review Len'], bins=100, color='skyblue', edgecolor='black')
ax.set_title('CeraVe Review Text Length Distribution')
ax.set_xlabel('Review Text Length')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### **7.3.2 La Roche-Posay - Review Text Length**

In [None]:
# La Roche-Posay
# Keep only the La Roche-Posay reviews
df_lrp = df.loc[df['Brand'] == 'La Roche-Posay']

# Discard rows that have no recorded review length
df_lrp_filtered = df_lrp.dropna(subset=['Review Len'])

# Histogram of review lengths, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_lrp_filtered['Review Len'], bins=100, color='skyblue', edgecolor='black')
ax.set_title('La Roche-Posay Review Text Length Distribution')
ax.set_xlabel('Review Text Length')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

## **8.0 Sentiment Analysis**

Creating 'Rating Class' Column

In [None]:
# Figuring out the distribution of categories
df['Rating'].value_counts()

In [None]:
def classify_rating(row):
    """Bucket a review's star rating into a coarse rating class.

    Reads `row['Rating']` and returns:
      * 'High-rated'     for 4.0 or 5.0
      * 'Moderate-rated' for 3.0
      * 'Low-rated'      for 1.0 or 2.0
      * -1               for any other value (e.g. NaN or a fractional rating)
    """
    rating = row['Rating']
    if rating in (4.0, 5.0):
        return 'High-rated'
    if rating in (1.0, 2.0):
        return 'Low-rated'
    if rating == 3.0:
        return 'Moderate-rated'
    # Sentinel for values outside the five expected star ratings
    return -1

# Classify every row; the result is stored in the new 'Rating Class' column
# (this column was previously called "sentiment")
df['Rating Class'] = df.apply(classify_rating, axis='columns')

In [None]:
df['Rating Class'].value_counts()

### **8.1 TextBlob**

In [None]:
from textblob import TextBlob

def get_sentiment_polarity(text):
    """Return the TextBlob sentiment polarity of `text`.

    `text` is converted with str() first, so non-string values are analysed
    through their string representation.
    """
    return TextBlob(str(text)).sentiment.polarity

# Score the lemmatized review text. The column this notebook creates is
# 'Lemmatized Tokens'; no 'lemmatized_review' column is ever produced, so the
# old name raised a KeyError.
# NOTE(review): presumably the lemmatized tokens are the intended input (the
# old comment mentioned the raw 'body' text). Confirm which one is wanted.
df['polarity_score'] = df['Lemmatized Tokens'].apply(get_sentiment_polarity)

In [None]:
def get_sentiment_label(score):
    """Turn a polarity score into 'Negative', 'Neutral' or 'Positive'.

    Scores below zero are negative, a score of exactly zero is neutral, and
    every other score is labelled positive.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

# Label every review according to its polarity score
df['sentiment_label'] = df['polarity_score'].map(get_sentiment_label)