In [None]:
%pip install requests
%pip install beautifulsoup4

In [None]:
# --------------------- 1. First Topic: Web Scraping ---------------------

import requests
from bs4 import BeautifulSoup

# Demo: download one eMAG search-results page and list the product URLs on it.
url = 'https://www.emag.ro/search/aliexpress/p1'
response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')
# Product title anchors are selected by their exact class string.
product_links = soup.find_all('a', class_='card-v2-title semibold mrg-btm-xxs js-product-url')

# Uncomment to inspect the raw HTML content of the page
# print(soup)
# Print the target URL of every product anchor that was found.
for link in product_links:
    print(link['href'])

In [None]:
import re

In [None]:
def get_price_from_html(html):
    """Extract the discounted product price embedded in a product page's scripts.

    The HTML is parsed with BeautifulSoup and every ``<script>`` element whose
    text mentions ``EM.product_id`` is searched for an assignment of the form
    ``EM.productDiscountedPrice = <number>;``.

    Args:
        html: Raw HTML of a product page.

    Returns:
        The first parseable price as a float, or None when no script carries one.
    """
    # Dots are escaped so they only match a literal '.', and the whitespace
    # around '=' is optional so differently formatted scripts still match.
    price_pattern = re.compile(r'EM\.productDiscountedPrice\s*=\s*([0-9.]+);')

    soup = BeautifulSoup(html, 'html.parser')
    for script in soup.find_all('script'):
        if 'EM.product_id' not in script.text:
            continue
        price_match = price_pattern.search(script.text)
        if price_match is None:
            continue
        try:
            return float(price_match.group(1))
        except ValueError:
            # Strings such as '1.2.3' satisfy the character class but are not
            # numbers; keep looking instead of crashing.
            continue
    return None


def get_reviews_by_url(product_url):
    """Collect every review of a single eMAG product.

    The review count and the reviews themselves come from the
    ``product-feedback/.../reviews/list`` JSON endpoint, fetched 10 at a time.
    The product title and price are read from the product page itself.

    Args:
        product_url: Product page URL. A trailing '?' or '#used-products' is
            stripped and a trailing '/' is appended when missing.

    Returns:
        A list with one dict per review, with the keys 'product_title',
        'product_price', 'review_title', 'review_rating',
        'review_verified_buyer' and 'review_content' (in that order).
        The list is empty when the product has no reviews.
        'product_title' / 'product_price' are None when they cannot be found
        on the product page.
    """
    # --------------------- normalise the product URL ---------------------

    if product_url.endswith('?'):
        product_url = product_url[:-1]

    if product_url.endswith('#used-products'):
        product_url = product_url[:-len('#used-products')]

    if not product_url.endswith('/'):
        product_url += '/'

    # --------------------- endpoint URL ---------------------

    endpoint_url = 'https://www.emag.ro/product-feedback/'
    endpoint_url += product_url.replace('https://www.emag.ro/', '')
    endpoint_url += 'reviews/list'

    page_limit = 10

    def fetch_review_page(offset):
        """Return the decoded JSON of one page of reviews starting at offset."""
        params = {
            "source_id": 7,
            "page[limit]": page_limit,
            "page[offset]": offset,
            "sort[created]": "desc"
        }
        return requests.get(endpoint_url, params=params).json()

    # --------------------- number of reviews ---------------------

    # The count is taken from the JSON endpoint rather than from the page HTML
    # because the HTML element only exists for products that have reviews.
    data = fetch_review_page(0)
    reviews_number = int(data['reviews']['count'])

    if reviews_number <= 0:
        # Nothing to collect, so the product page is not downloaded at all.
        return []

    # --------------------- product page ---------------------

    product_html = requests.get(product_url).text
    product_soup = BeautifulSoup(product_html, 'html.parser')

    # --------------------- product title ---------------------

    title_tag = product_soup.find('h1', class_='page-title')
    if title_tag is not None:
        # Collapse runs of whitespace / newlines into single spaces.
        product_title = re.sub(r'\s+', ' ', title_tag.get_text()).strip()
    else:
        product_title = None

    # --------------------- product price ---------------------

    # The price is read from the embedded script variable instead of the HTML
    # price tag because the tag has too many formats (e.g. 'de la xxx.xx Lei'
    # when multiple offers exist). The helper returns None when no price is
    # found, so product_price is always bound.
    product_price = get_price_from_html(product_html)

    # --------------------- get reviews ---------------------

    merged_list = []
    offset = 0
    while True:
        # The first iteration reuses the page already fetched for the count.
        for item in data['reviews']['items']:
            merged_list.append({
                'product_title': product_title,
                'product_price': product_price,
                'review_title': item['title'],
                'review_rating': item['rating'],
                'review_verified_buyer': item['is_bought'],
                'review_content': item['content']
            })

        offset += page_limit
        if offset >= reviews_number:
            break
        data = fetch_review_page(offset)

    return merged_list

# product_reviews = get_reviews_by_url('https://www.emag.ro/mouse-wireless-logitech-mx-master-3s-performance-8000-dpi-silent-usb-bt-graphite-910-006559/pd/DZMBWVMBM/')
# for review in product_reviews:
#     print(review)

In [None]:
# --------------------- get current listings' indices ---------------------

def get_current_listings(url):
    """Return the range of listings shown on one search-results page.

    The pagination label (e.g. '1 - 60 din 262 de produse') is read from the
    page and its first two numbers are returned.

    Args:
        url: URL of a search-results page.

    Returns:
        [first, last] as ints, or [-1, -1] when the pagination label is
        missing or does not contain three numbers.
    """
    page_response = requests.get(url)
    page_soup = BeautifulSoup(page_response.text, 'html.parser')

    pagination = page_soup.find('div', class_='control-label small hidden-xs hidden-sm js-listing-pagination')
    if pagination is None:
        return [-1, -1]

    # Matches labels such as:
    # '1 - 5 din 5 produse'
    # '1 - 60 din 262 de produse'
    match = re.search(r'(\d+)\D+(\d+)\D+(\d+)', pagination.get_text())
    if match is None:
        # Label present but in an unexpected format: report "unknown"
        # instead of raising AttributeError on match.group().
        return [-1, -1]

    return [int(match.group(1)), int(match.group(2))]
    

# --------------------- get the total number of listings for the current search ---------------------    

def get_total_listings(url):
    """Return the total number of listings reported for a search.

    The pagination label (e.g. '1 - 60 din 262 de produse') is read from the
    page and its third number is returned.

    Args:
        url: URL of a search-results page.

    Returns:
        The total as an int, or -1 when the pagination label is missing or
        does not contain three numbers.
    """
    page_response = requests.get(url)
    page_soup = BeautifulSoup(page_response.text, 'html.parser')

    pagination = page_soup.find('div', class_='control-label small hidden-xs hidden-sm js-listing-pagination')
    if pagination is None:
        return -1

    # Matches labels such as:
    # '1 - 5 din 5 produse'
    # '1 - 60 din 262 de produse'
    match = re.search(r'(\d+)\D+(\d+)\D+(\d+)', pagination.get_text())
    if match is None:
        # Label present but in an unexpected format: report "unknown"
        # instead of raising AttributeError on match.group().
        return -1

    return int(match.group(3))

In [None]:
# --------------------- get current listings' URLs ---------------------

def get_product_links(url):
    """Download a search-results page and return its product title anchors.

    Args:
        url: URL of a search-results page.

    Returns:
        The result of ``find_all`` for ``<a>`` elements carrying the product
        title class string; each element exposes the product URL via ['href'].
    """
    listing_page = BeautifulSoup(requests.get(url).text, 'html.parser')
    return listing_page.find_all('a', class_='card-v2-title semibold mrg-btm-xxs js-product-url')

In [None]:
import math

# --------------------- main function for getting all reviews for all search results ---------------------

def get_total_reviews(url, page_size=60):
    """Collect the reviews of every product returned by a search.

    Result pages are walked one by one; page ``n`` (n >= 2) is addressed by
    appending ``p<n>`` to the search URL.

    Args:
        url: Search URL of the first results page, e.g.
            'https://www.emag.ro/search/sony xm5/'. A missing trailing '/'
            is added before page suffixes are appended.
        page_size: Number of products listed per results page.

    Returns:
        A list of review dicts (see get_reviews_by_url); empty when the total
        number of listings cannot be determined or is zero.
    """
    total_reviews = []
    total_products = get_total_listings(url)
    print(f"total products: {total_products}")

    if total_products <= 0:
        return total_reviews

    rounds = math.ceil(total_products / page_size)
    print(f"rounds: {rounds}")

    # Page URLs are '<search url>/p<N>', so the base must end with a slash.
    base_url = url if url.endswith('/') else url + '/'
    page_url = url

    for page_index in range(rounds):
        # Requested per page so the log line matches the page being processed;
        # no request is made for a page beyond the last one.
        from_product, to_product = get_current_listings(page_url)
        print(f"from: {from_product}, to: {to_product}, total: {total_products}")

        # Products still unprocessed, capped at one page. Unlike
        # `total % page_size`, this is not 0 on the last page when the total
        # is an exact multiple of the page size.
        limit = min(page_size, total_products - page_index * page_size)

        for item in get_product_links(page_url)[:limit]:
            print(f"getting reviews for {item['href']}")
            total_reviews += get_reviews_by_url(item['href'])

        # Pages are 1-based: after index 0 (page 1) comes 'p2'.
        page_url = f'{base_url}p{page_index + 2}'

    return total_reviews

In [None]:
# Alternative searches, with the number of results seen when they were tried:
# url = 'https://www.emag.ro/search/monitor dell/' # 140 results
# url = 'https://www.emag.ro/search/monitor benq/' # 81 results
url = 'https://www.emag.ro/search/sony xm5/' # 3 results
# url = 'https://www.emag.ro/search/iphone 14 pro max/p1' # 108 results
# url = 'https://www.emag.ro/search/macbook pro stand lemn maro negru/'
# The URL must end with '/': get_total_reviews appends 'p<N>' to it to address later pages.

# Scrape every review for every product of the chosen search.
reviews = get_total_reviews(url)

# Print each collected review dict on its own line.
if reviews is not None:
    for review in reviews:
        print(review)


In [None]:
import csv

csv_file = 'reviews_output.csv'

# reviews[0] supplies the column names, so an empty result cannot be exported.
if not reviews:
    raise ValueError("No reviews were collected; nothing to write to the CSV file")

header = list(reviews[0].keys())

# Explicit UTF-8: review text contains non-ASCII characters and the file is
# read back with pandas, which decodes as UTF-8 by default, so the platform's
# locale encoding must not be used here.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()
    writer.writerows(reviews)

print(f"Data has been written to {csv_file}")

In [None]:
import pandas as pd

# CSV file holding the scraped reviews
csv_file = 'reviews_output.csv'

# Expected dtype per column; the insertion order doubles as the column order
column_types = {
    'product_title': str,
    'product_price': float,
    'review_title': str,
    'review_rating': int,
    'review_verified_buyer': bool,
    'review_content': str,
}
column_names = list(column_types)

# header=0 treats the file's first row as a header and replaces it with column_names
df = pd.read_csv(csv_file, header=0, names=column_names, dtype=column_types)

In [None]:
%pip install pandas
%pip install demoji
%pip install nltk
%pip install googletrans
%pip install deepl

In [None]:
# --------------------- TRANSLATE TO ENGLISH ---------------------

import os

import deepl

# The DeepL API key is read from the environment so that it is never stored
# in the notebook. A key that was previously committed here should be
# treated as leaked and revoked.
auth_key = os.environ.get("DEEPL_AUTH_KEY")
if not auth_key:
    raise RuntimeError("Set the DEEPL_AUTH_KEY environment variable to your DeepL API key")
translator = deepl.Translator(auth_key)

def translate_to_english(text):
    """Translate Romanian text to US English with DeepL.

    Args:
        text: The text to translate.

    Returns:
        The translation as a str. Non-string values (e.g. NaN from an empty
        CSV cell) and blank strings yield '' without calling the API.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    result = translator.translate_text(text, source_lang="RO", target_lang="EN-US")
    return str(result)


# # Print the updated DataFrame
# print(df[['review_content', 'review_content_en']])

# output_file = 'translated_output.csv'

# df.to_csv(output_file, index=False)

In [None]:
# FOR CONTENT

# Translate every review body; translate_to_english is called once per row.
df['review_content_en'] = df['review_content'].apply(translate_to_english)
# Drop the untranslated column. Re-running this cell afterwards fails with a
# KeyError because 'review_content' no longer exists.
df = df.drop('review_content', axis=1)

In [None]:
# FOR TITLE
# Translate every review title; translate_to_english is called once per row.
df['review_title_en'] = df['review_title'].apply(translate_to_english)
# Drop the untranslated column. Re-running this cell afterwards fails with a
# KeyError because 'review_title' no longer exists.
df = df.drop('review_title', axis=1)

output_file = 'translated_output.csv'

# Persist the translated frame without the index column.
df.to_csv(output_file, index=False)

In [None]:
# --------------------- 2. Second Topic: Sentiment Analysis ---------------------

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer loads the VADER lexicon when it is constructed
# and raises LookupError if the resource has not been downloaded yet.
nltk.download('vader_lexicon', quiet=True)

# Create a SentimentIntensityAnalyzer object
sia = SentimentIntensityAnalyzer()

# Score each translated review; 'compound' is VADER's single aggregated score
df['sentiment_score'] = df['review_content_en'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

# Classify sentiments based on the sentiment score (a score of exactly 0 counts as Positive)
df['sentiment_label'] = df['sentiment_score'].apply(lambda x: 'Positive' if x >= 0 else 'Negative')

# Print the updated DataFrame
print(df[['review_content_en', 'sentiment_score', 'sentiment_label']])

output_file = 'sentiment_output.csv'

df.to_csv(output_file, index=False)

In [106]:
# --------------------- 3. Third Topic: Data Cleaning & Preprocessing ---------------------

import html
import string
import unicodedata
import demoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import ssl

# Swap ssl's default HTTPS context factory for the unverified one, when this
# Python build exposes it.
# NOTE(review): this turns off TLS certificate verification for code that
# relies on ssl's default HTTPS context for the rest of the kernel session —
# presumably a workaround for certificate errors in nltk.download; confirm it
# is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Resources used by preprocess_text: the stop-word lists and the tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')

# The three cleaners below are applied to the review columns by the
# "Apply data cleaning and preprocessing" cell, so they must be defined.

def remove_miscellaneous_chars(text):
    """Strip characters that are not word characters, whitespace or hyphens.

    A hyphen is also removed when it has no word character on either side
    (e.g. the dash in 'a - b'); hyphens touching a word are kept.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string, or the value unchanged when it is not a str.
    """
    if isinstance(text, str):
        return re.sub(r"([^\w\s-]|(?<!\w)-(?!\w))", "", text)
    return text


def remove_emojis(text):
    """Remove emojis from a string using demoji.

    Args:
        text: Value to clean.

    Returns:
        The string without emojis, or the value unchanged when it is not a str.
    """
    if isinstance(text, str):
        return demoji.replace(text, "")
    return text


def remove_html_tags(text):
    """Remove anything that looks like an HTML tag ('<...>') from a string.

    Args:
        text: Value to clean.

    Returns:
        The string without tags, or the value unchanged when it is not a str.
    """
    if isinstance(text, str):
        return re.sub(r"<.*?>", "", text)
    return text


# Function for text normalization, stemming, and stop word removal
import functools


@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop words, loaded once and then cached."""
    return frozenset(stopwords.words('english'))  # Update the language if necessary


def preprocess_text(text):
    """Normalise a piece of text for downstream vectorisation.

    Steps, in order: convert to str, strip ASCII punctuation, lowercase,
    remove digits, tokenize with NLTK and drop English stop words.

    Args:
        text: Value to normalise; it is converted with str() first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = str(text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords; the set is cached so the word list is not reloaded
    # for every row when this function is used with DataFrame.apply.
    stop_words = _english_stop_words()
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming is intentionally disabled:
    # stemmer = PorterStemmer()
    # tokens = [stemmer.stem(token) for token in tokens]

    # Join the tokens back into a single string
    return ' '.join(tokens)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/roberthevesi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/roberthevesi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [110]:
# NOTE(review): df_saved is not assigned in any cell visible in this notebook —
# presumably a snapshot taken in a since-removed cell; on a fresh kernel this
# line would raise NameError. Confirm where it should come from.
# This binds df to the same object as df_saved; no copy is made.
df = df_saved

In [115]:
print(df_saved)

                                         product_title  product_price  \
0    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
1    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
2    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
3    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
4    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
..                                                 ...            ...   
136  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
137  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
138  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
139  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
140  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   

                         review_title  review_rating  review_verified_buyer  \
0                      foart multumit       

In [109]:


# Apply data cleaning and preprocessing


# Each text column goes through the same cleaning steps, in this order.
# NOTE(review): this expects df to have a 'review_title' column, which the
# translation cell drops — presumably df was restored from a snapshot first; confirm.
cleaning_steps = (remove_miscellaneous_chars, remove_emojis, remove_html_tags, preprocess_text)

for column in ('review_title', 'review_content_en'):
    for step in cleaning_steps:
        df[column] = df[column].apply(step)


# Print the updated DataFrame
print(df)

output_file = 'cleaned_output.csv'

df.to_csv(output_file, index=False)

                                         product_title  product_price  \
0    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
1    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
2    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
3    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
4    Casti Over the Ear Sony WH-1000XM5B, Wireless,...        1799.99   
..                                                 ...            ...   
136  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
137  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
138  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
139  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   
140  Casti Over the Ear Sony WH-1000XM5S, Wireless,...        1799.99   

                         review_title  review_rating  review_verified_buyer  \
0                      foart multumit       

In [None]:
print(df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert 'review_content_en' column to string type
df['review_content_en'] = df['review_content_en'].astype(str)

# Create an instance of TfidfVectorizer
tfidf = TfidfVectorizer()

# Fit and transform the 'review_content_en' column using TF-IDF
tfidf_matrix = tfidf.fit_transform(df['review_content_en'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The fallback keeps releases older than 1.0 working.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Create a DataFrame from the TF-IDF matrix (one row per review, one column per term)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Print the TF-IDF DataFrame
print(tfidf_df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# # Create an instance of TfidfVectorizer
# tfidf = TfidfVectorizer()

# # Fit and transform the 'review_content' column using TF-IDF
# tfidf_matrix = tfidf.fit_transform(df['review_content_en'])

# # Calculate cosine similarity matrix
# Review-to-review similarity matrix. It is not needed for the summaries
# below; kept because other cells may rely on the name — TODO confirm.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Choose the number of sentences to include in the summary
num_sentences = 3

# Generate an extractive summary for each review: every sentence is scored by
# the cosine similarity between its own TF-IDF vector and the vector of the
# whole review, and the best-scoring sentences are kept. (Scoring every
# sentence with the same per-review value would make the ranking a no-op.)
summaries = []
# Iterate by position: row k of tfidf_matrix belongs to the k-th row of df,
# whatever the DataFrame's index labels are.
for position, review in enumerate(df['review_content_en']):
    sentences = [part.strip() for part in review.split('.') if part.strip()]
    if not sentences:
        summaries.append('')
        continue

    sentence_vectors = tfidf.transform(sentences)
    relevance = cosine_similarity(sentence_vectors, tfidf_matrix[position]).ravel()

    # sorted() is stable, so equally scored sentences keep their original order
    ranked = sorted(zip(sentences, relevance), key=lambda pair: pair[1], reverse=True)
    summaries.append(' '.join(sentence for sentence, _ in ranked[:num_sentences]))

# Add the summaries to the DataFrame
df['summary'] = summaries

text_file = 'test.txt'

# Export the DataFrame to a tab-separated text file
df.to_csv(text_file, sep='\t', index=False)

In [None]:
%pip install gensim

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess

# Tokenize every review with gensim's simple_preprocess (one token list per review)
documents = df['review_content_en'].apply(simple_preprocess)

# Create a dictionary representation of the documents (token <-> integer id)
dictionary = Dictionary(documents)

# Create a Bag-of-Words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Build the LDA model with 5 topics; random_state is fixed at 42
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42)

# Print the topics and their top keywords (topic numbers are shown 1-based)
for topic_id, topic_keywords in lda_model.print_topics():
    print(f'Topic {topic_id + 1}: {topic_keywords}')

# Calculate the 'c_v' coherence score of the model over the tokenized reviews
coherence_model = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score: {coherence_score}')