# Loading Dependencies

In [120]:
#region Data Manipulation

# Pandas
from pandas import read_csv, Series, melt

# Numpy
from numpy import NAN, where

#endregion

#region Data Cleaning

# Regular Expression
from re import findall, search

# Abstract Syntax Tree.
from ast import literal_eval

#endregion

#region Data Visualization

# Plotly Express.
from plotly.express import bar, box, histogram, scatter

#endregion

#region Natural Language Processing

# Natural Language Toolkit.
import nltk

# Tokenization.
from nltk.tokenize import word_tokenize

# Stop-Words.
from nltk.corpus import stopwords

# Token Lemmatization.
from nltk.stem import WordNetLemmatizer

# Sentiment Analysis.
from nltk.sentiment import SentimentIntensityAnalyzer

#endregion

# Static Configurations - Global Variables

In [109]:
#region Data Cleaning

# Price -> String to Float.
# Matches the numeric part of a price such as '249.00', '2,969.99' or '1234':
#   - first alternative : digits grouped by thousands separators, optional decimals;
#   - second alternative: plain digits, optional decimals.
# The expression must NOT be wrapped in [...]: that would turn it into a
# character class that matches a single character only.
PATTERN_PRICE = r'\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?'

# RAM -> String to Category.
# Group 1 captures the number, group 2 the optional unit (MB or GB);
# a trailing 'RAM' word is consumed but not captured.
PATTERN_RAM = r'([\d.]+)\s*(MB|GB)?\s*(?:RAM)?'

# SSD -> String to Category.
# Group 1 captures the number, group 2 the optional unit (GB or TB).
PATTERN_SSD = r'([\d.]+)\s*(GB|TB)?'

# Filtering Reviews.
# Keeps runs of word characters (letters, digits, underscore).
PATTERN_FILTER_REVIEWS = r'\w+|\d+'

#endregion

# Data Exploration

In [44]:
# Fetching the scraped data; the CSV's 'Unnamed: 0' column is used as the row index.
dataset = read_csv('eBay Laptop Description.csv', index_col = 'Unnamed: 0')

In [45]:
# Taking a first look at the data.
dataset

Unnamed: 0,href,Title,Price,RAM,Brand,Hdd,Condition,Seller notes,SSD,Rating,Reviews
0,https://www.ebay.com/itm/166611579675?epid=132...,Apple Macbook Pro 13.3” 2.5GHz intel Core i5 1...,US $249.00,16 GB,Apple,1 TB,Used: An item that has been used previously. T...,,1 TB,"['4.8', '5.0', '5.0', '5.0']",['I received the MacBook Pro and it was almost...
1,https://www.ebay.com/itm/285754711451?itmmeta=...,"Apple MacBook Pro 15"" A1286 2.3GHz Core i7 16G...",US $219.00,16 GB,Apple,256 GB,Used: An item that has been used previously. T...,,240GB,"['4.9', '5.0', '5.0', '5.0']",['Muy buena computadora. En excelentes condici...
2,https://www.ebay.com/itm/334577965892?itmmeta=...,Apple Macbook Air 13 (2015) | i5 8GB + 512GB S...,US $246.05/ea,8 GB,Apple,256 GB,Used,“2 YEAR WARRANTY INCLUDED!!! LAPTOP ARE FULLY ...,128-512 GB,"['4.8', '5.0', '4.9', '5.0']",['Opened it as soooon as the mail man brought ...
3,https://www.ebay.com/itm/235462992167?itmmeta=...,SONOMA MacBook Pro 15 RETINA / 4.0GHz QUAD COR...,US $665.00/ea,8 GB,Apple,2 TB,Used,“Fully tested and verified! Good condition ove...,2 TB,"['4.9', '5.0', '5.0', '5.0']",['Everything as promised. Fast shipping. Using...
4,https://www.ebay.com/p/20029930090?iid=2260712...,"Apple MacBook Pro 13"" (128GB SSD, Intel Core i...",US $36.00,8GB RAM,Apple,256 GB,,,128GB SSD,[],[]
...,...,...,...,...,...,...,...,...,...,...,...
19453,https://www.ebay.com/itm/285812494815?itmmeta=...,iPad air 4th generation 64gb wifi Sky Blue,US $270.00,64 GB,Apple,64 GB,Used: An item that has been used previously. T...,,256 GB,[],"[""Arrived on time and in good condition, and w..."
19454,https://www.ebay.com/itm/335323518419?itmmeta=...,APPLE MACBOOK AIR A1466 I5-3427U @ 1.8GHz 4GB ...,US $34.99,4 GB,Apple,256 GB,For parts or not working,“Please see the pictures of its cosmetic condi...,256 GB,"['4.8', '4.7', '5.0', '5.0']",['Dell 3590 laptop received as described. Comp...
19455,https://www.ebay.com/itm/305231918484?itmmeta=...,Apple iBook G3/366 M6411 | Apple Mac OS 9.0.4 ...,US $399.00,128 MB,Apple,10 GB,Used,"“Tested to power on. Battery charges, but no g...",256 GB,"['5.0', '4.9', '5.0', '5.0']","['Fast shipping, great communication, and exce..."
19456,https://www.ebay.com/itm/134788641380?itmmeta=...,Apple 2023 MacBook Pro 14 inch M3 chip 512GB S...,"C $2,969.99",8 GB,Apple,512 GB,"New: A brand-new, unused, unopened, undamaged ...",,512 GB,"['4.8', '4.8', '4.9', '4.9']","['Got a used RTX 3070, item was a little dirty..."


<font color = '#FFA500'><h3>Observations:</h3></font>
- The first field is just some form of ID and can be removed.
- The `href` link might be useful for recommending a set of laptops in the future work; but for now, dropping it would be a better choice.
- Missing data in a row can be extracted from the `Title` field.
- The `Price` has to be processed to convert it from string to actual floating figure.
- The `Rating` field holds the seller's ratings, which might help to gauge public views towards the laptop.
- Need to clean `Price`, `RAM`, and `SSD` for better understanding of data by converting them from object to numeric form.

In [116]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
dataset.describe()

Unnamed: 0,Price,Accurate Description,Reasonable Shipping Cost,Shipping Speed,Communication,Seller Review Sentiment
count,19458.0,14698.0,14692.0,14639.0,14607.0,19458.0
mean,369.138998,4.865608,4.903927,4.970565,4.958691,0.821667
std,400.673384,0.100771,0.146324,0.071046,0.08211,0.399233
min,0.99,2.6,3.3,3.2,2.7,-1.0
25%,130.0,4.8,4.8,5.0,4.9,1.0
50%,251.95,4.9,5.0,5.0,5.0,1.0
75%,450.64,4.9,5.0,5.0,5.0,1.0
max,7999.0,5.0,5.0,5.0,5.0,1.0


In [117]:
# Column names, non-null counts and dtypes of the dataset.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19458 entries, 0 to 19457
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   href                      19458 non-null  object 
 1   Title                     19458 non-null  object 
 2   Price                     19458 non-null  float64
 3   RAM                       19458 non-null  object 
 4   Brand                     19454 non-null  object 
 5   Hdd                       19281 non-null  object 
 6   Condition                 16735 non-null  object 
 7   Seller notes              10861 non-null  object 
 8   SSD                       19458 non-null  object 
 9   Rating                    19458 non-null  object 
 10  Reviews                   19458 non-null  object 
 11  Accurate Description      14698 non-null  float64
 12  Reasonable Shipping Cost  14692 non-null  float64
 13  Shipping Speed            14639 non-null  float64
 14  Commun

# Data Cleaning

## Missing Data Identification

In [46]:
# Counting the missing values in each column of the dataset.
dataset.isna().sum()

href               0
Title              0
Price              0
RAM              133
Brand              4
Hdd              177
Condition       2723
Seller notes    8597
SSD              182
Rating             0
Reviews            0
dtype: int64

In [47]:
class DataCleaning:

    def __init__(self, pattern : str) -> None:

        self.pattern = pattern
    
    def clean_data_re(self, string : str) -> float | None:
        
        # Preventing from missing values.
        try:

            # Searching the pattern from the given string.
            result = search(pattern = self.pattern, string = str(object = string))
            
            # If the pattern is found, send it.
            if result:
                return result.group()
            
        except:
            pass
        
        return None
    
    def set_pattern(self, pattern : str) -> None:
        
        self.__init__(pattern = pattern)

## Price -> String to Float.

In [114]:
# Creating instance for data cleaning.
data_cleaning = DataCleaning(pattern = PATTERN_PRICE)

# Extracting the numeric text of each price in a single pass
# (entries without a match come back as missing values).
price_text = dataset.Price.apply(data_cleaning.clean_data_re)

# Removing the thousands separators and converting to float. Working on the
# extracted Series keeps the original row index, and unmatched entries
# become NaN instead of raising.
dataset.Price = price_text.str.replace(',', '', regex = False).astype(float)

# Validating the results stored.
dataset.Price

0         249.00
1         219.00
2         246.05
3         665.00
4          36.00
          ...   
19453     270.00
19454      34.99
19455     399.00
19456    2969.99
19457     429.24
Name: Price, Length: 19458, dtype: float64

## RAM -> String to Category.

In [49]:
# Extracting the data based on given pattern for RAM.
# The result has one column per capture group of PATTERN_RAM.
data = dataset.RAM.str.extract(pat = PATTERN_RAM, expand = True)

# Dropping the Null Values and merging it in the form of string.
# Rows where nothing was captured end up as an empty string, not NaN.
dataset.RAM = data.apply(lambda d : ' '.join(d.dropna().astype(str)), axis = 1)

# Validating the results stored.
dataset.RAM.value_counts()

8 GB       9701
16 GB      4548
4 GB       2573
32 GB       890
2 GB        372
           ... 
7520          1
64            1
40            1
13.3          1
5121 GB       1
Name: RAM, Length: 62, dtype: int64

## SSD -> String to Category.

In [50]:
# Extracting the data based on given pattern for SSD.
# The result has one column per capture group of PATTERN_SSD.
data = dataset.SSD.str.extract(pat = PATTERN_SSD, expand = True)

# Dropping the Null Values and merging it in the form of string.
# Rows where nothing was captured end up as an empty string, not NaN.
dataset.SSD = data.apply(lambda d : ' '.join(d.dropna().astype(str)), axis = 1)

# Validating the results stored.
dataset.SSD.value_counts()

256 GB    9322
512 GB    2998
128 GB    1742
1 TB      1560
500 GB     639
          ... 
238 GB       1
100 GB       1
62           1
720 GB       1
225          1
Name: SSD, Length: 94, dtype: int64

## Ratings Filtering.

In [102]:
def parse_ratings(ratings : str) -> dict[str, float]:
    """Map a stringified list of ratings onto the rating categories.

    Args:
        ratings: String representation of a list of rating strings, e.g.
            "['4.8', '5.0', '--', '5.0']". The entry '--' marks a rating
            that is not available.

    Returns:
        A dictionary with one float per category, in the order
        'Accurate Description', 'Reasonable Shipping Cost',
        'Shipping Speed', 'Communication'. Unavailable ('--') or absent
        ratings are NaN.

    Raises:
        ValueError, SyntaxError: If `ratings` is not a valid Python literal.
    """

    # Defining a set of categories for reviews.
    categories = ['Accurate Description', 'Reasonable Shipping Cost', 'Shipping Speed', 'Communication']

    # Converting each rating to float while keeping its position: a '--'
    # placeholder becomes NaN instead of being dropped, so the ratings that
    # follow it stay attached to their own category.
    rating_values = [
        float('nan') if rating == '--' else float(rating)
        for rating in literal_eval(node_or_string = ratings)
    ]

    # Storing review data based on category; categories beyond the number of
    # ratings supplied are filled with NaN.
    return {
        category : rating_values[position] if position < len(rating_values) else float('nan')
        for position, category in enumerate(categories)
    }

In [103]:
# Parsing Ratings: `parse_ratings` returns one dictionary per row, and the
# second `apply` expands each dictionary into one column per category.
dataset[['Accurate Description', 'Reasonable Shipping Cost', 'Shipping Speed', 'Communication']] = dataset.Rating.apply(func = parse_ratings).apply(func = Series)

# Validating the results stored.
dataset[['Accurate Description', 'Reasonable Shipping Cost', 'Shipping Speed', 'Communication']]

Unnamed: 0,Accurate Description,Reasonable Shipping Cost,Shipping Speed,Communication
0,4.8,5.0,5.0,5.0
1,4.9,5.0,5.0,5.0
2,4.8,5.0,4.9,5.0
3,4.9,5.0,5.0,5.0
4,,,,
...,...,...,...,...
19453,,,,
19454,4.8,4.7,5.0,5.0
19455,5.0,4.9,5.0,5.0
19456,4.8,4.8,4.9,4.9


## Seller Review Sentiments (NLP)

In [53]:
def filter_reviews(reviews_data : str) -> str:
    """Normalize review text: lower-case, drop stop-words, lemmatize.

    Args:
        reviews_data: Raw review text.

    Returns:
        The remaining lemmatized word tokens joined by single spaces.
    """

    # Fetching the stop-word list once and storing it as a set, so each token
    # is checked with a constant-time lookup instead of re-fetching and
    # scanning the whole list for every token.
    english_stopwords = set(stopwords.words('english'))

    # Word Tokenization.
    filtered_reviews_tokens = [
        review_token
        for review_token in word_tokenize(reviews_data.lower())
        if review_token not in english_stopwords
    ]

    # Token Lemmatization.
    lemmatizer = WordNetLemmatizer()
    lemmatized_reviews = ' '.join(lemmatizer.lemmatize(review_token) for review_token in filtered_reviews_tokens)

    # Only considering words having some character or string.
    return ' '.join(findall(pattern = PATTERN_FILTER_REVIEWS, string = lemmatized_reviews))

def is_review_positive(review: str, threshold: float = 0.05) -> int:
    """Classify the sentiment of a review from its compound polarity score.

    Args:
        review: Review text to score.
        threshold: Absolute compound score at or beyond which the review is
            considered positive or negative. Defaults to 0.05.

    Returns:
        1 when the compound score is >= threshold (positive),
        -1 when it is <= -threshold (negative),
        0 otherwise (neutral).
    """

    # Reusing a single analyzer across calls instead of constructing a new
    # one for every review; it is stored on the function object.
    analyzer = getattr(is_review_positive, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        is_review_positive._analyzer = analyzer

    # Calculate the Polarity Score for sentiment.
    compound = analyzer.polarity_scores(review)['compound']

    # Positive.
    if compound >= threshold:
        return 1

    # Negative.
    if compound <= -threshold:
        return -1

    # Neutral.
    return 0

In [54]:
# Predicting the seller's impression based on the reviews posted by users:
# each entry is normalized by `filter_reviews` and then scored by
# `is_review_positive` (1 = positive, 0 = neutral, -1 = negative).
# NOTE(review): the stop-word removal in `filter_reviews` presumably also drops
# negations such as 'not' before scoring, which could change the sentiment —
# TODO confirm against the stop-word list in use.
dataset['Seller Review Sentiment'] = dataset['Reviews'].apply(lambda data : filter_reviews(data)).apply(is_review_positive)

# Validating the results stored.
dataset['Seller Review Sentiment']

0        1
1        1
2        1
3        1
4        0
        ..
19453    1
19454    1
19455    1
19456    1
19457    1
Name: Seller Review Sentiment, Length: 19458, dtype: int64

## Outlier Detection

### Visualizing Outliers

In [118]:
# Plotting a box plot of the prices to spot outliers.
box(data_frame = dataset.Price).show()

In [123]:
# Plotting a histogram of the prices.
histogram(data_frame = dataset.Price).show()

In [124]:
# Plotting a scatter plot of the prices.
scatter(data_frame = dataset.Price).show()

## Outlier Operation - Omission

### Quantile-Based Flooring and Capping

In [121]:
# Quantiles used as the floor and the cap of every numeric feature.
FLOOR_QUANTILE = 0.1
CAP_QUANTILE = 0.9

# Quantiles are only defined for numeric data, so text columns are skipped.
# NOTE(review): this caps every numeric column, including the rating columns
# and 'Seller Review Sentiment' if they exist at this point — confirm that is
# intended, or restrict the selection to the features that need it.
for feature in dataset.select_dtypes(include = 'number').columns:

    #region Outlier Operation

    # Evaluating the floor and cap quantiles.
    floor = dataset[feature].quantile(q = FLOOR_QUANTILE)
    cap = dataset[feature].quantile(q = CAP_QUANTILE)

    # Pulling values below the floor up to it and values above the cap down
    # to it; missing values are left untouched.
    dataset[feature] = dataset[feature].clip(lower = floor, upper = cap)

    #endregion

# Data Visualization

## RAM Demand.

In [125]:
# Counting the listings per RAM value once and keeping the six most frequent.
top_ram_counts = dataset.RAM.value_counts()[ : 6]

# Drawing the bar chart, annotating every bar with its count.
ram_figure = bar(
    data_frame = top_ram_counts,
    title = 'RAM Demands - Top 6 on Sale!',
    labels = {'x' : 'RAM (GB)', 'y' : 'Count'},
    text = top_ram_counts.values,
)
ram_figure.show()

<font color = '#FFA500'><h3>Observations:</h3></font>
- 8 GB RAM is the most popular amongst laptops.
- However, people rarely extend to 32 GB and hardly ever touch 64 GB.
- Unfortunately, 4 GB RAM packages are also available in the market.

## SSD Demand.

In [126]:
# Counting the listings per SSD value once and keeping the five most frequent.
top_ssd_counts = dataset.SSD.value_counts()[ : 5]

# Drawing the bar chart, annotating every bar with its count.
ssd_figure = bar(
    data_frame = top_ssd_counts,
    title = 'SSD Demands - Top 5 on Sale!',
    labels = {'x' : 'SSD Size', 'y' : 'Count'},
    text = top_ssd_counts.values,
)
ssd_figure.show()

<font color = '#FFA500'><h3>Observations:</h3></font>
- 256 GB storage is the most preferable option for people considering its cheaper price and faster usage than HDD.
- For the most part, people tend to use laptops for low-computation tasks such as watching movies.
- Yet, a few other trendy storages are 512 GB, 128 GB, and 1 TB.

# Model Implementation