# Loading Dependencies

In [15]:
# Data Manipulation.
from pandas import read_csv, Series

# Data Cleaning.
from re import findall, search

# Static Configurations - Global Variables

In [24]:
#region Data Cleaning

# Price -> String to Float.
PATTERN_PRICE = r'\d{1,3}(?:,\d{3})*\.\d+|\d+\.\d+'

# RAM -> String to Integer.
PATTERN_RAM = r'(\d+)\s*(TB|GB)'

#endregion

# Data Exploration

In [25]:
# Fetching the data scrapped.
dataset = read_csv('eBay Laptop Description.csv')

In [26]:
# Taking a first look on the data.
dataset

Unnamed: 0.1,Unnamed: 0,href,Title,Price,RAM,Brand,Hdd,Condition,Seller notes,SSD,Rating,Reviews
0,0,https://www.ebay.com/itm/166611579675?epid=132...,Apple Macbook Pro 13.3” 2.5GHz intel Core i5 1...,US $249.00,16 GB,Apple,1 TB,Used: An item that has been used previously. T...,,1 TB,"['4.8', '5.0', '5.0', '5.0']",['I received the MacBook Pro and it was almost...
1,1,https://www.ebay.com/itm/285754711451?itmmeta=...,"Apple MacBook Pro 15"" A1286 2.3GHz Core i7 16G...",US $219.00,16 GB,Apple,256 GB,Used: An item that has been used previously. T...,,240GB,"['4.9', '5.0', '5.0', '5.0']",['Muy buena computadora. En excelentes condici...
2,2,https://www.ebay.com/itm/334577965892?itmmeta=...,Apple Macbook Air 13 (2015) | i5 8GB + 512GB S...,US $246.05/ea,8 GB,Apple,256 GB,Used,“2 YEAR WARRANTY INCLUDED!!! LAPTOP ARE FULLY ...,128-512 GB,"['4.8', '5.0', '4.9', '5.0']",['Opened it as soooon as the mail man brought ...
3,3,https://www.ebay.com/itm/235462992167?itmmeta=...,SONOMA MacBook Pro 15 RETINA / 4.0GHz QUAD COR...,US $665.00/ea,8 GB,Apple,2 TB,Used,“Fully tested and verified! Good condition ove...,2 TB,"['4.9', '5.0', '5.0', '5.0']",['Everything as promised. Fast shipping. Using...
4,4,https://www.ebay.com/p/20029930090?iid=2260712...,"Apple MacBook Pro 13"" (128GB SSD, Intel Core i...",US $36.00,8GB RAM,Apple,256 GB,,,128GB SSD,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
19453,19453,https://www.ebay.com/itm/285812494815?itmmeta=...,iPad air 4th generation 64gb wifi Sky Blue,US $270.00,64 GB,Apple,64 GB,Used: An item that has been used previously. T...,,256 GB,[],"[""Arrived on time and in good condition, and w..."
19454,19454,https://www.ebay.com/itm/335323518419?itmmeta=...,APPLE MACBOOK AIR A1466 I5-3427U @ 1.8GHz 4GB ...,US $34.99,4 GB,Apple,256 GB,For parts or not working,“Please see the pictures of its cosmetic condi...,256 GB,"['4.8', '4.7', '5.0', '5.0']",['Dell 3590 laptop received as described. Comp...
19455,19455,https://www.ebay.com/itm/305231918484?itmmeta=...,Apple iBook G3/366 M6411 | Apple Mac OS 9.0.4 ...,US $399.00,128 MB,Apple,10 GB,Used,"“Tested to power on. Battery charges, but no g...",256 GB,"['5.0', '4.9', '5.0', '5.0']","['Fast shipping, great communication, and exce..."
19456,19456,https://www.ebay.com/itm/134788641380?itmmeta=...,Apple 2023 MacBook Pro 14 inch M3 chip 512GB S...,"C $2,969.99",8 GB,Apple,512 GB,"New: A brand-new, unused, unopened, undamaged ...",,512 GB,"['4.8', '4.8', '4.9', '4.9']","['Got a used RTX 3070, item was a little dirty..."


<font color = '#FFA500'><h3>Observations:</h3></font>
- First two fields are just some form of ID and can be removed.
- The `href` link might be useful for recommending a set of laptops in the future work; but for now, dropping it would be a better choice.
- Can extract missing data of a row using `Title` field.
- The `Price` has to be processed to convert it from string to actual floating figure.
- The `Rating` determines the seller reviews regarding product, which might be helpful to determine public views towards laptop.
- Need to clean `Price`, `RAM`, and `SSD` for better understanding of data by converting them from object to numeric form.

# Data Cleaning

## Missing Data Identification

In [27]:
# Finding the number of missing values in the dataset.
dataset.isna().sum()

Unnamed: 0         0
href               0
Title              0
Price              0
RAM              133
Brand              4
Hdd              177
Condition       2723
Seller notes    8597
SSD              182
Rating             0
Reviews            0
dtype: int64

In [28]:
class DataCleaning:

    def __init__(self, pattern : str) -> None:

        self.pattern = pattern
    
    def clean_data_re(self, string : str) -> float | None:
        
        # Preventing from missing values.
        try:

            # Searching the pattern from the given string.
            result = search(pattern = self.pattern, string = str(object = string))
            
            # If the pattern is found, send it.
            if result:
                return result.group()
            
        except:
            pass
        
        return None
    
    def set_pattern(self, pattern : str) -> None:
        
        self.__init__(pattern = pattern)

## Price -> String to Float.

In [29]:
data_cleaning = DataCleaning(pattern = PATTERN_PRICE)
dataset.Price.apply(data_cleaning.clean_data_re)

0          249.00
1          219.00
2          246.05
3          665.00
4           36.00
           ...   
19453      270.00
19454       34.99
19455      399.00
19456    2,969.99
19457      429.24
Name: Price, Length: 19458, dtype: object

## RAM -> String to Category.

In [33]:
# Changing the pattern for RAM.
data_cleaning.set_pattern(pattern = PATTERN_RAM)

# Cleaning the data.
data = dataset.RAM.apply(data_cleaning.clean_data_re)

# Validating the results stored.
data.value_counts()

Series([], Name: RAM, dtype: int64)

Taking "16" from "16 GB".

In [34]:
# Considering only unit value of the
dataset.RAM = Series(map(lambda d : findall(r'\d+', d)[0],
                         filter(lambda d : d is not None, data)))

# Validating the results stored.
dataset.RAM.value_counts()

Series([], Name: RAM, dtype: int64)

## SSD -> String to Category.