----
# Data Cleaning
----

### Notebook Summary



## Set Up

In [125]:
import numpy as np
import pandas as pd
import re
import matplotlib


In [126]:
df = pd.read_csv('../../data/headphones_data.csv', index_col = 0)


## Utility Functions

In [127]:
def df_check(df):
    '''
    Outputs quality measures for dataframes

    Paramters
    ---------
    df: DataFrame for quality check

    Returns
    -------
    Statements with data quality info such as shape, duplicated values, missing values
    '''
    
    shape = df.shape
    # Calling sum twice - first sum returns column level results second sum to retrun total null values in all columns
    null_vals = df.isna().sum().sum()
    duplicated_rows = df.duplicated().sum()
    duplicated_cols = df.columns.duplicated().sum()

    print (
    f"""
    Data Quality Checks:
    --------------------------------------------
    No. of rows: {shape[0]}
    No. of columns: {shape[1]}
    No. of missing values: {null_vals}
    No. of duplicated rows: {duplicated_rows}
    No. of duplicated columns: {duplicated_cols}
    """
)
    


In [128]:
def get_colour(description):
    '''
    Outputs colour in product description

    Paramters
    ---------
    description: string of product description

    Returns
    -------
    Colour mentioned in the description
    '''
    
    # Using matplotlib to get list of colours (instead of manually creating a list)    
    colour_names = matplotlib.colors.CSS4_COLORS.keys()

    # Looping through the list of colours to see if any of the colours are in the product description 
    for colour in colour_names:
        if re.search(rf'\b{colour}\b', description):
            return colour
        else:
            # if first colour in the list is not found try the next
            continue
    
    # Cases where no colour in the the colour_list is found in description
    return 'Not Specified'
    

## Preliminary Checks

In [129]:
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 750
    No. of columns: 4
    No. of missing values: 118
    No. of duplicated rows: 196
    No. of duplicated columns: 0
    


### Dealing the with the duplicates

In [130]:
duplicated = df[df.duplicated(keep='first')].sort_values(by = 'Price', ascending=False)

In [131]:
# using size to count number of occurrences of each duplicated headphone
duplicated.groupby(['Description', 'Price', 'Rating', 'Is Prime'])[['Description']].size().sort_values(ascending=False)


Description                                                                                                                                                                                               Price   Rating              Is Prime
INFURTURE Active Noise Cancelling Headphones, H1 Wireless Over Ear Bluetooth Headphones, Deep Bass Headset, Low Latency, Memory Foam Ear Cups,40H Playtime, for Adults, Kids, TV, Travel, Home Office     31.99   4.3 out of 5 stars  0           20
TOZO HT2 Hybrid Active Noise Cancelling Wireless Headphones, 60H Playtime Lossless Audio Over Ear Bluetooth Headphones, Hi-Res Audio Deep Bass Foldable Lightweight Headset for Workout                   58.99   4.5 out of 5 stars  1           17
RUNOLIM Hybrid Active Noise Cancelling Headphones, Wireless Over Ear Bluetooth Headphones with Microphone, 70H Playtime, Foldable Wireless Headphones with HiFi Audio, Deep Bass for Home Travel Office   22.09   4.4 out of 5 stars  0           16
ISOtunes PRO 2.0 EN352 Blu

In [132]:
# Deciding to drop the duplicates after inspection
df = df.drop_duplicates()

In [133]:
# Re-checking dataframe after dropping the duplicated rows
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 554
    No. of columns: 4
    No. of missing values: 112
    No. of duplicated rows: 0
    No. of duplicated columns: 0
    


### Dealing with Missing Values

In [134]:
df.isna().sum()

Description      0
Price            0
Rating         112
Is Prime         0
dtype: int64

In [135]:
# Viewing the null value in rating
df[df['Rating'].isna()]

Unnamed: 0,Description,Price,Rating,Is Prime
147,"Noise Cancelling Headphones, Wireless over Ear...",9.09,,0
254,"PCKOBEVER Bluetooth Wireless Headphones,Cute C...",10.99,,0
309,"Bewinner Cute Cat Ear Gaming Headphones, LED L...",36.25,,1
400,"Wireless Headphones for TV Watching, FM Wirele...",29.98,,0
413,"Gedourain Cat Ears Bluetooth Headset, Stereo H...",22.89,,0
...,...,...,...,...
743,"2 In 1 3D VR Headset Headphones, Adjustable Ki...",39.09,,0
744,"HIJIN Foldable Headphones with Microphone, Wir...",39.29,,0
745,"HIJIN Foldable Headphones with Microphone, Wir...",39.29,,0
746,"HIJIN Foldable Headphones with Microphone, Wir...",39.29,,0


In [136]:
# Dropping the row with the missing rating 
df = df.dropna()

In [137]:
# Re-checking dataframe
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 442
    No. of columns: 4
    No. of missing values: 0
    No. of duplicated rows: 0
    No. of duplicated columns: 0
    


Left with 201 headphones after removing duplicated products, to continue with the cleaning.

## Product Description
----

In [138]:
df['Description']

0      Ailihen C8 Headphones Wired with Microphone Fo...
1      OneOdio Wired Over Ear Headphones Hi-Fi Sound ...
2      Sony MDRZX310L.AE Foldable Headphones - Metall...
3      Sony MDR-ZX110 Overhead Headphones - Black , B...
4      KVIDIO Bluetooth Headphones Over Ear, 65 Hours...
                             ...                        
718    MEE audio Connect Bluetooth Wireless Headphone...
731    ciciglow Portable Pillow Speaker, Mini Sleepin...
740    Yunsailing 100 Packs Earbuds Bulk Classroom He...
748    ciciglow Portable Pillow Speaker, Mini Sleepin...
749    Philips TAT2236GR/00 Wireless Earbuds, Adults ...
Name: Description, Length: 442, dtype: object

### Feature Engineering - Wireless

In [139]:
df['Description'] = df['Description'].str.lower()

In [140]:
is_wireless = []
for row in df['Description']:
    x = bool(re.search('wireless',row))
    is_wireless.append(x)


In [141]:
df['Wireless'] = is_wireless

### Feature Engineering - Noise Cancelling

In [142]:
is_noise_cancel = []
for row in df['Description']: 
    x = bool(re.search('noise[-\s]?cancelling',row.lower()))
    is_noise_cancel.append(x)

In [143]:
df['Noise Cancelling'] = is_noise_cancel

In [144]:
df['Noise Cancelling'].value_counts()

Noise Cancelling
False    376
True      66
Name: count, dtype: int64

### Feature Engineering - Colour


In [145]:
df['Description']

0      ailihen c8 headphones wired with microphone fo...
1      oneodio wired over ear headphones hi-fi sound ...
2      sony mdrzx310l.ae foldable headphones - metall...
3      sony mdr-zx110 overhead headphones - black , b...
4      kvidio bluetooth headphones over ear, 65 hours...
                             ...                        
718    mee audio connect bluetooth wireless headphone...
731    ciciglow portable pillow speaker, mini sleepin...
740    yunsailing 100 packs earbuds bulk classroom he...
748    ciciglow portable pillow speaker, mini sleepin...
749    philips tat2236gr/00 wireless earbuds, adults ...
Name: Description, Length: 442, dtype: object

In [146]:
# Apply the function to each row in the Description column
df['Color'] = df['Description'].apply(get_colour)

In [147]:
df

Unnamed: 0,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Color
0,ailihen c8 headphones wired with microphone fo...,19.99,4.3 out of 5 stars,0,False,False,black
1,oneodio wired over ear headphones hi-fi sound ...,27.99,4.4 out of 5 stars,0,False,False,red
2,sony mdrzx310l.ae foldable headphones - metall...,18.00,4.5 out of 5 stars,0,False,False,blue
3,"sony mdr-zx110 overhead headphones - black , b...",15.97,4.5 out of 5 stars,0,False,False,black
4,"kvidio bluetooth headphones over ear, 65 hours...",16.99,4.5 out of 5 stars,0,True,False,black
...,...,...,...,...,...,...,...
718,mee audio connect bluetooth wireless headphone...,99.99,4.3 out of 5 stars,0,True,False,Not Specified
731,"ciciglow portable pillow speaker, mini sleepin...",18.08,1.0 out of 5 stars,0,True,False,black
740,yunsailing 100 packs earbuds bulk classroom he...,39.23,4.7 out of 5 stars,0,False,False,blue
748,"ciciglow portable pillow speaker, mini sleepin...",15.61,1.0 out of 5 stars,0,True,False,white


### Feature Engineering - Battery Life

In [213]:
def get_battery_life(description):
    regexp = r'((\d+)\s*(battery?|hours?|hrs?|h))'
    if re.search(regexp,description):
        # using .group to only get the int part of the regexp
        return re.search(regexp,description).group(2)    
        
    else:
        return 'Not Specified'

In [214]:
df['Battery Life'] = df['Description'].apply(get_battery_life)

In [215]:
df[df['Battery Life']== 'Not Specified'].shape[0]

317

----
**Comment:**

317 rows where battery life is not specified, to have a spot check at a few of these cases to see if regexp needs updating. 

In [218]:
pd.set_option('display.max_colwidth', None)  # Show full content of each cell

In [219]:
df[df['Battery Life']== 'Not Specified']['Description']

1                     oneodio wired over ear headphones hi-fi sound & bass boosted headphone with 50mm neodymium drivers and 1/4 to 3.5mm audio jack for studio dj amp recording monitoring phones laptop (red)
2                                                                                                                                                         sony mdrzx310l.ae foldable headphones - metallic blue
3                                                                                                                                                 sony mdr-zx110 overhead headphones - black , basic, pack of 1
5                             iclever hs18 over ear headphones with microphone - lightweight stereo headphones, adjustable foldable wired headphones with 3.5mm jack for online class/meeting/pc/phone/computer
6      artix cl750 wired headphones with mic & volume control — 90% noise cancelling headphones wired, over ear head phone cable — foldable plug in headphones for lapto

-----
**Comment:**

Don't see anything that can be added to regexp pattern, will continue with analysis

### Feature Engineering - Microphone

In [152]:
def has_microphone(description):
    regexp = r'\b(mic?|microphone?)\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [153]:
df['Microphone'] = df['Description'].apply(has_microphone
                                           )

### Feature Engineering: Over Ear

In [154]:
def is_over_ear(description):
    regexp = r'\b(over[\s-]ear?|overhead?)\b'


    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [155]:
df['Over Ear'] = df['Description'].apply(is_over_ear)

In [156]:
df[df['Over Ear'] == 0]

Unnamed: 0,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Color,Battery Life,Microphone,Over Ear
0,ailihen c8 headphones wired with microphone fo...,19.99,4.3 out of 5 stars,0,False,False,black,8 h,1,0
2,sony mdrzx310l.ae foldable headphones - metall...,18.00,4.5 out of 5 stars,0,False,False,blue,Not Specified,0,0
8,roxel rx-90 wired headphones with microphone -...,12.99,4.3 out of 5 stars,0,False,False,black,Not Specified,1,0
11,sony zx310ap on-ear headphones compatible with...,18.99,4.4 out of 5 stars,0,False,False,black,Not Specified,0,0
14,soundcore by anker q30 hybrid active noise can...,49.00,4.6 out of 5 stars,1,False,True,Not Specified,30 h,0,0
...,...,...,...,...,...,...,...,...,...,...
718,mee audio connect bluetooth wireless headphone...,99.99,4.3 out of 5 stars,0,True,False,Not Specified,Not Specified,0,0
731,"ciciglow portable pillow speaker, mini sleepin...",18.08,1.0 out of 5 stars,0,True,False,black,Not Specified,0,0
740,yunsailing 100 packs earbuds bulk classroom he...,39.23,4.7 out of 5 stars,0,False,False,blue,Not Specified,0,0
748,"ciciglow portable pillow speaker, mini sleepin...",15.61,1.0 out of 5 stars,0,True,False,white,Not Specified,0,0


### Feature Engineering : Foldable 

In [157]:
def is_foldable(description):
    regexp = r'\bfoldable\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [158]:
df['Foldable'] = df['Description'].apply(is_foldable)

### Feature Engineering : Brand

In [159]:
import spacy

In [160]:
def get_brand(description):
    # Load a pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Process the text
    doc = nlp(description)
    # Extract named entities
    for ent in doc.ents:
        if ent.label_ == "ORG":  # ORG for organisations/brands
            return ent.text
        else:
            return 'Unknown Brand'

In [161]:
df['Brand'] = df['Description'].apply(get_brand)

In [162]:
df

Unnamed: 0,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Color,Battery Life,Microphone,Over Ear,Foldable,Brand
0,ailihen c8 headphones wired with microphone fo...,19.99,4.3 out of 5 stars,0,False,False,black,8 h,1,0,1,c8
1,oneodio wired over ear headphones hi-fi sound ...,27.99,4.4 out of 5 stars,0,False,False,red,Not Specified,0,1,0,Unknown Brand
2,sony mdrzx310l.ae foldable headphones - metall...,18.00,4.5 out of 5 stars,0,False,False,blue,Not Specified,0,0,1,sony
3,"sony mdr-zx110 overhead headphones - black , b...",15.97,4.5 out of 5 stars,0,False,False,black,Not Specified,0,1,0,sony
4,"kvidio bluetooth headphones over ear, 65 hours...",16.99,4.5 out of 5 stars,0,True,False,black,65 hours,1,1,1,Unknown Brand
...,...,...,...,...,...,...,...,...,...,...,...,...
718,mee audio connect bluetooth wireless headphone...,99.99,4.3 out of 5 stars,0,True,False,Not Specified,Not Specified,0,0,0,Unknown Brand
731,"ciciglow portable pillow speaker, mini sleepin...",18.08,1.0 out of 5 stars,0,True,False,black,Not Specified,0,0,0,Unknown Brand
740,yunsailing 100 packs earbuds bulk classroom he...,39.23,4.7 out of 5 stars,0,False,False,blue,Not Specified,0,0,0,Unknown Brand
748,"ciciglow portable pillow speaker, mini sleepin...",15.61,1.0 out of 5 stars,0,True,False,white,Not Specified,0,0,0,Unknown Brand


In [163]:
df['Brand'].value_counts()

Brand
Unknown Brand                       273
sony                                  6
rgb                                   6
c8                                    4
doqaus bluetooth headphones over      3
jyps kids wireless                    3
konnao kids                           2
osszit kids headphones                2
samsung                               2
louise & mann                         2
jyps                                  2
betron                                2
android                               2
radio & wired                         2
doqaus bluetooth headphones           2
usb                                   1
tv & pc                               1
microphone & control black            1
philips tat8506wt                     1
panasonic                             1
670nc                                 1
microphone & volume limited           1
xosda kids                            1
xosda bulk                            1
200h                              

-----
**Comment:**

My attempt at using SpaCy to extract the brand names from Product Descrption did not work too well. 

Majority of the data set containes brand names which are unknown and the ones it managed to find - most are not a brand and are common words in description like wireless/bluetooth or a colors. Therefore, I will drop this column from the dataframe.

In [170]:
df = df.drop(columns = ['Brand'], axis = 0)

## Final Clean Up
---

In [172]:
df.head(10)

Unnamed: 0,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Color,Battery Life,Microphone,Over Ear,Foldable
0,ailihen c8 headphones wired with microphone fo...,19.99,4.3,0,False,False,black,8 h,1,0,1
1,oneodio wired over ear headphones hi-fi sound ...,27.99,4.4,0,False,False,red,Not Specified,0,1,0
2,sony mdrzx310l.ae foldable headphones - metall...,18.0,4.5,0,False,False,blue,Not Specified,0,0,1
3,"sony mdr-zx110 overhead headphones - black , b...",15.97,4.5,0,False,False,black,Not Specified,0,1,0
4,"kvidio bluetooth headphones over ear, 65 hours...",16.99,4.5,0,True,False,black,65 hours,1,1,1
5,iclever hs18 over ear headphones with micropho...,16.99,4.5,0,False,False,Not Specified,Not Specified,1,1,1
6,artix cl750 wired headphones with mic & volume...,21.3,4.2,0,False,True,Not Specified,Not Specified,1,1,1
7,"rebocico noise cancelling headphones, 80h play...",29.99,4.2,0,True,True,silver,80h,1,1,0
8,roxel rx-90 wired headphones with microphone -...,12.99,4.3,0,False,False,black,Not Specified,1,0,0
9,"headphones wired over ear adult, stereo hifi m...",8.99,3.6,0,False,False,Not Specified,Not Specified,0,1,1


### Rating

To remove `out of 5 stars` as this information is redundant information.

In [220]:
df['Rating'] = df['Rating'].astype('str')

In [221]:
df['Rating'] = df['Rating'].str.replace('out of 5 stars', '')

In [222]:
df['Rating'] = df['Rating'].astype(float)

## Export to CSV
----

In [232]:
cleaned_df = df.copy()

In [233]:
cleaned_df = cleaned_df.reset_index(drop = True)

In [234]:
cleaned_df.to_csv('../../data/cleaned_headphones_data.csv')

In [235]:
df_check(cleaned_df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 442
    No. of columns: 11
    No. of missing values: 0
    No. of duplicated rows: 0
    No. of duplicated columns: 0
    


## Conclusion
-------

