----
# Data Cleaning
----

### Notebook Summary



## Set Up

In [753]:
import numpy as np
import pandas as pd
import re
import matplotlib


In [754]:
df = pd.read_csv('../../data/headphones_data.csv', index_col = 0)


## Utility Functions

In [755]:
def df_check(df):
    '''
    Outputs quality measures for dataframes

    Paramters
    ---------
    df: DataFrame for quality check

    Returns
    -------
    Statements with data quality info such as shape, duplicated values, missing values
    '''
    
    shape = df.shape
    # Calling sum twice - first sum returns column level results second sum to retrun total null values in all columns
    null_vals = df.isna().sum().sum()
    duplicated_rows = df.duplicated().sum()
    duplicated_cols = df.columns.duplicated().sum()

    print (
    f"""
    Data Quality Checks:
    --------------------------------------------
    No. of rows: {shape[0]}
    No. of columns: {shape[1]}
    No. of missing values: {null_vals}
    No. of duplicated rows: {duplicated_rows}
    No. of duplicated columns: {duplicated_cols}
    """
)
    


In [756]:
def get_colour(description):
    '''
    Outputs colour in product description

    Paramters
    ---------
    description: string of product description

    Returns
    -------
    Colour mentioned in the description
    '''
    
    # Using matplotlib to get list of colours (instead of manually creating a list)    
    colour_names = matplotlib.colors.CSS4_COLORS.keys()

    # Looping through the list of colours to see if any of the colours are in the product description 
    for colour in colour_names:
        if re.search(rf'\b{colour}\b', description):
            return colour
        else:
            # if first colour in the list is not found try the next
            continue
    
    # Cases where no colour in the the colour_list is found in description
    return 'Not Specified'
    

## Preliminary Checks

In [757]:
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 300
    No. of columns: 4
    No. of missing values: 1
    No. of duplicated rows: 89
    No. of duplicated columns: 0
    


### Dealing the with the duplicates

In [758]:
duplicated = df[df.duplicated(keep='first')].sort_values(by = 'Price', ascending=False)

In [759]:
# using size to count number of occurrences of each duplicated headphone
duplicated.groupby(['Description', 'Price', 'Rating', 'Is Prime'])[['Description']].size().sort_values(ascending=False)


Description                                                                                                                                                                                               Price   Rating              Is Prime
INFURTURE Active Noise Cancelling Headphones, H1 Wireless Over Ear Bluetooth Headphones, Deep Bass Headset, Low Latency, Memory Foam Ear Cups,40H Playtime, for Adults, Kids, TV, Travel, Home Office     31.99   4.3 out of 5 stars  0           9
Wireless Earbuds, Bluetooth 5.3 Headphones with 4 ENC Noise Canceling Mic, 50H Stereo Dual LED Display Ear Buds, Sport Wireless Earphones with Earhooks, IP7 Waterproof Wireless Headphones for Running   19.99   4.7 out of 5 stars  0           8
TOZO HT2 Hybrid Active Noise Cancelling Wireless Headphones, 60H Playtime Lossless Audio Over Ear Bluetooth Headphones, Hi-Res Audio Deep Bass Foldable Lightweight Headset for Workout                   58.99   4.5 out of 5 stars  1           8
OneOdio Wired Over Ear Headph

In [760]:
# Deciding to drop the duplicates after inspection
df = df.drop_duplicates()

In [761]:
# Re-checking dataframe after dropping the duplicated rows
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 211
    No. of columns: 4
    No. of missing values: 1
    No. of duplicated rows: 0
    No. of duplicated columns: 0
    


### Dealing with Missing Values

In [762]:
df.isna().sum()

Description    0
Price          0
Rating         1
Is Prime       0
dtype: int64

In [763]:
# Viewing the null value in rating
df[df['Rating'].isna()]

Unnamed: 0,Description,Price,Rating,Is Prime
152,"Noise Cancelling Headphones, Wireless over Ear Bluetooth Headphones, With Mic 3-in-1multi-function Headset Folded and Stored Easily Memory Foam Ear Cups for Travel, Home Office #9",9.09,,0


In [764]:
# Dropping the row with the missing rating 
df = df.dropna()

In [765]:
# Re-checking dataframe
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 210
    No. of columns: 4
    No. of missing values: 0
    No. of duplicated rows: 0
    No. of duplicated columns: 0
    


Left with 201 headphones after removing duplicated products, to continue with the cleaning.

## Product Description
----

In [766]:
df['Description']

0                                               Skullcandy Hesh ANC Over-Ear Noise Cancelling Wireless Headphones, 22 Hr Battery, Microphone, Works with iPhone Android and Bluetooth Devices - True Black
1                OneOdio Wired Over Ear Headphones Hi-Fi Sound & Bass Boosted headphone with 50mm Neodymium Drivers and 1/4 to 3.5mm Audio Jack for Studio DJ AMP Recording Monitoring Phones Laptop (Red)
2                                                                                                                                                    Sony MDRZX310L.AE Foldable Headphones - Metallic Blue
3          KVIDIO Bluetooth Headphones Over Ear, 65 Hours Playtime Wireless Headphones with Microphone, Foldable Lightweight Headset with Deep Bass,HiFi Stereo Sound for Travel Work PC Cellphone (Black)
4                                                                                                                                            Sony MDR-ZX110 Overhead Headphones - Black , BA

### Feature Engineering - Wireless

In [767]:
df['Description'] = df['Description'].str.lower()

In [768]:
is_wireless = []
for row in df['Description']:
    x = bool(re.search('wireless',row))
    is_wireless.append(x)


In [769]:
df['Wireless'] = is_wireless

### Feature Engineering - Noise Cancelling

In [770]:
is_noise_cancel = []
for row in df['Description']: 
    x = bool(re.search('noise[-\s]?cancelling',row.lower()))
    is_noise_cancel.append(x)

In [771]:
df['Noise Cancelling'] = is_noise_cancel

In [772]:
df['Noise Cancelling'].value_counts()

Noise Cancelling
False    177
True      33
Name: count, dtype: int64

### Feature Engineering - Colour


In [773]:
df['Description']

0                                               skullcandy hesh anc over-ear noise cancelling wireless headphones, 22 hr battery, microphone, works with iphone android and bluetooth devices - true black
1                oneodio wired over ear headphones hi-fi sound & bass boosted headphone with 50mm neodymium drivers and 1/4 to 3.5mm audio jack for studio dj amp recording monitoring phones laptop (red)
2                                                                                                                                                    sony mdrzx310l.ae foldable headphones - metallic blue
3          kvidio bluetooth headphones over ear, 65 hours playtime wireless headphones with microphone, foldable lightweight headset with deep bass,hifi stereo sound for travel work pc cellphone (black)
4                                                                                                                                            sony mdr-zx110 overhead headphones - black , ba

In [774]:
# Apply the function to each row in the Description column
df['Color'] = df['Description'].apply(get_colour)

In [775]:
df

Unnamed: 0,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Color
0,"skullcandy hesh anc over-ear noise cancelling wireless headphones, 22 hr battery, microphone, works with iphone android and bluetooth devices - true black",79.99,4.5 out of 5 stars,1,True,True,black
1,oneodio wired over ear headphones hi-fi sound & bass boosted headphone with 50mm neodymium drivers and 1/4 to 3.5mm audio jack for studio dj amp recording monitoring phones laptop (red),27.99,4.4 out of 5 stars,0,False,False,red
2,sony mdrzx310l.ae foldable headphones - metallic blue,18.00,4.5 out of 5 stars,0,False,False,blue
3,"kvidio bluetooth headphones over ear, 65 hours playtime wireless headphones with microphone, foldable lightweight headset with deep bass,hifi stereo sound for travel work pc cellphone (black)",16.99,4.5 out of 5 stars,0,True,False,black
4,"sony mdr-zx110 overhead headphones - black , basic, pack of 1",15.97,4.5 out of 5 stars,0,False,False,black
...,...,...,...,...,...,...,...
294,"kids headphones,cat ear bluetooth headphones led light up kids wireless headphones over ear with microphone,foldable bluetooth stereo over-ear kids headsets with mic for phone/laptop/pc/tv",15.38,4.1 out of 5 stars,0,True,False,Not Specified
295,"niurome wireless headphones, kids headphones cat ear headphones, foldable headphones for boys girls, cute led children headphones with mic, safe volume 85/90db bluetooth 5.0 for pc/phone/pad-blue",11.99,2.9 out of 5 stars,0,True,False,blue
296,"s smazinstar usb headset with microphone for pc laptop, professional headphones for adult kids adjustable noise cancelling business office computer headsets for with in-line control (1 set)",11.99,3.8 out of 5 stars,0,False,True,Not Specified
297,"avantree acuear pebble - bluetooth personal sound amplifier with wireless table mic & open-ear headphones 2-in-1 for seniors with clear voice transmission, comfortable fit, and charging case",119.99,3.5 out of 5 stars,1,True,False,Not Specified


### Feature Engineering - Battery Life

In [776]:
def get_battery_life(description):
    regexp = r'(\d+\s*(battery?|hours?|hrs?|h))'
    if re.search(regexp,description):
        return re.search(regexp,description).group()        
    else:
        return 'Not Specified'

In [777]:
df['Battery Life'] = df['Description'].apply(get_battery_life)

In [778]:
df[df['Battery Life']== 'Not Specified'].shape[0]

142

----
**Comment:**

142 rows where battery life is not specified, to have a spot check at a few of these cases to see if regexp needs updating. 

In [779]:
df[df['Battery Life']== 'Not Specified']['Description']

1                 oneodio wired over ear headphones hi-fi sound & bass boosted headphone with 50mm neodymium drivers and 1/4 to 3.5mm audio jack for studio dj amp recording monitoring phones laptop (red)
2                                                                                                                                                     sony mdrzx310l.ae foldable headphones - metallic blue
4                                                                                                                                             sony mdr-zx110 overhead headphones - black , basic, pack of 1
5      roxel rx-90 wired headphones with microphone - lightweight on ear headphones for android/ios devices - comfortable head cushion ergonomic - answer incoming calls - perfect for music lovers (black)
7        oneodio dj headphones, over ear headphones for studio monitoring and mixing, professional headset with stereo bass sound, foldable headphones suitable for electric drum keyboa

-----
**Comment:**

Don't see anything that can be added to regexp pattern, will continue with analysis

### Feature Engineering - Microphone

In [780]:
def has_microphone(description):
    regexp = r'\b(mic?|microphone?)\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [781]:
df['Microphone'] = df['Description'].apply(has_microphone
                                           )

### Feature Engineering: Over Ear

In [782]:
def is_over_ear(description):
    regexp = r'\b(over[\s-]ear?|overhead?)\b'


    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [783]:
df['Over Ear'] = df['Description'].apply(is_over_ear)

In [784]:
df[df['Over Ear'] == 0]

Unnamed: 0,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Color,Battery Life,Microphone,Over Ear
2,sony mdrzx310l.ae foldable headphones - metallic blue,18.00,4.5 out of 5 stars,0,False,False,blue,Not Specified,0,0
5,roxel rx-90 wired headphones with microphone - lightweight on ear headphones for android/ios devices - comfortable head cushion ergonomic - answer incoming calls - perfect for music lovers (black),12.99,4.3 out of 5 stars,0,False,False,black,Not Specified,1,0
6,"wireless earbuds, bluetooth 5.3 headphones with 4 enc noise canceling mic, 50h stereo dual led display ear buds, sport wireless earphones with earhooks, ip7 waterproof wireless headphones for running",19.99,4.7 out of 5 stars,0,True,False,Not Specified,3 h,1,0
11,"sony zx310ap on-ear headphones compatible with smartphones, tablets and mp3 devices - metallic black",18.99,4.4 out of 5 stars,0,False,False,black,Not Specified,0,0
14,"soundcore by anker q30 hybrid active noise cancelling headphones with multiple modes, hi-res sound, custom eq via app, 40h playtime, comfortable fit, bluetooth headphones, multipoint connection",49.00,4.6 out of 5 stars,1,False,True,Not Specified,30 h,0,0
...,...,...,...,...,...,...,...,...,...,...
284,"yomuse c89 kids headphones, wired headphone with microphone, on ear headphone with adjustable, foldable headphones for school travel children girls boys adults (camo grey)",11.99,4.2 out of 5 stars,0,False,False,grey,Not Specified,1,0
286,"riwbox wt-7s kids bluetooth headphones light up, foldable stero wireless headset with microphone and volume control for pc/tablet/tv/travel",24.98,4.6 out of 5 stars,0,True,False,Not Specified,Not Specified,1,0
295,"niurome wireless headphones, kids headphones cat ear headphones, foldable headphones for boys girls, cute led children headphones with mic, safe volume 85/90db bluetooth 5.0 for pc/phone/pad-blue",11.99,2.9 out of 5 stars,0,True,False,blue,Not Specified,1,0
296,"s smazinstar usb headset with microphone for pc laptop, professional headphones for adult kids adjustable noise cancelling business office computer headsets for with in-line control (1 set)",11.99,3.8 out of 5 stars,0,False,True,Not Specified,Not Specified,1,0


### Feature Engineering : Foldable 

In [785]:
def is_foldable(description):
    regexp = r'\bfoldable\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [786]:
df['Foldable'] = df['Description'].apply(is_foldable)

## Final Clean Up
---

In [787]:
df.head(10)

Unnamed: 0,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Color,Battery Life,Microphone,Over Ear,Foldable
0,"skullcandy hesh anc over-ear noise cancelling wireless headphones, 22 hr battery, microphone, works with iphone android and bluetooth devices - true black",79.99,4.5 out of 5 stars,1,True,True,black,22 hr,1,1,0
1,oneodio wired over ear headphones hi-fi sound & bass boosted headphone with 50mm neodymium drivers and 1/4 to 3.5mm audio jack for studio dj amp recording monitoring phones laptop (red),27.99,4.4 out of 5 stars,0,False,False,red,Not Specified,0,1,0
2,sony mdrzx310l.ae foldable headphones - metallic blue,18.0,4.5 out of 5 stars,0,False,False,blue,Not Specified,0,0,1
3,"kvidio bluetooth headphones over ear, 65 hours playtime wireless headphones with microphone, foldable lightweight headset with deep bass,hifi stereo sound for travel work pc cellphone (black)",16.99,4.5 out of 5 stars,0,True,False,black,65 hours,1,1,1
4,"sony mdr-zx110 overhead headphones - black , basic, pack of 1",15.97,4.5 out of 5 stars,0,False,False,black,Not Specified,0,1,0
5,roxel rx-90 wired headphones with microphone - lightweight on ear headphones for android/ios devices - comfortable head cushion ergonomic - answer incoming calls - perfect for music lovers (black),12.99,4.3 out of 5 stars,0,False,False,black,Not Specified,1,0,0
6,"wireless earbuds, bluetooth 5.3 headphones with 4 enc noise canceling mic, 50h stereo dual led display ear buds, sport wireless earphones with earhooks, ip7 waterproof wireless headphones for running",19.99,4.7 out of 5 stars,0,True,False,Not Specified,3 h,1,0,0
7,"oneodio dj headphones, over ear headphones for studio monitoring and mixing, professional headset with stereo bass sound, foldable headphones suitable for electric drum keyboard guitar amplifier",30.58,4.4 out of 5 stars,0,False,False,Not Specified,Not Specified,0,1,1
8,"iclever hs18 over ear headphones with microphone - lightweight stereo headphones, adjustable foldable wired headphones with 3.5mm jack for online class/meeting/pc/phone/computer",16.99,4.5 out of 5 stars,0,False,False,Not Specified,Not Specified,1,1,1
9,"headphones wired over ear adult, stereo hifi music headphone foldable compact wired headset",8.99,3.6 out of 5 stars,0,False,False,Not Specified,Not Specified,0,1,1


### Rating

To remove `out of 5 stars` as this information is redundant information.

In [788]:
df['Rating'] = df['Rating'].astype('str')

In [789]:
df['Rating'] = df['Rating'].str.replace('out of 5 stars', '')

In [790]:
df['Rating'] = df['Rating'].astype(float)

## Export to CSV
----

In [791]:
cleaned_df = df.copy()

In [752]:
cleaned_df.to_csv('../../data/cleaned_headphones_data.csv')