----
# Data Cleaning
----

### Notebook Summary



## Set Up

In [64]:
import numpy as np
import pandas as pd
import re
import matplotlib


In [65]:
df = pd.read_csv('../../data/headphones_data.csv', index_col = 0)


## Utility Functions

In [66]:
def df_check(df):
    '''
    Outputs quality measures for dataframes

    Paramters
    ---------
    df: DataFrame for quality check

    Returns
    -------
    Statements with data quality info such as shape, duplicated values, missing values
    '''
    
    shape = df.shape
    # Calling sum twice - first sum returns column level results second sum to retrun total null values in all columns
    null_vals = df.isna().sum().sum()
    duplicated_rows = df.duplicated().sum()
    duplicated_cols = df.columns.duplicated().sum()

    print (
    f"""
    Data Quality Checks:
    --------------------------------------------
    No. of rows: {shape[0]}
    No. of columns: {shape[1]}
    No. of missing values: {null_vals}
    No. of duplicated rows: {duplicated_rows}
    No. of duplicated columns: {duplicated_cols}
    """
)
    


In [67]:
def get_colour(description):
    '''
    Outputs colour in product description

    Paramters
    ---------
    description: string of product description

    Returns
    -------
    Colour mentioned in the description
    '''
    
    # Using matplotlib to get list of colours (instead of manually creating a list)    
    colour_names = matplotlib.colors.CSS4_COLORS.keys()

    # Looping through the list of colours to see if any of the colours are in the product description 
    for colour in colour_names:
        if re.search(rf'\b{colour}\b', description):
            return colour
        else:
            # if first colour in the list is not found try the next
            continue
    
    # Cases where no colour in the the colour_list is found in description
    return 'Not Specified'
    

## Preliminary Checks

In [68]:
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 1500
    No. of columns: 5
    No. of missing values: 695
    No. of duplicated rows: 365
    No. of duplicated columns: 0
    


### Dealing the with the duplicates

In [69]:
duplicated = df[df.duplicated(keep='first')].sort_values(by = 'Price', ascending=False)

In [70]:
# using size to count number of occurrences of each duplicated headphone
duplicated.groupby(['Description', 'Price', 'Rating', 'Is Prime'])[['Description']].size().sort_values(ascending=False)


Description                                                                                                                                                                                               Price          Rating              Is Prime
Oladance Open Ear Headphones Bluetooth 5.2 Wireless Earbuds for Android & iPhone, Open Ear Earbuds with Dual 16.5mm Dynamic Drivers, Up To 16 Hours Playtime Waterproof Sport Earbuds -Interstellar Blue  149.99         4.3 out of 5 stars  1           44
Technics EAH-A800E-S Wireless Headphones, Over Ear Multipoint Bluetooth Earphones With Noise Cancelling and Microphone, Ergonomic Fit, Up to 50 Hours Playtime, Easy Connection, Folding Design, Silver   249.00         4.2 out of 5 stars  0           41
Wantek Headsets with Microphone 2.5mm Noise Cancelling Headphones Microphone, Wired Computer Telephone Headset for PC Laptop, Call Centre, Video Conferencing                                             15.99          4.1 out of 5 stars  0           2

In [71]:
# Deciding to drop the duplicates after inspection
df = df.drop_duplicates()

In [72]:
# Re-checking dataframe after dropping the duplicated rows
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 1135
    No. of columns: 5
    No. of missing values: 640
    No. of duplicated rows: 0
    No. of duplicated columns: 0
    


### Dealing with Missing Values

In [73]:
df.isna().sum()

Product ID       0
Description      0
Price            0
Rating         640
Is Prime         0
dtype: int64

In [74]:
# Viewing the null value in rating
df[df['Rating'].isna()]

Unnamed: 0,Product ID,Description,Price,Rating,Is Prime
178,B0CW9FL8BT,"Noise Cancelling Headphones, Wireless over Ear Bluetooth Headphones, With Mic 3-in-1multi-function Headset Folded and Stored Easily Memory Foam Ear Cups for Travel, Home Office #9",9.09,,0
274,B0D7ZPP1LZ,"CN-Outlet Bulk Earbud Headphones 100 Pack for Classroom Kids,Wholeslae Wired Durable Earphones Class Set for Students Children Toddler Teens Boys Girls and Adult(CN102,Black)",516.82,,0
284,B0CHMWYVTP,"NCRD Quiet Comfort Wireless Bluetooth Headphones, Noise-Cancelling, Wireless Bluetooth Headphones, Deep Bass, Hi-Fi Sound, 20H Playtime Headset for Adults, TV, Online Class, Home Office",371.52,,0
308,B0D89ZZK43,"PCKOBEVER Bluetooth Wireless Headphones,Cute Cat Ear Earphone For Kids,Over Ear Headsets Foldable Stereo Headphones LED Light Up,Wireless Headphones With Microphone For Kids Girls Boys(Pink)",10.99,,0
309,B0BNWYLGCS,"Bewinner Cute Cat Ear Gaming Headphones, LED Lights 3.5mm Wired Wireless BT Foldable Gaming Headset for PC, Laptop Headset with Noise Canceling Microphone for Gift, Game (Green)",36.90,,1
...,...,...,...,...,...
1494,B0CJ8B1BBK,"Air Conduction Headphones for Kids and Adults Open Ear Sports Bluetooth 5.0 Headphones, HiFi Stereo Sound Sweatproof Headset for Running, Workout",8.12,,0
1496,B0CCPB4S8F,"Cat Ear Headphones, Cute Wireless Headsets, HiFi Sound Headset With Light, Over-Ear Adjustable Stereo, Comfortable Folding Headphones With Microphone For Adults And Kids",19.36,,0
1497,B0B15N122V,"Headphones Headset with Deep Bass Stereo Sound, Comfortable Earpads for Adults, Kids, TV, Travel, Home Office (Gold)",20.97,,0
1498,B0CM3PWX8B,"Noise Cancelling Headphones, Bluetooth Headphone Wireless Active Noise Cancelling Headset Head-Worn Deep Bass Binaural Stereo Earphone Over Ear Foldable Earbuds for Phone Laptop Game Office Sport (Co",Not Specified,,0


In [75]:
# Dropping the row with the missing rating 
df = df.dropna()

In [76]:
# Re-checking dataframe
df_check(df)


    Data Quality Checks:
    --------------------------------------------
    No. of rows: 495
    No. of columns: 5
    No. of missing values: 0
    No. of duplicated rows: 0
    No. of duplicated columns: 0
    


Left with 201 headphones after removing duplicated products, to continue with the cleaning.

### Dealing with Not Specified 

When scraping if information was not present I added a placeholder 'NOT SPECIFIED' I would have to get rid of this in certain columns where I need all info: Price

In [77]:
df.columns

Index(['Product ID', 'Description', 'Price', 'Rating', 'Is Prime'], dtype='object')

In [78]:
df['Price'].value_counts()

Price
19.99            32
Not Specified    23
15.99            17
17.99            16
24.99            16
                 ..
14.00             1
15.00             1
32.59             1
69.71             1
20.16             1
Name: count, Length: 244, dtype: int64

In [79]:
df = df[df['Price'] != 'Not Specified']

In [80]:
df['Price'].value_counts()

Price
19.99    32
15.99    17
24.99    16
17.99    16
14.99    14
         ..
14.00     1
15.00     1
32.59     1
69.71     1
20.16     1
Name: count, Length: 243, dtype: int64

## Product Description
----

In [81]:
df['Description']

0         INFURTURE Active Noise Cancelling Headphones, H1 Wireless Over Ear Bluetooth Headphones, Deep Bass Headset, Low Latency, Memory Foam Ear Cups,40H Playtime, for Adults, Kids, TV, Travel, Home Office
1       RUNOLIM Hybrid Active Noise Cancelling Headphones, Wireless Over Ear Bluetooth Headphones with Microphone, 70H Playtime, Foldable Wireless Headphones with HiFi Audio, Deep Bass for Home Travel Office
2                                                                                                                                                         Sony MDRZX310L.AE Foldable Headphones - Metallic Blue
3                                                                                                                                                 Sony MDR-ZX110 Overhead Headphones - Black , BASIC, Pack of 1
4                     LORELEI X6 Over-Ear Headphones with Microphone, Lightweight Foldable & Portable Stereo Bass Headphones with 1.45M No-Tangle, Wired Headphones for 

### Feature Engineering - Wireless

In [82]:
df['Description'] = df['Description'].str.lower()

In [83]:
def is_wireless(description):
    regexp = r'\bwireless\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [84]:
df['Wireless'] = df['Description'].apply(is_wireless)

### Feature Engineering - Noise Cancelling

In [85]:
def is_noise_cancel(description):
    regexp = r'\bnoise[-\s]?cancelling\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [86]:
df['Noise Cancelling'] = df['Description'].apply(is_noise_cancel)

### Feature Engineering - Colour


In [87]:
df['Description']

0         infurture active noise cancelling headphones, h1 wireless over ear bluetooth headphones, deep bass headset, low latency, memory foam ear cups,40h playtime, for adults, kids, tv, travel, home office
1       runolim hybrid active noise cancelling headphones, wireless over ear bluetooth headphones with microphone, 70h playtime, foldable wireless headphones with hifi audio, deep bass for home travel office
2                                                                                                                                                         sony mdrzx310l.ae foldable headphones - metallic blue
3                                                                                                                                                 sony mdr-zx110 overhead headphones - black , basic, pack of 1
4                     lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle, wired headphones for 

In [88]:
# Apply the function to each row in the Description column
df['Colour'] = df['Description'].apply(get_colour)

In [89]:
df['Colour'].value_counts()

Colour
Not Specified    190
black            110
blue              41
pink              31
green             20
white             16
gold              16
purple            11
red               10
grey               8
silver             4
orange             4
gray               3
beige              2
yellow             2
ivory              1
navy               1
violet             1
cyan               1
Name: count, dtype: int64

In [90]:
df['Colour'] = df['Colour'].replace('gray', 'grey')

In [91]:
df

Unnamed: 0,Product ID,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Colour
0,B08HDBZNZ9,"infurture active noise cancelling headphones, h1 wireless over ear bluetooth headphones, deep bass headset, low latency, memory foam ear cups,40h playtime, for adults, kids, tv, travel, home office",49.99,4.3 out of 5 stars,1,1,1,Not Specified
1,B0C8SJSL9H,"runolim hybrid active noise cancelling headphones, wireless over ear bluetooth headphones with microphone, 70h playtime, foldable wireless headphones with hifi audio, deep bass for home travel office",25.99,4.4 out of 5 stars,0,1,1,Not Specified
2,B00I3LUYNG,sony mdrzx310l.ae foldable headphones - metallic blue,13.00,4.5 out of 5 stars,0,0,0,blue
3,B00NBR70DO,"sony mdr-zx110 overhead headphones - black , basic, pack of 1",14.99,4.5 out of 5 stars,0,0,0,black
4,B083P1HG9S,"lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle, wired headphones for smartphone tablet mp3 / 4 (space black)",15.99,4.4 out of 5 stars,0,0,0,black
...,...,...,...,...,...,...,...,...
1337,B0836MR393,"usb headset with microphone noise cancelling & audio controls, wideband pc headphone for business uc skype lync softphone call center office computer, clearer voice, super light, ultra comfort",28.95,4.2 out of 5 stars,0,0,1,Not Specified
1393,B088D48HZV,"lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle,wired headphones for smartphone tablet mp3 / 4 (vitality orange)",21.24,4.4 out of 5 stars,1,0,0,orange
1429,B0CBTZCXK1,"qaekie bone conduction headphones - bluetooth 5.3 open ear headphones with hd mic,12hrs playtime deep bass sport wireless headphones,sweatproof bone headphones for running,cycling,hiking,driving",67.29,3.9 out of 5 stars,0,1,0,Not Specified
1493,B00DSTBMOS,jvc haf160g gumy ear bud headphone green,20.16,4.4 out of 5 stars,0,0,0,green


### Feature Engineering - Battery Life

In [92]:
def get_battery_life(description):
    regexp = r'(\b[1-9]\d*)\s*(battery|batteries|hours?|hrs?|h)'
    if re.search(regexp,description):
        # using .group to only get the int part of the regexp
        return re.search(regexp,description).group(1)    
        
    else:
        return 'Not Specified'

In [93]:
df['Battery Life'] = df['Description'].apply(get_battery_life)

----
**Comment:**

317 rows where battery life is not specified, to have a spot check at a few of these cases to see if regexp needs updating. 

In [94]:
pd.set_option('display.max_colwidth', None)  # Show full content of each cell

In [95]:
df[df['Battery Life']== 'Not Specified']['Description']

2                                                                                                                                                      sony mdrzx310l.ae foldable headphones - metallic blue
3                                                                                                                                              sony mdr-zx110 overhead headphones - black , basic, pack of 1
4                  lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle, wired headphones for smartphone tablet mp3 / 4 (space black)
7         oneodio dj headphones, over ear headphones for studio monitoring and mixing, professional headset with stereo bass sound, foldable headphones suitable for electric drum keyboard guitar amplifier
8       roxel rx-90 wired headphones with microphone - lightweight on ear headphones for android/ios devices - comfortable head cushion ergonomic - answer incoming calls - perfect 

-----
**Comment:**

Don't see anything that can be added to regexp pattern, will continue with analysis

### Feature Engineering - Microphone

In [97]:
def has_microphone(description):
    regexp = r'\b(mic?|microphone?)\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [98]:
df['Microphone'] = df['Description'].apply(has_microphone
                                           )

### Feature Engineering: Over Ear

In [99]:
def is_over_ear(description):
    regexp = r'\b(over[\s-]ear?|overhead?)\b'


    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [100]:
df['Over Ear'] = df['Description'].apply(is_over_ear)

In [101]:
df[df['Over Ear'] == 0]

Unnamed: 0,Product ID,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Colour,Battery Life,Microphone,Over Ear
2,B00I3LUYNG,sony mdrzx310l.ae foldable headphones - metallic blue,13.00,4.5 out of 5 stars,0,0,0,blue,Not Specified,0,0
6,B0CZ6T3BWM,"wireless earbuds, bluetooth 5.3 headphones with 4 enc noise canceling mic, 50h stereo dual led display ear buds, sport wireless earphones with earhooks, ip7 waterproof wireless headphones for running",19.99,4.7 out of 5 stars,0,1,0,Not Specified,3,1,0
8,B0C8V45ZF5,roxel rx-90 wired headphones with microphone - lightweight on ear headphones for android/ios devices - comfortable head cushion ergonomic - answer incoming calls - perfect for music lovers (black),12.99,4.3 out of 5 stars,0,0,0,black,Not Specified,1,0
11,B00I3LV336,"sony zx310ap on-ear headphones compatible with smartphones, tablets and mp3 devices - metallic black",18.99,4.4 out of 5 stars,0,0,0,black,Not Specified,0,0
15,B08HMWZBXC,"soundcore by anker q30 hybrid active noise cancelling headphones with multiple modes, hi-res sound, custom eq via app, 40h playtime, comfortable fit, bluetooth headphones, multipoint connection",79.99,4.5 out of 5 stars,1,0,1,Not Specified,40,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1243,B07C4Q2HMQ,"mmuss sleep ultra thin pillow speakers with mic, control button for sleep headphones. headband headphone replacement",14.66,3.8 out of 5 stars,0,0,0,Not Specified,Not Specified,1,0
1294,B0C8FM211N,spuzzo premium bone conduction open-ear bluetooth 5.1 sport headphones - waterproof wireless headphones for workout and running with built-in microphone and 16g memory mp3 (color : blue),98.99,1.0 out of 5 stars,0,1,0,blue,Not Specified,1,0
1337,B0836MR393,"usb headset with microphone noise cancelling & audio controls, wideband pc headphone for business uc skype lync softphone call center office computer, clearer voice, super light, ultra comfort",28.95,4.2 out of 5 stars,0,0,1,Not Specified,Not Specified,1,0
1429,B0CBTZCXK1,"qaekie bone conduction headphones - bluetooth 5.3 open ear headphones with hd mic,12hrs playtime deep bass sport wireless headphones,sweatproof bone headphones for running,cycling,hiking,driving",67.29,3.9 out of 5 stars,0,1,0,Not Specified,12,1,0


### Feature Engineering : Gaming 

In [102]:
def is_gaming(description):
    regexp = r'\bgaming\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [103]:
df['Gaming'] = df['Description'].apply(is_gaming)

### Feature Engineering : Foldable 

In [104]:
def is_foldable(description):
    regexp = r'\bfoldable\b'

    if re.search(regexp, description.lower()):
        return 1
    else:
        return 0

In [105]:
df['Foldable'] = df['Description'].apply(is_foldable)

### Feature Engineering : Brand

In [106]:
import spacy

In [107]:
def get_brand(description):
    # Load a pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Process the text
    doc = nlp(description)
    # Extract named entities
    for ent in doc.ents:
        if ent.label_ == "ORG":  # ORG for organisations/brands
            return ent.text
        else:
            return 'Unknown Brand'

In [108]:
df['Brand'] = df['Description'].apply(get_brand)

In [109]:
df

Unnamed: 0,Product ID,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Colour,Battery Life,Microphone,Over Ear,Gaming,Foldable,Brand
0,B08HDBZNZ9,"infurture active noise cancelling headphones, h1 wireless over ear bluetooth headphones, deep bass headset, low latency, memory foam ear cups,40h playtime, for adults, kids, tv, travel, home office",49.99,4.3 out of 5 stars,1,1,1,Not Specified,40,0,1,0,0,Unknown Brand
1,B0C8SJSL9H,"runolim hybrid active noise cancelling headphones, wireless over ear bluetooth headphones with microphone, 70h playtime, foldable wireless headphones with hifi audio, deep bass for home travel office",25.99,4.4 out of 5 stars,0,1,1,Not Specified,70,1,1,0,1,Unknown Brand
2,B00I3LUYNG,sony mdrzx310l.ae foldable headphones - metallic blue,13.00,4.5 out of 5 stars,0,0,0,blue,Not Specified,0,0,0,1,sony
3,B00NBR70DO,"sony mdr-zx110 overhead headphones - black , basic, pack of 1",14.99,4.5 out of 5 stars,0,0,0,black,Not Specified,0,1,0,0,sony
4,B083P1HG9S,"lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle, wired headphones for smartphone tablet mp3 / 4 (space black)",15.99,4.4 out of 5 stars,0,0,0,black,Not Specified,1,1,0,1,Unknown Brand
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1337,B0836MR393,"usb headset with microphone noise cancelling & audio controls, wideband pc headphone for business uc skype lync softphone call center office computer, clearer voice, super light, ultra comfort",28.95,4.2 out of 5 stars,0,0,1,Not Specified,Not Specified,1,0,0,0,usb
1393,B088D48HZV,"lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle,wired headphones for smartphone tablet mp3 / 4 (vitality orange)",21.24,4.4 out of 5 stars,1,0,0,orange,Not Specified,1,1,0,1,Unknown Brand
1429,B0CBTZCXK1,"qaekie bone conduction headphones - bluetooth 5.3 open ear headphones with hd mic,12hrs playtime deep bass sport wireless headphones,sweatproof bone headphones for running,cycling,hiking,driving",67.29,3.9 out of 5 stars,0,1,0,Not Specified,12,1,0,0,0,Unknown Brand
1493,B00DSTBMOS,jvc haf160g gumy ear bud headphone green,20.16,4.4 out of 5 stars,0,0,0,green,Not Specified,0,0,0,0,Unknown Brand


In [110]:
df['Brand'].value_counts()

Brand
Unknown Brand                            287
sony                                       6
c8                                         5
radio & wired                              4
rgb                                        4
usb                                        3
doqaus bluetooth headphones over           3
louise & mann                              3
jyps kids wireless                         2
samsung                                    2
betron                                     2
jyps                                       2
konnao kids                                2
android                                    2
apple & android                            1
philips tat8506wt                          1
jyps kids headphones                       1
wt/00                                      1
jyps kids wireless headphones              1
skullcandy hesh                            1
microphone & share jack                    1
mac                                        1
pana

-----
**Comment:**

My attempt at using SpaCy to extract the brand names from Product Descrption did not work too well. 

Majority of the data set containes brand names which are unknown and the ones it managed to find - most are not a brand and are common words in description like wireless/bluetooth or a colours. Therefore, I will drop this column from the dataframe.

In [111]:
df = df.drop(columns = ['Brand'], axis = 0)

## Final Clean Up
---

In [112]:
df.head(10)

Unnamed: 0,Product ID,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Colour,Battery Life,Microphone,Over Ear,Gaming,Foldable
0,B08HDBZNZ9,"infurture active noise cancelling headphones, h1 wireless over ear bluetooth headphones, deep bass headset, low latency, memory foam ear cups,40h playtime, for adults, kids, tv, travel, home office",49.99,4.3 out of 5 stars,1,1,1,Not Specified,40,0,1,0,0
1,B0C8SJSL9H,"runolim hybrid active noise cancelling headphones, wireless over ear bluetooth headphones with microphone, 70h playtime, foldable wireless headphones with hifi audio, deep bass for home travel office",25.99,4.4 out of 5 stars,0,1,1,Not Specified,70,1,1,0,1
2,B00I3LUYNG,sony mdrzx310l.ae foldable headphones - metallic blue,13.0,4.5 out of 5 stars,0,0,0,blue,Not Specified,0,0,0,1
3,B00NBR70DO,"sony mdr-zx110 overhead headphones - black , basic, pack of 1",14.99,4.5 out of 5 stars,0,0,0,black,Not Specified,0,1,0,0
4,B083P1HG9S,"lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle, wired headphones for smartphone tablet mp3 / 4 (space black)",15.99,4.4 out of 5 stars,0,0,0,black,Not Specified,1,1,0,1
5,B09PQSVFQT,"kvidio bluetooth headphones over ear, 65 hours playtime wireless headphones with microphone, foldable lightweight headset with deep bass,hifi stereo sound for travel work pc cellphone (black)",14.2,4.5 out of 5 stars,0,1,0,black,65,1,1,0,1
6,B0CZ6T3BWM,"wireless earbuds, bluetooth 5.3 headphones with 4 enc noise canceling mic, 50h stereo dual led display ear buds, sport wireless earphones with earhooks, ip7 waterproof wireless headphones for running",19.99,4.7 out of 5 stars,0,1,0,Not Specified,3,1,0,0,0
7,B07XRV1XWX,"oneodio dj headphones, over ear headphones for studio monitoring and mixing, professional headset with stereo bass sound, foldable headphones suitable for electric drum keyboard guitar amplifier",24.47,4.4 out of 5 stars,0,0,0,Not Specified,Not Specified,0,1,0,1
8,B0C8V45ZF5,roxel rx-90 wired headphones with microphone - lightweight on ear headphones for android/ios devices - comfortable head cushion ergonomic - answer incoming calls - perfect for music lovers (black),12.99,4.3 out of 5 stars,0,0,0,black,Not Specified,1,0,0,0
9,B086D1Y52Q,"iclever hs18 over ear headphones with microphone - lightweight stereo headphones, adjustable foldable wired headphones with 3.5mm jack for online class/meeting/pc/phone/computer",16.99,4.5 out of 5 stars,0,0,0,Not Specified,Not Specified,1,1,0,1


### Rating

To remove `out of 5 stars` as this information is redundant information.

In [113]:
df['Rating'] = df['Rating'].astype('str')

In [114]:
df['Rating'] = df['Rating'].str.replace('out of 5 stars', '')

In [115]:
df['Rating'] = df['Rating'].astype(float)

## Export to CSV
----

In [116]:
cleaned_df = df.copy()

In [117]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 472 entries, 0 to 1495
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Product ID        472 non-null    object 
 1   Description       472 non-null    object 
 2   Price             472 non-null    object 
 3   Rating            472 non-null    float64
 4   Is Prime          472 non-null    int64  
 5   Wireless          472 non-null    int64  
 6   Noise Cancelling  472 non-null    int64  
 7   Colour            472 non-null    object 
 8   Battery Life      472 non-null    object 
 9   Microphone        472 non-null    int64  
 10  Over Ear          472 non-null    int64  
 11  Gaming            472 non-null    int64  
 12  Foldable          472 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 51.6+ KB


In [118]:
cleaned_df = cleaned_df.reset_index(drop = True)

In [119]:
cleaned_df.to_csv('../../data/cleaned_headphones_data.csv')

In [120]:
cleaned_df['Price'].sort_values()

332    10.12
363    10.76
125    10.99
49     10.99
97     10.99
       ...  
466    98.99
169    99.00
202    99.99
132    99.99
249    99.99
Name: Price, Length: 472, dtype: object

In [121]:
cleaned_df.head(10)

Unnamed: 0,Product ID,Description,Price,Rating,Is Prime,Wireless,Noise Cancelling,Colour,Battery Life,Microphone,Over Ear,Gaming,Foldable
0,B08HDBZNZ9,"infurture active noise cancelling headphones, h1 wireless over ear bluetooth headphones, deep bass headset, low latency, memory foam ear cups,40h playtime, for adults, kids, tv, travel, home office",49.99,4.3,1,1,1,Not Specified,40,0,1,0,0
1,B0C8SJSL9H,"runolim hybrid active noise cancelling headphones, wireless over ear bluetooth headphones with microphone, 70h playtime, foldable wireless headphones with hifi audio, deep bass for home travel office",25.99,4.4,0,1,1,Not Specified,70,1,1,0,1
2,B00I3LUYNG,sony mdrzx310l.ae foldable headphones - metallic blue,13.0,4.5,0,0,0,blue,Not Specified,0,0,0,1
3,B00NBR70DO,"sony mdr-zx110 overhead headphones - black , basic, pack of 1",14.99,4.5,0,0,0,black,Not Specified,0,1,0,0
4,B083P1HG9S,"lorelei x6 over-ear headphones with microphone, lightweight foldable & portable stereo bass headphones with 1.45m no-tangle, wired headphones for smartphone tablet mp3 / 4 (space black)",15.99,4.4,0,0,0,black,Not Specified,1,1,0,1
5,B09PQSVFQT,"kvidio bluetooth headphones over ear, 65 hours playtime wireless headphones with microphone, foldable lightweight headset with deep bass,hifi stereo sound for travel work pc cellphone (black)",14.2,4.5,0,1,0,black,65,1,1,0,1
6,B0CZ6T3BWM,"wireless earbuds, bluetooth 5.3 headphones with 4 enc noise canceling mic, 50h stereo dual led display ear buds, sport wireless earphones with earhooks, ip7 waterproof wireless headphones for running",19.99,4.7,0,1,0,Not Specified,3,1,0,0,0
7,B07XRV1XWX,"oneodio dj headphones, over ear headphones for studio monitoring and mixing, professional headset with stereo bass sound, foldable headphones suitable for electric drum keyboard guitar amplifier",24.47,4.4,0,0,0,Not Specified,Not Specified,0,1,0,1
8,B0C8V45ZF5,roxel rx-90 wired headphones with microphone - lightweight on ear headphones for android/ios devices - comfortable head cushion ergonomic - answer incoming calls - perfect for music lovers (black),12.99,4.3,0,0,0,black,Not Specified,1,0,0,0
9,B086D1Y52Q,"iclever hs18 over ear headphones with microphone - lightweight stereo headphones, adjustable foldable wired headphones with 3.5mm jack for online class/meeting/pc/phone/computer",16.99,4.5,0,0,0,Not Specified,Not Specified,1,1,0,1


## Conclusion
-------

