# Setup

- Load Dependancies 
- Import Datasets 
- Cleanse Whitespaces

In [1]:
import pandas as pd
import os

In [2]:
currys_products = pd.read_csv('../Resources/Website_List/Currys_Product.csv', skipinitialspace = True)
very_products = pd.read_csv('../Resources/Website_List/Very_Product.csv', skipinitialspace = True)

# Data Cleanse and Exploration Process

## All Currys Products

In [3]:
# Print currys_products csv as dataframe

currys_products

Unnamed: 0,"LG OLED48C14LB 48"" Smart 4K Ultra HD HDR OLED TV with Google Assistant & Amazon Alexa",869.0,4.7,239.0,TV
0,"SAMSUNG QE50QN94AATXXU 50"" Smart 4K Ultra HD H...",799.00,4.45,172.0,TV
1,"SAMSUNG QE65QN95AATXXU 65"" Smart 4K Ultra HD H...",1499.00,4.45,142.0,TV
2,"LG OLED42C24LA 42"" Smart 4K Ultra HD HDR OLED ...",1299.00,,,TV
3,"SAMSUNG QE55QN700ATXXU 55"" Smart 8K HDR Neo QL...",1199.00,4.05,12.0,TV
4,"SAMSUNG The Frame QE43LS03AAUXXU 43"" Smart 4K ...",699.00,4.30,128.0,TV
...,...,...,...,...,...
5132,FRESH N REBEL Rockbox Bold XS Portable Bluetoo...,38.99,4.00,1.0,TV
5133,FLEXSON FLXS1FS2011EU SONOS One Floorstand Fix...,89.00,4.65,3.0,Phones
5134,FLEXSON S1-SPC Sonos Speaker Power Adapter - B...,9.99,,,Phones
5135,AUDIO PRO D-1 Wireless Multi-room Speaker - Black,350.00,,,Phones


In [4]:
# Insert columns titles for dataframe

currys_products.columns =['product_name', 'price', 'customer_rating', 'rating_count', 'category']

In [5]:
# Organise column positions

currys_products = currys_products[['product_name', 'category','price', 'customer_rating', 'rating_count']]

In [6]:
# Preview dataframe

currys_products.head()

Unnamed: 0,product_name,category,price,customer_rating,rating_count
0,"SAMSUNG QE50QN94AATXXU 50"" Smart 4K Ultra HD H...",TV,799.0,4.45,172.0
1,"SAMSUNG QE65QN95AATXXU 65"" Smart 4K Ultra HD H...",TV,1499.0,4.45,142.0
2,"LG OLED42C24LA 42"" Smart 4K Ultra HD HDR OLED ...",TV,1299.0,,
3,"SAMSUNG QE55QN700ATXXU 55"" Smart 8K HDR Neo QL...",TV,1199.0,4.05,12.0
4,"SAMSUNG The Frame QE43LS03AAUXXU 43"" Smart 4K ...",TV,699.0,4.3,128.0


In [7]:
# View dataframe types 

currys_products.dtypes

product_name        object
category            object
price              float64
customer_rating    float64
rating_count       float64
dtype: object

In [8]:
# Replace NaN values in Rating Count

currys_products['rating_count'] = currys_products['rating_count'].fillna(0)

In [9]:
# Preview dataframe

currys_products.head()

Unnamed: 0,product_name,category,price,customer_rating,rating_count
0,"SAMSUNG QE50QN94AATXXU 50"" Smart 4K Ultra HD H...",TV,799.0,4.45,172.0
1,"SAMSUNG QE65QN95AATXXU 65"" Smart 4K Ultra HD H...",TV,1499.0,4.45,142.0
2,"LG OLED42C24LA 42"" Smart 4K Ultra HD HDR OLED ...",TV,1299.0,,0.0
3,"SAMSUNG QE55QN700ATXXU 55"" Smart 8K HDR Neo QL...",TV,1199.0,4.05,12.0
4,"SAMSUNG The Frame QE43LS03AAUXXU 43"" Smart 4K ...",TV,699.0,4.3,128.0


In [10]:
# Save cleansed data to new CSV

currys_products.to_csv('../Clean_Datasets/All_Currys_Products.csv', index=False)

## All Very Products

In [11]:
# Print very_products csv as dataframe

very_products

Unnamed: 0,TV,"LG OLED48C14LB 48"" Smart 4K Ultra HD HDR OLED TV with Google Assistant & Amazon Alexa",869.0,4.7,346.0
0,TV,,,,
1,TV,,,,
2,TV,,,,
3,TV,"HISENSE 55A7GQTUK 55"" Smart 4K Ultra HD HDR QL...",449.0,4.6,115.0
4,TV,,,,
...,...,...,...,...,...
2345,Tumble Dryers,,,,
2346,Tumble Dryers,HOTPOINT H3 D81WB UK 8 kg Condenser Tumble Dry...,359.0,4.4,5.0
2347,Tumble Dryers,,,,
2348,Tumble Dryers,,,,


In [12]:
# Insert columns titles for dataframe

very_products.columns =['category','product_name', 'price', 'customer_rating', 'rating_count']

In [13]:
# Preview dataframe

very_products.head()

Unnamed: 0,category,product_name,price,customer_rating,rating_count
0,TV,,,,
1,TV,,,,
2,TV,,,,
3,TV,"HISENSE 55A7GQTUK 55"" Smart 4K Ultra HD HDR QL...",449.0,4.6,115.0
4,TV,,,,


In [14]:
# Organise column positions

very_products = very_products[['product_name', 'category', 'price', 'customer_rating', 'rating_count']]

In [15]:
# Preview dataframe

very_products.head()

Unnamed: 0,product_name,category,price,customer_rating,rating_count
0,,TV,,,
1,,TV,,,
2,,TV,,,
3,"HISENSE 55A7GQTUK 55"" Smart 4K Ultra HD HDR QL...",TV,449.0,4.6,115.0
4,,TV,,,


In [16]:
# Drop NaN values in dataset

#very_products= very_products.dropna(subset=['Product Name', 'Price (£)', 'Customer Rating', 'Rating Count'], how='all')

very_products= very_products.dropna(subset=['product_name'], how='all')
very_products = very_products.reset_index(drop=True)
very_products

Unnamed: 0,product_name,category,price,customer_rating,rating_count
0,"HISENSE 55A7GQTUK 55"" Smart 4K Ultra HD HDR QL...",TV,449.0,4.6,115.0
1,"LG OLED83C14LA 83"" Smart 4K Ultra HD HDR OLED ...",TV,3999.0,4.8,284.0
2,"LG OLED65C14LB 65"" Smart 4K Ultra HD HDR OLED ...",TV,1399.0,4.8,445.0
3,"LG OLED55A16LA 55"" Smart 4K Ultra HD HDR OLED ...",TV,719.0,4.8,197.0
4,"HISENSE 50A6GTUK 50"" Smart 4K Ultra HD HDR LED...",TV,279.0,4.5,77.0
...,...,...,...,...,...
270,SAMSUNG Series 5 DV90TA040AX/EU 9 kg Heat Pump...,Tumble Dryers,699.0,3.7,9.0
271,SAMSUNG Series 5 DV80TA020AX/EU 8 kg Heat Pump...,Tumble Dryers,719.0,5.0,8.0
272,HOTPOINT H2 D81W UK 8 kg Condenser Tumble Drye...,Tumble Dryers,289.0,4.8,8.0
273,SAMSUNG Series 5 DV90TA040AE/EU 9 kg Heat Pump...,Tumble Dryers,649.0,4.2,12.0


In [17]:
# View dataframe types 

very_products.dtypes

product_name        object
category            object
price              float64
customer_rating    float64
rating_count       float64
dtype: object

In [18]:
# Save cleansed data to new CSV

very_products.to_csv('../Clean_Datasets/All_Very_Products.csv', index=False)