In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import boto3
import yaml
from io import BytesIO

In [8]:
def _split_s3_address(s3_address):
    """Split an ``s3://<bucket>/<key>`` address into ``(bucket_name, key)``.

    Raises:
        ValueError: if the address does not start with ``s3://`` or if either
            the bucket or the key part is empty.
    """
    scheme = 's3://'
    if not isinstance(s3_address, str) or not s3_address.startswith(scheme):
        raise ValueError(
            f"Expected an address of the form 's3://<bucket>/<key>', got {s3_address!r}"
        )

    # Only the first '/' after the scheme separates bucket from key; everything
    # after it (further slashes, or even the text 's3://') belongs to the key.
    bucket_name, _, key = s3_address[len(scheme):].partition('/')
    if not bucket_name or not key:
        raise ValueError(
            f"S3 address must contain both a bucket and a key, got {s3_address!r}"
        )
    return bucket_name, key


def extract_from_s3(s3_address, aws_credentials_path='aws_access.yaml'):
    """Download a CSV object from S3 and return it as a DataFrame.

    Args:
        s3_address: Object location in the form ``s3://<bucket>/<key>``.
        aws_credentials_path: Path to a YAML file providing the keys
            ``aws_access_key_id`` and ``aws_secret_access_key``.

    Returns:
        pandas.DataFrame parsed from the CSV, indexed by the file's first column.

    Raises:
        ValueError: if ``s3_address`` is not a well-formed S3 address. This is
            checked first, before the credentials file or the network is touched.
        KeyError: if the credentials file lacks one of the two expected keys.
    """
    # Fail fast on a malformed address instead of surfacing an opaque
    # IndexError / unpacking error from string splitting.
    bucket_name, key = _split_s3_address(s3_address)

    # Credentials live outside the notebook so they never end up in the source.
    with open(aws_credentials_path, 'r') as file:
        aws_credentials = yaml.safe_load(file)

    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_credentials['aws_access_key_id'],
        aws_secret_access_key=aws_credentials['aws_secret_access_key'],
    )

    # Read the whole object into memory and hand the bytes to pandas.
    response = s3.get_object(Bucket=bucket_name, Key=key)
    content = response['Body'].read()

    product_details_data = pd.read_csv(BytesIO(content))
    # The first CSV column holds the row labels, so promote it to the index.
    return product_details_data.set_index(product_details_data.columns[0])

# Where the raw product catalogue lives, and where the local credentials file is.
s3_address = 's3://data-handling-public/products.csv'
aws_credentials_path = 'aws_access.yaml'

# Load the catalogue into a DataFrame for the cleaning steps below.
products_list = extract_from_s3(
    s3_address,
    aws_credentials_path=aws_credentials_path,
)

In [9]:
# Preview the first rows to sanity-check that the file loaded as expected.
products_list.head()

Unnamed: 0_level_0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,FurReal Dazzlin' Dimples My Playful Dolphin,£39.99,1.6kg,toys-and-games,7425710935115,2005-12-02,83dc0a69-f96f-4c34-bcb7-928acae19a94,Still_avaliable,R7-3126933h
1,Tiffany's World Day Out At The Park,£12.99,0.48kg,toys-and-games,487128731892,2006-01-09,712254d7-aea7-4310-aff8-8bcdd0aec7ff,Still_avaliable,C2-7287916l
2,Tiffany's World Pups Picnic Playset,£7.00,590g,toys-and-games,1945816904649,1997-03-29,b089ef6f-b628-4e37-811d-fffe0102ba64,Still_avaliable,S7-1175877v
3,Tiffany's World Wildlife Park Adventures,£12.99,540g,toys-and-games,1569790890899,2013-03-20,d55de422-8b98-47d6-9991-e4bc4c5c0cb0,Removed,D8-8421505n
4,Cosatto Cosy Dolls Pram,£30.00,1.91kg,toys-and-games,7142740213920,2007-12-23,7945b657-cb02-4cc5-96cf-f65ed0a8f235,Still_avaliable,B6-2596063a


In [10]:
# Summary statistics for every column; include="all" also covers the non-numeric ones.
products_list.describe(include="all")

Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
count,1849,1849,1849,1849,1849,1849,1849,1849,1849
unique,1021,132,483,10,1849,1704,1849,5,1849
top,Maine 3 Tier Corner Shelving - White,£25.00,2.9kg,homeware,7425710935115,2007-01-15,83dc0a69-f96f-4c34-bcb7-928acae19a94,Still_avaliable,R7-3126933h
freq,12,113,30,1138,1,3,1,1752,1


In [27]:
# Column dtypes and non-null counts.
products_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1850 entries, 0 to 1852
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_name   1849 non-null   object
 1   product_price  1849 non-null   object
 2   weight         1849 non-null   object
 3   category       1849 non-null   object
 4   EAN            1849 non-null   object
 5   date_added     1849 non-null   object
 6   uuid           1849 non-null   object
 7   removed        1849 non-null   object
 8   product_code   1849 non-null   object
dtypes: object(9)
memory usage: 144.5+ KB


In [26]:
# Distinct values of the per-row duplicate flag: if only False appears,
# no row is an exact repeat of an earlier one.
products_list.duplicated().unique()


array([False])

In [25]:
# Collect every row that exactly repeats an earlier row and print the result
# (an empty frame means there is nothing to deduplicate).
duplicated_rows = products_list.loc[products_list.duplicated()]
print(duplicated_rows)

Empty DataFrame
Columns: [product_name, product_price, weight, category, EAN, date_added, uuid, removed, product_code]
Index: []


In [23]:
# Drop exact duplicate rows, keeping the first occurrence of each.
products_list = products_list.drop_duplicates()

In [24]:
# Re-run the summary to see the effect of dropping duplicates.
products_list.describe(include="all")

Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
count,1849,1849,1849,1849,1849,1849,1849,1849,1849
unique,1021,132,483,10,1849,1704,1849,5,1849
top,Maine 3 Tier Corner Shelving - White,£25.00,2.9kg,homeware,7425710935115,2007-01-15,83dc0a69-f96f-4c34-bcb7-928acae19a94,Still_avaliable,R7-3126933h
freq,12,113,30,1138,1,3,1,1752,1


In [28]:
# Show every row that has a missing value in at least one column.
products_list[products_list.isna().any(axis=1)]

Unnamed: 0_level_0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
266,,,,,,,,,


In [29]:
# Remove rows that contain any missing value.
products_list = products_list.dropna()

In [30]:
# Confirm that no rows with missing values remain (expect an empty frame).
products_list[products_list.isna().any(axis=1)]

Unnamed: 0_level_0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [32]:
# Flag rows whose EAN contains anything other than numeric characters.
non_numeric_mask = ~products_list['EAN'].str.isnumeric()
rows_with_non_numeric = products_list.loc[non_numeric_mask]

# Print the flagged rows so they can be inspected before they are removed.
print(rows_with_non_numeric)

           product_name product_price      weight    category         EAN  \
Unnamed: 0                                                                  
751          VLPCU81M30    XCD69KUI0K  9GO9NZ5JTL  S1YB74MLMJ  OO7KH8P79I   
1133         9SX4G65YUX    N9D2BZQX63  Z8ZTDGUZVU  C3NCA2CL35  E8EOGWOY8S   
1400         LB3D71C025    ODPMASE7V7  MX180RYSHX  WVPMHZP59U  BHPF2JTNKQ   

            date_added        uuid     removed product_code  
Unnamed: 0                                                   
751         CCAVRB79VV  7QB0Z9EW1G  T3QRRH7SRP   SDAV678FVD  
1133        09KREHTMWL  CP8XYQVGGU  BPSADIOQOK  BSDTR67VD90  
1400        PEPWA0NCVH  VIBLHHVPMN  H5N71TV8AY   OPSD21HN67  


In [33]:
# Keep only the rows whose EAN is purely numeric. The filter is recomputed here
# instead of reusing `non_numeric_mask` from another cell, so the cell gives the
# same result when re-run or run out of order against the current frame.
products_list = products_list[products_list['EAN'].str.isnumeric()]

In [34]:
# Re-check for EANs with non-numeric characters now that the bad rows are dropped.
non_numeric_mask = ~products_list['EAN'].str.isnumeric()
rows_with_non_numeric = products_list.loc[non_numeric_mask]

# An empty frame here confirms the clean-up worked.
print(rows_with_non_numeric)

Empty DataFrame
Columns: [product_name, product_price, weight, category, EAN, date_added, uuid, removed, product_code]
Index: []


In [35]:
# Summary statistics after removing the rows with non-numeric EANs.
products_list.describe(include="all")

Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
count,1846,1846,1846,1846,1846,1846,1846,1846,1846
unique,1018,129,480,7,1846,1701,1846,2,1846
top,Maine 3 Tier Corner Shelving - White,£25.00,2.9kg,homeware,7425710935115,2001-10-20,83dc0a69-f96f-4c34-bcb7-928acae19a94,Still_avaliable,R7-3126933h
freq,12,113,30,1138,1,3,1,1752,1


In [36]:
# Missing-value count per column.
products_list.isnull().sum()

product_name     0
product_price    0
weight           0
category         0
EAN              0
date_added       0
uuid             0
removed          0
product_code     0
dtype: int64

In [38]:
# Distinct labels present in the `category` column.
products_list.category.unique()

array(['toys-and-games', 'sports-and-leisure', 'pets', 'homeware',
       'health-and-beauty', 'food-and-drink', 'diy'], dtype=object)