### Exploratory Data Analysis of Clean Green Singapore (CGS) Recyclables Dataset

Source: https://www.cgs.gov.sg/recyclingsearch/data.json

In [1]:
from common import *
import bs4

# Initialise dataset
cgs_data = None
with open(recyclability_dataset_file, 'r') as file:
    cgs_data = file.read()
    cgs_data = json.loads(cgs_data)
    
# Initialise material class mapping
material_class_mapping = {0: 'Others', 1: 'Paper', 2: 'Plastic', 3: 'Glass', 4: 'Metal'}
inverse_material_class_mapping = {v: k for k, v in material_class_mapping.items()}

In [2]:
""" Data Preprocessing """
# Basic cleaning
df_cgs = df_cgs = pd.DataFrame(cgs_data, columns=['id', 'materialType', 'itemName', 'canBePlaced', 'description', 'suggestedWorkds', 'Material Class']).rename(columns={'id': 'ID', 'materialType': 'Material', 'itemName': 'Item', 'canBePlaced': 'Recyclability', 'description': 'Description', 'suggestedWorkds': 'Related Words'})
df_cgs = df_cgs[['Item', 'Material', 'Material Class', 'Recyclability', 'Description', 'Related Words']]
df_cgs.replace({'YES!': 1, 'NOT IN ME!': 0, 'NO!': 0}, inplace=True)
df_cgs.replace({'Non Food Plastic': 'Plastic', 'Non-Food Plastic': 'Plastic'}, inplace=True)
df_cgs.replace({'F&B Plastic': 'Plastic'}, inplace=True)

for index, row in df_cgs.iterrows():
    # Clean description
    description = row['Description']
    # Extract links
    match = re.match(r".+<a href=\'(.+)\' target.+>", description)
    if(match):
        description = description.replace("here", match.group(1))
    description = bs4.BeautifulSoup(description, "html.parser").get_text()
    description = ''.join([i for i in description.split('•')])
    description = '. '.join([i.strip().capitalize() for i in description.split('. ')])
    description = description.replace('  ', ' ')
    df_cgs.at[index, 'Description'] = description
    # Set material class
    df_cgs.at[index, 'Material Class'] = inverse_material_class_mapping[row['Material']]

# Update material class datatype
df_cgs['Material Class'] = df_cgs['Material Class'].astype(int)

# Get material-item mapping
dict_material_items = {}
for material in df_cgs['Material'].unique():
    sub_categories = list(df_cgs[df_cgs['Material'] == material]['Item'].unique())
    dict_material_items[material] = sub_categories

# Save dataframes
save_to_pickle(recyclability_file, df_cgs)
save_to_pickle(material_items_file, dict_material_items)

  df_cgs.replace({'YES!': 1, 'NOT IN ME!': 0, 'NO!': 0}, inplace=True)


In [3]:
df_cgs['Material'].value_counts()

Material
Others     122
Plastic     68
Paper       62
Glass       52
Metal       18
Name: count, dtype: int64

In [4]:
df_cgs

Unnamed: 0,Item,Material,Material Class,Recyclability,Description,Related Words
0,Printed paper (Glossy and non-glossy),Paper,1,1,Make sure it is clean before recycling.,[Printed paper (Glossy and non-glossy)]
1,Writing paper,Paper,1,1,Make sure it is clean before recycling.,[Writing paper]
2,Paper,Paper,1,1,Make sure it is clean before recycling.,"[Paper, Exam Papers, Notes, Notebook, Mail, Le..."
3,Clothes,Others,0,0,Clothes should be donated if they are in good ...,"[T-Shirt, Shorts, Pants, Skirt, Jacket, Shirts..."
4,Newspaper,Paper,1,1,Make sure it is clean before recycling.,[Newspaper]
...,...,...,...,...,...,...
317,Stationery,Others,0,0,Donate if it is in good condition.,"[marker, whiteboard marker, permanent Marker, ..."
318,Bulky waste,Others,0,0,Donate if it is in good condition or contact t...,"[Bulky waste, cushion, Fan, Couch, Sofa, Chair..."
319,Furniture,Others,0,0,Donate if it is in good condition or contact t...,"[Fan, Couch, Sofa, Chair, Coffee Table, Mattre..."
320,Sports shoes,Others,0,0,Repair or donate your used shoes as far as pos...,"[Sneakers, Nike, Adidas, Asics, Puma, Reebok, ..."


In [5]:
dict_material_items

{'Paper': ['Printed paper (Glossy and non-glossy)',
  'Writing paper',
  'Paper',
  'Newspaper',
  'Flyer (Glossy and non-glossy)',
  'Magazine (Glossy and non-glossy)',
  'Telephone directory',
  'Envelope (With and without plastic window)',
  'Plastic envelope',
  'Red packet',
  'Namecard',
  'Greeting card',
  'Shredded paper',
  'Paper receipt',
  'Carton box',
  'Cardboard box',
  'Printed paper box',
  'Paper egg trays',
  'Milk carton',
  'Drink packet',
  'Juice packet',
  'Paper towel tube',
  'Toilet roll tube',
  'Tissue box',
  'Paper bag',
  'Paper disposables',
  'Paper cup',
  'Paper plate',
  'Glitter paper',
  'Crayon drawing',
  'Tissue paper',
  'Paper towel',
  'Toilet paper',
  'Disposable wooden chopsticks',
  'Wooden chopsticks',
  'Pizza boxes',
  'Wax paper',
  'Paper packaging contaminated with food',
  'Paper packaging with food',
  'Newsletter',
  'Brochure (Glossy and non-glossy)',
  'Books',
  'Textbooks',
  'Calendar',
  'Gift wrapping paper',
  'Paper P