In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw CSV and report its size.
df_original = pd.read_csv('../input/chemicals-in-cosmetics/chemicals-in-cosmetics.csv')
print('The original database shape:', df_original.shape)

# Drop rows that duplicate an earlier row in every column (the first occurrence is
# kept); `df` is the frame the rest of the notebook works with.
df = df_original.drop_duplicates(keep='first')
print('Database without duplicates:', df.shape)


In [None]:
# Preview the first five rows of the de-duplicated table.
df.head(n=5)

## Investigating chemical counts

In [None]:
# Number of distinct (non-missing) chemical names in the de-duplicated table.
df['ChemicalName'].nunique()

In [None]:
# Summary statistics of the 'ChemicalCount' column.
df.loc[:, 'ChemicalCount'].describe()

On average, products contain at least one chemical. Notice that there are products with 0 chemicals, and products with as many as 9 reported chemicals.

Let's first investigate products where 'ChemicalCount'=0.

In [None]:
# Preview the rows whose 'ChemicalCount' is zero.
df[df['ChemicalCount'].eq(0)].head(n=5)


In [None]:
 # when the result is False, there are no NaN values
# Select rows and column in a single .loc call (no chained indexing) and use
# .any(), which states the intent directly and yields False rather than NaN
# when no row has ChemicalCount == 0.
df.loc[df['ChemicalCount'] == 0, 'ChemicalDateRemoved'].isna().any()

In [None]:
# Keep only rows that report at least one chemical and have no discontinuation date.
has_chemicals = df['ChemicalCount'].gt(0)
not_discontinued = df['DiscontinuedDate'].isna()
df_n0 = df.loc[has_chemicals & not_discontinued]

In [None]:
# Rows of the filtered frame that report nine chemicals. The mask is built from
# df_n0 itself so it shares df_n0's index; a mask taken from the parent frame
# `df` only works through implicit index alignment.
df_n0.loc[df_n0['ChemicalCount'] == 9]

It turns out that there is only one such product, with each of its chemicals reported separately.

The following code generates a bar chart showing the number of products for each number of chemicals. When counting products, different colors, scents and/or flavors of the same product are ignored (e.g. 'Professional Eyeshadow Base' can be beige or bright, but it is counted only once, under the identification number 'CDPHId'=26).

In [None]:
# All rows of the filtered frame that belong to product CDPHId 26. The mask is
# built from df_n0 itself rather than from the parent frame `df`, so no implicit
# index alignment is needed.
df_n0.loc[df_n0['CDPHId'] == 26]

In [None]:
import matplotlib.pyplot as plt

# Number of distinct products (unique CDPHId) for each value of ChemicalCount.
# The column is selected before aggregating so that nunique() is computed for
# CDPHId only, not for every column of the frame.
data = df_n0.groupby('ChemicalCount')['CDPHId'].nunique()

fig, ax = plt.subplots(figsize=(9, 7))
ax.bar(data.index, data.values, log=True, align='center', alpha=0.5, edgecolor='k')

# Show only the left and bottom axis lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xticks(np.arange(1, 10))  # one tick for each chemical count from 1 to 9

# Write the exact product count just above each bar (the y axis is logarithmic,
# so bar heights alone are hard to read).
for x, y in zip(data.index, data.values):
    ax.annotate(y, (x, y), textcoords="offset points", xytext=(0, 4), ha='center')

ax.set_title('Number of reported products containing chemicals', fontsize=15)
ax.title.set_position([.5, 1.05])
ax.set_xlabel('Number of chemicals', fontsize=12)
ax.set_ylabel('Number of products (log scale)', fontsize=12)

plt.show()


In [None]:
# Restrict the filtered frame to rows whose primary category is 'Baby Products'
# and preview the first five of them.
is_baby_product = df_n0['PrimaryCategory'].eq('Baby Products')
baby_prod = df_n0.loc[is_baby_product]
baby_prod.head(n=5)

In [None]:
# Count how many baby-product rows report each chemical, most frequent first,
# and print the resulting table.
baby_prod_chem = baby_prod.loc[:, 'ChemicalName'].value_counts(sort=True, ascending=False)
print(baby_prod_chem)

In [None]:
# Replace the long chemical name with a short label for plotting, keeping the
# full name in `long_text` for use as a footnote.
#
# The counts are recomputed from `baby_prod` and renamed without mutation, so
# this cell is idempotent: re-running it always recovers the original long name
# instead of reading back the already-shortened label.
chem_counts_raw = baby_prod['ChemicalName'].value_counts()
# Third most frequent name. NOTE(review): this position is data-dependent —
# confirm it still points at the intended chemical if the input data changes.
long_text = chem_counts_raw.index[2]
print('Old chemical name: ', long_text)
print()
baby_prod_chem = chem_counts_raw.rename({long_text: 'Retinol *'})
print('New chemical name: ', baby_prod_chem.index[2])

In [None]:
# Horizontal bar chart: one bar per chemical, bar length = number of baby products.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(baby_prod_chem.index, baby_prod_chem.values, color='red', alpha=0.6)

# Dashed vertical guide lines at the x ticks.
ax.xaxis.grid(linestyle='--', linewidth=0.5)

# Write each count just to the right of the end of its bar.
for chem_name, n_products in zip(baby_prod_chem.index, baby_prod_chem.values):
    ax.annotate(n_products, (n_products, chem_name),
                textcoords="offset points", xytext=(4, 0), va='center')

ax.set_title('Chemicals in baby products', fontsize=15)
ax.title.set_position([0.5, 1.02])
ax.set_xlabel('Number of baby products', fontsize=12)
ax.set_xticks([0, 5, 10, 15])

# Footnote in axes coordinates, below the plot, giving the full name behind the
# shortened label.
ax.text(-0.15, -0.2, "* " + long_text, size=12, transform=ax.transAxes)

plt.show()

In [None]:
# Table of the reported baby products, sorted by product type, with
# reader-friendly column headers.
reported_baby_prod = (
    baby_prod[['ProductName', 'CompanyName', 'SubCategory']]
    .sort_values('SubCategory')
    .rename(columns={
        'ProductName': 'Baby product',
        'CompanyName': 'Company',
        'SubCategory': 'Type of product',
    })
)

# Display the table without its row index. Styler.hide_index() was deprecated in
# pandas 1.4 and removed in 2.0 in favour of Styler.hide(axis='index'); fall back
# to the old method only when the new one is unavailable.
styler = reported_baby_prod.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()