In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the chemicals-in-cosmetics CSV into a DataFrame and preview its first rows.
csv_path = '../input/chemicals-in-cosmetics/chemicals-in-cosmetics.csv'
chem = pd.read_csv(csv_path)
chem.head(n=5)

In [None]:
# Preview the last five rows of the frame.
chem.tail(n=5)

In [None]:
# Size of the frame as a (rows, columns) tuple.
(len(chem.index), len(chem.columns))

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
chem.describe().T

In [None]:
# Number of missing values in each column.
chem.isna().sum()

In [None]:
# Number of distinct company names (a missing value, if any, counts as one).
chem['CompanyName'].nunique(dropna=False)

In [None]:
# Number of distinct company ids (a missing value, if any, counts as one).
chem['CompanyId'].nunique(dropna=False)

In [None]:
# Distinct non-missing values per column.
chem.nunique(axis=0, dropna=True)

In [None]:
# Percentage of missing values in each column.
chem.isna().sum().div(len(chem)).mul(100)

In [None]:
import matplotlib.pyplot as plt
# Frequency of each reported chemical, one bar per chemical.
# A bar chart of value counts replaces plt.hist: hist places the names on a
# categorical axis and lumps them into 10 arbitrary bins, so its bar heights
# do not correspond to any single chemical.
chemical_freq = chem['ChemicalName'].value_counts()

fig, ax = plt.subplots(figsize=(15, 10))
chemical_freq.plot(kind='bar', ax=ax)
ax.set_xlabel('Chemical name')
ax.set_ylabel('Number of records')
ax.set_title('Records per reported chemical')
# Many distinct names share the axis; shrink the labels to limit overlap.
ax.tick_params(axis='x', labelsize=6)
plt.show()

In [None]:
import seaborn as sn

In [None]:
# Pie chart of the summed ChemicalCount for each chemical name.
# - Only ChemicalCount is aggregated; summing every column of the frame is
#   wasted work when a single column is plotted.
# - The Axes is created explicitly and handed to pandas. DataFrame.plot opens
#   a figure of its own when no `ax` is given, which left the 20x30 figure
#   created up front empty.
fig, ax = plt.subplots(figsize=(20, 30))
count_by_chemical = chem.groupby('ChemicalName')[['ChemicalCount']].sum()
count_by_chemical.plot(kind='pie', y='ChemicalCount', ax=ax)
plt.show()

In [None]:
# Number of distinct (non-missing) chemical names.
chem['ChemicalName'].nunique()

In [None]:
# Is any record with ChemicalCount == 0 missing its ChemicalDateRemoved?
# A single `.loc[rows, column]` lookup replaces the chained `.loc[...][...]`
# indexing, and `.any()` states the question directly: it returns a plain
# boolean even when no row has ChemicalCount == 0.
chem.loc[chem['ChemicalCount'] == 0, 'ChemicalDateRemoved'].isna().any()

In [None]:
# Bar chart: number of distinct products (CDPHId) for each ChemicalCount value.
# Only CDPHId is aggregated; there is no need to compute nunique for every
# column and then discard all but one.
data = chem.groupby('ChemicalCount')['CDPHId'].nunique()

fig, ax = plt.subplots(figsize=(9, 7))
ax.bar(data.index, data.values, align='center', alpha=0.5, edgecolor='k')

# Keep only the left and bottom axis lines and ticks.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
# One tick per ChemicalCount value actually present. The previous fixed 1..9
# range left any other value (e.g. ChemicalCount == 0, which other cells
# filter on) without a tick.
ax.set_xticks(list(data.index))

# Write each bar's product count just above it.
for chem_count, n_products in zip(data.index, data.values):
    ax.annotate(str(n_products), (chem_count, n_products),
                textcoords="offset points", xytext=(0, 4), ha='center')

# `y=` pins the title height; on recent matplotlib a later
# `ax.title.set_position(...)` is overridden by automatic title placement.
ax.set_title('Number of reported products containing chemicals', fontsize=15, y=1.05)
ax.set_xlabel('Number of chemicals', fontsize=12)
ax.set_ylabel('Number of products ', fontsize=12)

plt.show()

In [None]:
# Share (in %) of each chemical among records with ChemicalCount > 0,
# limited to the ten most frequent chemicals.
with_chemicals = chem[chem['ChemicalCount'] > 0]
top10_share = with_chemicals['ChemicalName'].value_counts(normalize=True).mul(100).head(10)

plt.figure(figsize=[15, 5])
top10_share.plot(kind='bar', color='green')

plt.title('Top 10 Reported Chemical', fontdict={'size':20, 'color':'blue'})
plt.xlabel('Chemical Name', fontdict={'size':15})
plt.ylabel('Percentage', fontdict={'size':15})

plt.grid()
plt.show()

In [None]:
# Number of records in each primary category, most frequent first.
primary_category = chem['PrimaryCategory']
primary_category.value_counts()

In [None]:
# Horizontal bar chart of the percentage of records in each primary category.
# The value counts are computed once and supply both the bar lengths and
# the category labels.
category_pct = chem['PrimaryCategory'].value_counts(normalize=True) * 100

plt.figure(figsize=[10, 5])
sn.barplot(x=category_pct, y=category_pct.index, palette='Set1')

plt.title('Distribution of Product over Primary Category', fontdict={'size':20, 'color':'royalblue'})
plt.xlabel('Percentage', fontdict={'size':15})
plt.ylabel('Primary Category', fontdict={'size':15})
plt.grid()
plt.show()

In [None]:
# Keep only the rows whose primary category is 'Baby Products', then preview them.
is_baby_product = chem['PrimaryCategory'].eq('Baby Products')
baby_prod = chem[is_baby_product]
baby_prod.head(n=5)

In [None]:
# Print how many baby-product records mention each chemical.
baby_chemical_counts = baby_prod['ChemicalName'].value_counts()
print('The chemicals in baby products are:\n\n', baby_chemical_counts)

In [None]:
# Per-chemical record counts among the baby products, most frequent first.
baby_prod_chem = baby_prod.loc[:, 'ChemicalName'].value_counts()

In [None]:
# Table of the reported baby products, sorted by sub-category, with
# reader-friendly column names. `rename` maps each column by name, so the
# labels cannot drift out of step with the selected columns.
reported_baby_prod = (
    baby_prod[['ProductName', 'CompanyName', 'SubCategory']]
    .sort_values('SubCategory')
    .rename(columns={
        'ProductName': 'Baby product',
        'CompanyName': 'Company',
        'SubCategory': 'Type of product',
    })
)

# Display the table without its row index.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
# Styler.hide(axis='index') is its replacement. The old method is used only
# on pandas versions that predate `hide`.
baby_prod_styler = reported_baby_prod.style
baby_prod_styler.hide(axis='index') if hasattr(baby_prod_styler, 'hide') else baby_prod_styler.hide_index()

In [None]:
# Horizontal bar chart: number of baby-product records per chemical.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(baby_prod_chem.index, baby_prod_chem.values, color='red', alpha=0.6)

ax.xaxis.grid(linestyle='--', linewidth=0.5)

# Write each bar's count just to the right of its end.
for n_products, chemical in zip(baby_prod_chem.values, baby_prod_chem.index):
    ax.annotate(str(n_products), (n_products, chemical),
                textcoords="offset points", xytext=(4, 0), va='center')

# `y=` pins the title height; on recent matplotlib a later
# `ax.title.set_position(...)` is overridden by automatic title placement.
ax.set_title('Chemicals in baby products', fontsize=15, y=1.02)
ax.set_xlabel('Number of baby products', fontsize=12)

# Ticks every 5 units up to the largest count, instead of a hard-coded
# 0..15 range that stops matching the bars if the counts change.
max_count = int(baby_prod_chem.max()) if len(baby_prod_chem) else 0
ax.set_xticks(np.arange(0, max_count + 1, 5))

# NOTE(review): this draws a lone "* " below the axes with no footnote text
# after it; kept as-is, confirm whether a footnote was intended.
ax.text(-0.15, -0.2, "* ", size=12, transform=ax.transAxes)

plt.show()