# Cocoa Beans Quality Analysis

# **1. Load the dataset and display top 2 values**

In [None]:
import pandas as pd
# Read the ratings CSV into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer='flavors.csv')
df.head(n=2)

# csv - comma separated values

#https://www.kaggle.com/rtatman/chocolate-bar-ratings

# **2. Remove unwanted characters in the column names and keep all the letters in lowercase**

In [None]:
# Swap any newline characters in the column names for underscores and
# lowercase the names, in a single chained pass.
df.columns = df.columns.str.replace('\n','_').str.lower()
df.columns

In [None]:
# Inspect the current name of the last column before renaming it.
df.columns[-1]

In [None]:
# Give the first two columns and the last column short, predictable names.
# The columns are addressed by position, so the original header text does
# not have to be spelled out here.
column_renames = {
    df.columns[0]: 'company',
    df.columns[1]: 'bean_origin',
    df.columns[-1]: 'broad_bean_origin',
}
df = df.rename(columns=column_renames)
df.head()

## **3. Which rating is the most popular for chocolates?**

In [None]:
# Method 2
# Count the rows for each distinct rating, then order the counts by rating
# value (ascending) instead of by frequency.
rating_counts = df['rating'].value_counts()
r = rating_counts.sort_index()
r

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the per-rating counts in `r`. The ratings are cast to strings
# so they are drawn as evenly spaced categories; the two colours alternate.
fig, ax = plt.subplots()
ax.bar(r.index.astype(str), r, color=['r', 'g'])
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
ax.set_title('Bar Graph')
plt.show()

In [None]:
import seaborn as sns
# Kernel density estimate of the rating column.
sns.kdeplot(df['rating'])

# **4. Checking for missing values**

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Check for np.where and use it with isnull()
# Show the bean_type column on its own as a one-column DataFrame.
df['bean_type'].to_frame()

In [None]:
# List the distinct values that occur in bean_type.
df['bean_type'].unique()
# \xa0 is the non-breaking space character in Latin-1 (ISO 8859-1)

In [None]:
# Frequency of each distinct bean_type value.
df['bean_type'].value_counts()

# Replace the non-breaking space with a missing value, then fill the missing values with the mode

In [None]:
# Treat the non-breaking-space placeholder as missing, then fill every
# missing entry with the most frequent remaining bean type.
bean_type_with_gaps = df['bean_type'].replace({'\xa0':None})
most_common_bean_type = bean_type_with_gaps.mode()[0]
df['bean_type'] = bean_type_with_gaps.fillna(most_common_bean_type)
df['bean_type']

# https://stackoverflow.com/questions/38117016/update-pandas-dataframe-with-str-replace-vs-replace

In [None]:
# Most frequent bean_type value(s); mode() returns a Series because ties are possible.
df['bean_type'].mode()

# **5. Corrections of words in the column by removing punctuations. Plot top 5 Bean Types**

In [None]:
# Keep the five most frequent bean types (value_counts orders by descending count).
bean_type_counts = df['bean_type'].value_counts()
b = bean_type_counts.head(n=5)
b

In [None]:
# Pandas Plotting
# Bar chart of the bean-type counts in `b`, with each bar annotated with its count.
ax = b.plot(kind='bar', figsize=(10, 4))

# `b` is indexed by bean-type name, so an integer key such as b[i] relies on
# pandas' deprecated positional fallback. enumerate() yields the bar position
# and the count directly, without any label lookup.
for position, count in enumerate(b):
    ax.text(position, count, str(count), fontsize=15, ha='center')

ax.set_xlabel('Cocoa Bean Type', fontsize=20)
ax.set_ylabel('Count')

# **6. Cocoa percent over the years**

In [None]:
# Convert strings such as '70%' into the fraction 0.70.
# The dtype guard keeps the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise, so the conversion is skipped on later runs.
if not pd.api.types.is_numeric_dtype(df['cocoa_percent']):
    df['cocoa_percent'] = (
        df['cocoa_percent']
        .str.replace('%', '', regex=False)  # literal '%', not a regex pattern
        .astype(float)
        / 100
    )
df['cocoa_percent']

In [None]:
# Scatter of cocoa_percent against rating; labelled so the figure can be
# read without looking at the code.
fig, ax = plt.subplots()
ax.scatter(df['rating'], df['cocoa_percent'])
ax.set_xlabel('Rating')
ax.set_ylabel('Cocoa percent')
ax.set_title('Cocoa percent vs. rating')
plt.show()

In [None]:
# Scatter of cocoa_percent against review_date; labelled so the figure can be
# read without looking at the code.
fig, ax = plt.subplots()
ax.scatter(df['review_date'], df['cocoa_percent'])
ax.set_xlabel('Review date')
ax.set_ylabel('Cocoa percent')
ax.set_title('Cocoa percent vs. review date')
plt.show()

In [None]:
# 1st Method
# Mean cocoa_percent for each review_date value from 2006 through 2017,
# computed one year at a time with a boolean mask.
cocoa = df[['review_date','cocoa_percent']]
years = range(2006,2018)
cocoa_avg = [
    cocoa.loc[cocoa['review_date'] == year, 'cocoa_percent'].mean()
    for year in years
]

# Draw the yearly means as points joined by a red line.
plt.figure(figsize=(15,3))
plt.scatter(years, cocoa_avg)
plt.plot(years, cocoa_avg, color='r')

In [None]:
# Groupby: pandas method that splits the rows into groups by a key and builds
# a DataFrame/Series from a per-group computation.
# Here: the mean cocoa_percent for each review_date, plotted as a line.
df1 = df.groupby('review_date')[['cocoa_percent']].mean()
df1.plot(figsize=(15,3))

Interestingly, in 2008 the cocoa percentage in chocolate was at its highest, and the average rating happened to be at its lowest.

The following year in 2009, the chocolate bars saw a steep decline in cocoa percentage, with an increase in average rating. This might indicate that chocolate bar producers decreased their cocoa content to make better chocolates.

# **7. Which company is mentioned multiple times?**

In [None]:
# Seaborn is an advanced data visualization library built on top of matplotlib
# Build a two-column frame of the five most frequent companies.
# The column names are set explicitly: pandas >= 2.0 names the value_counts()
# result 'count' and its index after the source column, so a bare
# reset_index() no longer produces the 'index' / 'company' columns that the
# plotting cell expects.
d = (
    df['company']
    .value_counts()
    .head(5)
    .rename_axis('index')           # company names -> column 'index'
    .reset_index(name='company')    # occurrence counts -> column 'company'
)
d

In [None]:
import seaborn as sns
# In `d`, column 'index' holds the company names and column 'company' holds
# their occurrence counts. Left alone, the axes would carry those raw column
# names, so the count axis would read "company". Label them explicitly.
ax = sns.barplot(x ='index',y='company',data=d)
ax.set_xlabel('Company')
ax.set_ylabel('Number of chocolate bars')
ax.set_title('Top 5 most frequently listed companies')

# **8. Consider Ratings and divide them into different categories and plot a pie chart**

In [None]:
label = ['Disappointing','Satisfactory',' Premium and Elite']

# Split the rows into three rating bands: below 2, from 2 up to (not
# including) 3, and 3 or higher.
rating = df['rating']
disapoint = df.loc[rating < 2]
satisfactory = df.loc[(rating >= 2.0) & (rating < 3.0)]
premium_elite = df.loc[rating >= 3.0]

# Slice sizes are the row counts of the three bands, in label order.
x = [len(band) for band in (disapoint, satisfactory, premium_elite)]
plt.figure(figsize=(5,5))
plt.pie(x, labels=label)
plt.show()

# **9. Feature Generation. We can classify the chocolates into dark and normal chocolates.**

In [None]:
# Label each row by cocoa content: a cocoa_percent of 0.7 or more is 'dark',
# everything else is 'normal'. Missing values compare False and so also end
# up as 'normal', the same outcome as a row-wise `x >= 0.7` check.
is_dark = df['cocoa_percent'] >= 0.7
df['chocolate_type'] = is_dark.map({True: 'dark', False: 'normal'})

# Spot-check some rows below the 0.7 threshold. This is the cell's last
# expression so it is actually displayed; the seed keeps the preview
# reproducible, and the size guard avoids an error when fewer than 10 rows qualify.
below_threshold = df[df['cocoa_percent'] < 0.7]
below_threshold.sample(n=min(10, len(below_threshold)), random_state=0)

In [None]:
# Number of rows in each chocolate_type category.
df['chocolate_type'].value_counts()

In [None]:
# Bar chart of the row count in each chocolate_type category.
chocolate_type_counts = df['chocolate_type'].value_counts()
chocolate_type_counts.plot.bar()

We have many more dark chocolates than normal ones.

# **10. Where the best cocoa beans are grown (based on rating)**

In [None]:
# The five broad bean origins that occur most often in the data.
countries = df['broad_bean_origin'].value_counts().index.tolist()[:5]

# For each of those origins, count the rows rated 3 or higher.
# Every origin gets an entry, including a count of 0, so the dict always lines
# up with the five bars and five colours below. (Assigning inside a per-row
# loop would silently drop an origin that has no qualifying rows.)
highly_rated = df[df['rating'] >= 3]
satisfactory = {
    country: int((highly_rated['broad_bean_origin'] == country).sum())
    for country in countries
}

# Code to visualize the countries that give best cocoa beans
li = satisfactory.keys()
plt.figure(figsize=(10,5))
plt.bar(
    range(len(satisfactory)),
    list(satisfactory.values()),  # materialise the dict view into a plain sequence
    align='center',
    color=['#a22a2a','#511515','#e59a9a','#d04949','#a22a2a'],
)
plt.xticks(range(len(satisfactory)), list(li))
plt.xlabel('\nCountry')
plt.ylabel('Number of chocolate bars')
# The filter is `>= 3`, so the title says "3.0 or above" rather than "above 3.0".
plt.title("Top 5 Broad origins of the Chocolate Beans with a Rating of 3.0 or above\n")
plt.show()

print(satisfactory)

## Exercise 1: Analyse the top chocolate bar producing countries (in terms of quantity)

## Exercise 2: A chocolate manufacturer needs to avoid bad cocoa beans in order to make quality chocolate products. Help them find and analyse the countries that produce the worst cocoa beans, based on ratings