In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import re
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
chocodata = pd.read_csv('../input/flavors_of_cacao.csv')

In [None]:
chocodata.head()

**Data Preparation will be done in the below steps**

In [None]:
chocodata.columns

In [None]:
# Swap the original CSV headers for short identifiers; old and new names are paired by position
new_colnames = [
    'Company', 'BeanOriginBarName', 'REF', 'ReviewDate', 'Cocoa',
    'CompanyLocation', 'Rating', 'BeanType', 'BeanOrigin',
]
old_colnames = chocodata.columns
column_map = {old: new for old, new in zip(old_colnames, new_colnames)}
chocodata = chocodata.rename(columns=column_map)

In [None]:
chocodata.head()

In [None]:
#Converting Cocoa column to float
# astype(str) keeps this cell re-runnable: after the first run the column is already float,
# and calling .str on a float column raises AttributeError.
# regex=False makes the '%' removal an explicit literal replacement.
chocodata['Cocoa'] = (
    chocodata['Cocoa']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)
chocodata.head()

In [None]:
chocodata['CompanyLocation'].sort_values().unique()

In [None]:
#Fixing the issues in Company Location Names: misspellings, a city used in place of its
#country, and abbreviations.
# NOTE(review): the previous version also rewrote 'Belgium' to 'Germany'. Belgium is a
# distinct, correctly spelled location, so that rule was dropped; it merged the ratings of
# two different countries in every per-location chart below.
location_fixes = {
    'Eucador': 'Ecuador',
    'Amsterdam': 'Netherlands',
    'Domincan Republic': 'Dominican Republic',
    'Niacragua': 'Nicaragua',
    'U.K.': 'England',
    'U.S.A.': 'United States of America',
}
for wrong_name, right_name in location_fixes.items():
    # regex=False: the dots in 'U.K.' / 'U.S.A.' must match literally, not as regex wildcards
    chocodata['CompanyLocation'] = chocodata['CompanyLocation'].str.replace(
        wrong_name, right_name, regex=False
    )

In [None]:
chocodata['CompanyLocation'].sort_values().unique()

In [None]:
#Checking for data issues in Bean Origin
chocodata['BeanOrigin'].sort_values().unique()

In [None]:
#Finding No. of entries for each Bean Origin Location
chocodata['BeanOrigin'].value_counts().head()

In [None]:
#Finding no. of NULL values
chocodata['BeanOrigin'].isnull().value_counts()

In [None]:
#Identifying the record with NULL value in BeanOrigin
# isnull() already returns a boolean mask, so it is used directly without '== True'
chocodata[chocodata['BeanOrigin'].isnull()]

In [None]:
#Filling missing BeanOrigin values with the BeanOriginBarName value of the same row
#(fillna with a Series aligns on the index; rows that already have a BeanOrigin are unchanged)
chocodata['BeanOrigin'] = chocodata['BeanOrigin'].fillna(chocodata['BeanOriginBarName'])

In [None]:
chocodata['BeanOrigin'].isnull().value_counts()

In [None]:
chocodata['BeanOrigin'].sort_values().unique()

In [None]:
chocodata['BeanOrigin'].value_counts().head(10)

Many entries in the Bean Origin column have a blank value. There are also many records whose value is a comma-separated list of names; these are blends of cocoa beans from several origins used in the chocolates.

In [None]:
#Identifying only those with Comma separated names
chocodata[chocodata['BeanOrigin'].str.contains(',')]['BeanOrigin'].sort_values().unique()

In [None]:
chocodata[chocodata['BeanOrigin'].str.contains('/')]['BeanOrigin'].sort_values().unique()

In [None]:
chocodata[chocodata['BeanOrigin'].str.contains('&')]['BeanOrigin'].sort_values().unique()

In [None]:
# Distinct BeanOrigin values containing an opening parenthesis. The raw string keeps '\('
# as a regex escape; in a normal string literal '\(' is an invalid Python escape sequence.
chocodata[chocodata['BeanOrigin'].str.contains(r'\(')]['BeanOrigin'].sort_values().unique()

In [None]:
chocodata[chocodata['BeanOrigin'].str.contains('Ven$|Ven,|Venez,|Venez$')]['BeanOrigin'].sort_values().unique()

In [None]:
## Text preparation (correction) func
# Ordered (pattern, replacement) rules applied by txt_prep. Order matters: separator rules
# run first, then the abbreviation expansions, then the clean-up of the extra commas and
# spaces those expansions leave behind. Patterns are compiled once here instead of on every
# call, and regex escapes use raw strings so Python does not see '\(' or '\s' as (invalid)
# string escapes.
_TXT_PREP_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Turn every separator style into ', ' and drop closing parentheses
        ('-', ', '), ('/ ', ', '), ('/', ', '), (r'\(', ', '), (' and', ', '), (' &', ', '), (r'\)', ''),
        # Expand abbreviations and misspellings; 'X,|X$' only matches an abbreviation that is
        # followed by a comma or sits at the end of the string
        ('Dom Rep|DR|Domin Rep|Dominican Rep,|Domincan Republic', 'Dominican Republic'),
        ('Mad,|Mad$', 'Madagascar, '),
        ('PNG', 'Papua New Guinea, '),
        ('Guat,|Guat$', 'Guatemala, '),
        ('Ven,|Ven$|Venez,|Venez$', 'Venezuela, '),
        ('Ecu,|Ecu$|Ecuad,|Ecuad$', 'Ecuador, '),
        ('Nic,|Nic$', 'Nicaragua, '),
        ('Cost Rica', 'Costa Rica'),
        ('Mex,|Mex$', 'Mexico, '),
        ('Jam,|Jam$', 'Jamaica, '),
        ('Haw,|Haw$', 'Hawaii, '),
        ('Gre,|Gre$', 'Grenada, '),
        ('Tri,|Tri$', 'Trinidad, '),
        ('C Am', 'Central America'),
        ('S America', 'South America'),
        # Tidy up: trailing ', ', doubled spaces/commas, non-breaking spaces, then remove the
        # whitespace after every comma so names end up joined by a bare ','
        (', $', ''), (',  ', ', '), (', ,', ', '), ('\xa0', ' '), (r',\s+', ','),
        # The 'Dominican Rep,' rule above consumes its comma; presumably this rule restores
        # the separator for the '... Bali' case
        (' Bali', ',Bali'),
    )
)


def txt_prep(text):
    """Normalise a bean-origin string into comma-separated, fully spelled-out names.

    Applies every rule in ``_TXT_PREP_RULES`` in order with ``re.sub`` semantics.

    Parameters
    ----------
    text : str
        Raw origin text, e.g. ``'Carribean(DR/Jam/Tri)'``.

    Returns
    -------
    str
        The cleaned text, e.g. ``'Carribean,Dominican Republic,Jamaica,Trinidad'``.
        Non-string input (such as NaN or None) is returned unchanged instead of raising
        ``TypeError``, so the function is safe to use with ``Series.apply`` on columns that
        still contain missing values.
    """
    if not isinstance(text, str):
        return text
    for pattern, replacement in _TXT_PREP_RULES:
        text = pattern.sub(replacement, text)
    return text

In [None]:
# Preview of the cleaned values without modifying the frame.
# regex=False: '.' must be removed literally; as a regex it would match every character.
chocodata['BeanOrigin'].str.replace('.', '', regex=False).apply(txt_prep).unique()

In [None]:
# Strip literal dots (regex=False, so '.' is not treated as a regex wildcard) and then
# normalise the origin names with txt_prep
chocodata['BeanOrigin'] = chocodata['BeanOrigin'].str.replace('.', '', regex=False).apply(txt_prep)

In [None]:
chocodata['BeanOrigin'].sort_values().unique()

In [None]:
chocodata.head(10)

In [None]:
# Label each bar 'Blend' when its BeanOrigin contains a comma (several origins listed),
# otherwise 'Pure'
has_several_origins = chocodata['BeanOrigin'].str.contains(',')
chocodata['Isblend'] = np.where(has_several_origins, 'Blend', 'Pure')

In [None]:
#Verifying if the data is fine in the new column
chocodata[chocodata['BeanOrigin']=='Peru,Ecuador,Venezuela'].head()

In [None]:
#Verifying if the data is fine in the new column
chocodata[chocodata['BeanOrigin']=='Venezuela'].head()

In [None]:
chocodata['Isblend'].value_counts()

***After the clean-up, the final data looks as follows***

In [None]:
chocodata.head()

**Visualizing the Data**

In [None]:
chocodata.describe().T

In [None]:
chocodata.dtypes

In [None]:
# Histogram of all ratings, with the ratings from 3.0 up to (but not including) 4.0 drawn
# on top as two wide bins to highlight that band. Both series are drawn on one explicit
# Axes rather than relying on the second plot call reusing the current figure.
fig, ax = plt.subplots(figsize=(12, 4))
chocodata['Rating'].plot(kind='hist', bins=10, ax=ax, label='All ratings')
mid_band = chocodata.loc[(chocodata['Rating'] >= 3.0) & (chocodata['Rating'] < 4), 'Rating']
mid_band.plot(kind='hist', bins=2, ax=ax, label='Ratings in [3.0, 4.0)')
ax.set_xlabel('Rating', fontsize=14)
ax.set_ylabel('No. of Chocolate Bars', fontsize=14)
ax.legend()
plt.show()

**Observation: over the entire period covered by the data, the largest number of chocolates falls in the rating range of 3 to 3.75**

In [None]:
# Number of bars per bean composition, computed once and reused for the printout and chart
k = chocodata['Isblend'].value_counts()
print(k)
# pandas plot kinds are lowercase: current pandas rejects 'Bar' with
# "ValueError: Bar is not a valid plot kind"
k.plot(kind='bar', figsize=(14, 4))
plt.xlabel('Type of Bean used', fontsize=14)
plt.ylabel('No. of Chocolate Bars', fontsize=14)
plt.show()

**The above graph shows that blends make up only a small share of the chocolate bars**

In [None]:
# Mean rating for each bean composition (Blend / Pure): printed, then drawn as a bar chart
data1 = chocodata.groupby("Isblend")["Rating"].mean()
print(data1)
data1.plot(kind="bar")
plt.xlabel("Type of Bean (Blend/ Pure)", fontsize=14)
plt.ylabel("Mean Rating", fontsize=14)
plt.show()

**The Above plot shows that the Blend variant of Chocolates have a better rating as compared to the Chocolates with Pure cocoa beans**

In [None]:
# Tall, narrow figure: one horizontal box of ratings per company location.
# The axes created here are passed to seaborn explicitly rather than picked up implicitly.
f, ax = plt.subplots(figsize=(6, 16))
sns.boxplot(x="Rating", y="CompanyLocation", data=chocodata, ax=ax)

**The box plot shows that, for every company location, the ratings are spread over a range running from the minimum value up to about 3.75**

In [None]:
# Bar chart of the ten highest-rated rows, one bar per BeanOriginBarName, coloured by rating
plt.subplots(figsize = (14,4))
sns.barplot(data = chocodata.nlargest(10, "Rating"), x = "BeanOriginBarName", y = "Rating", hue = "Rating")
# Matplotlib legend locations are space-separated ("upper left"); "upper-left" is not a
# valid value. With bbox_to_anchor=(1, 1) the legend's upper-left corner is pinned to the
# axes' upper-right corner, so the legend sits outside the plot area.
plt.legend(loc = "upper left", bbox_to_anchor=(1,1))
plt.show()

**The above graph shows that, over the whole decade for which we have data, only 2 chocolates received a rating of 5, while the other chocolates reached a rating of only 4**

In [None]:
# Rating distribution per review year: one histogram row for each year from 2010 to 2016
# (both bounds inclusive)
review_year = chocodata['ReviewDate']
in_window = (review_year >= 2010) & (review_year <= 2016)
k = sns.FacetGrid(chocodata[in_window], row="ReviewDate", aspect=4)
k = k.map(plt.hist, 'Rating')

**From the above visualization, we can see that the quality of chocolates has been declining in recent years. The ratings of many chocolates have dropped from 4.0 to 3.75 and from 3.5 to 3.0, as shown by the increasing height of the bars between 3 and 3.75**

In [None]:
# Number of distinct BeanOriginBarName values observed at each rating.
# groupby(...).nunique() returns a Series indexed by Rating, so there are no columns to
# rename (assigning .columns on a Series has no effect) and no x=/y= to select when plotting;
# the Series is simply given the name 'NoofVal'.
data1 = chocodata.groupby("Rating")["BeanOriginBarName"].nunique()
data1.name = "NoofVal"
print(data1)
data1.plot(kind='bar', figsize=(18, 6), title="No. of Chocolate Bars by Rating")
# The x-axis holds the rating values (the index of data1), not years
plt.xlabel("Rating", fontsize=14)
plt.ylabel("NoofVal", fontsize=14)
plt.show()

**The above graph also shows that most chocolate bars fall in the rating range of 3 to 3.75. This indicates that the quality of the chocolates needs to improve**

In [None]:
# Cross-tabulate mean Rating: one row per CompanyLocation, one column per ReviewDate
flow = pd.crosstab(
    index=chocodata['CompanyLocation'],
    columns=chocodata['ReviewDate'],
    values=chocodata['Rating'],
    aggfunc='mean',
)

In [None]:
#replace NaN with 0
# Reassignment instead of inplace=True: the cell produces a new frame rather than mutating
# the crosstab in place. NOTE(review): a 0 here stands for a missing value, not for an
# actual rating of 0.
flow = flow.fillna(value=0)

In [None]:
#Creating a new column tot which is the MEAN (not the sum) of all values at row level
#(axis = 1 aggregates across the columns of each row)
# NOTE(review): if NaNs were replaced by 0 beforehand, those zeros are included in this mean
flow['tot'] = flow.mean(axis=1)

In [None]:
#Sorting the Table by tot, largest value first (ascending=False), then previewing the top rows
flow = flow.sort_values('tot', ascending=False)
flow.head()

In [None]:
#dropping the helper column tot; dropping a column does not reorder rows, so the existing
#row order of flow is kept
flow = flow.drop('tot', axis=1)

In [None]:
#plotting a heatmap of the first 20 rows of the flow dataset just created
fig, ax = plt.subplots(figsize=(10, 6))
# Pass the axes explicitly so the heatmap and the title are guaranteed to share one Axes
sns.heatmap(flow.head(20), cmap='RdBu_r', linewidths=.5, ax=ax)
# flow holds mean ratings indexed by company location with one column per review year;
# the previous title ("Goods Flow ...") did not describe this data
ax.set_title('Mean rating by company location and review year')
plt.show()

**The final observation, using correlation, is that the major reason for the low quality of chocolate could be the cocoa origin location. The above heat map shows that the ratings have been decreasing year on year**

**------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**