In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor

chocolate = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-18/chocolate.csv')

#remove percentage and convert to float
chocolate['cocoa_percent'] = chocolate['cocoa_percent'].str[:-1].astype(float) 

#get the number of ingredients and remove null values
chocolate['n_ingredients'] = chocolate['ingredients'].str[0]
chocolate = chocolate[~chocolate['ingredients'].isna()]
chocolate['n_ingredients'] = chocolate['n_ingredients'].astype(int)

#create a column if the chocolate contains sugar
chocolate['ingredients'] = chocolate['ingredients'].str[2:].str.strip()
chocolate['ingredients'] = chocolate['ingredients'].str[2:]
chocolate['contains_S'] = (chocolate['ingredients'].str.split(',').str[0]=='S').astype(int)

#create a column if the chocolate contains other sweeteners
chocolate.loc[chocolate['ingredients'].str.split(',').str[0]=='S', 'ingredients'] = chocolate['ingredients'].str[2:]
chocolate['contains_S*'] = (chocolate['ingredients'].str.split(',').str[0]=='S*').astype(int)

#create a column if the chocolate cointains Cocoa Butter
chocolate.loc[chocolate['ingredients'].str.split(',').str[0]=='S*', 'ingredients'] = chocolate['ingredients'].str[3:]
chocolate['contains_C'] = (chocolate['ingredients'].str.split(',').str[0]=='C').astype(int)

#create a column if the chocolate contains Vanilla
chocolate.loc[chocolate['ingredients'].str.split(',').str[0]=='C', 'ingredients'] = chocolate['ingredients'].str[2:]
chocolate['contains_V'] = (chocolate['ingredients'].str.split(',').str[0]=='V').astype(int)

#create a column if the chocolate contains Lecithin
chocolate.loc[chocolate['ingredients'].str.split(',').str[0]=='V', 'ingredients'] = chocolate['ingredients'].str[2:]
chocolate['contains_L'] = (chocolate['ingredients'].str.split(',').str[0]=='L').astype(int)

#create a column if the chocolate contains Salt
chocolate.loc[chocolate['ingredients'].str.split(',').str[0]=='L', 'ingredients'] = chocolate['ingredients'].str[2:]
chocolate['contains_Sa'] = (chocolate['ingredients'].str.split(',').str[0]=='Sa').astype(int)

In [2]:
chocolate

Unnamed: 0,ref,company_manufacturer,company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,ingredients,most_memorable_characteristics,rating,n_ingredients,contains_S,contains_S*,contains_C,contains_V,contains_L,contains_Sa
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,,"rich cocoa, fatty, bready",3.25,3,1,0,1,0,0,0
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,,"cocoa, vegetal, savory",3.50,3,1,0,1,0,0,0
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,,"cocoa, blackberry, full body",3.75,3,1,0,1,0,0,0
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68.0,,"chewy, off, rubbery",3.00,3,1,0,1,0,0,0
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72.0,,"fatty, earthy, moss, nutty,chalky",3.00,3,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1205,Zotter,Austria,2014,Blend,Raw,80.0,Sa,"waxy, cloying, vegetal",2.75,4,0,1,1,0,0,1
2526,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75.0,,"strong nutty, marshmallow",3.75,3,1,0,1,0,0,0
2527,2036,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75.0,,"fatty, earthy, cocoa",3.00,3,1,0,1,0,0,0
2528,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70.0,,"fatty, mild nuts, mild fruit",3.25,3,1,0,1,0,0,0


In [3]:
chocolate.columns

Index(['ref', 'company_manufacturer', 'company_location', 'review_date',
       'country_of_bean_origin', 'specific_bean_origin_or_bar_name',
       'cocoa_percent', 'ingredients', 'most_memorable_characteristics',
       'rating', 'n_ingredients', 'contains_S', 'contains_S*', 'contains_C',
       'contains_V', 'contains_L', 'contains_Sa'],
      dtype='object')

In [4]:
chocolate.dtypes

ref                                   int64
company_manufacturer                 object
company_location                     object
review_date                           int64
country_of_bean_origin               object
specific_bean_origin_or_bar_name     object
cocoa_percent                       float64
ingredients                          object
most_memorable_characteristics       object
rating                              float64
n_ingredients                         int32
contains_S                            int32
contains_S*                           int32
contains_C                            int32
contains_V                            int32
contains_L                            int32
contains_Sa                           int32
dtype: object

In [5]:
len(pd.unique(chocolate['company_manufacturer']))

542

In [6]:
chocolate.groupby(['company_manufacturer'])['rating'].mean().sort_values(ascending=True)

company_manufacturer
Ki' Xocolatl                            2.000000
Jacque Torres                           2.000000
Majani                                  2.000000
Love Bar                                2.000000
Casa                                    2.000000
                                          ...   
Idilio (Felchlin)                       3.775000
Patric                                  3.791667
Matale                                  3.812500
Heirloom Cacao Preservation (Zokoko)    3.875000
Ocelot                                  3.875000
Name: rating, Length: 542, dtype: float64

In [20]:
chocolate.groupby(['company_location'])['rating'].mean().sort_values(ascending=True)

company_location
Puerto Rico              2.625
India                    2.625
Wales                    2.750
Vanuatu                  2.750
St.Vincent-Grenadines    2.750
                         ...  
U.A.E.                   3.400
Sao Tome                 3.500
Poland                   3.500
Argentina                3.500
Chile                    3.750
Name: rating, Length: 67, dtype: float64

In [24]:
chocolate.groupby(['review_date'])['rating'].mean()

review_date
2006    3.062500
2007    3.166667
2008    3.038690
2009    3.088496
2010    3.188830
2011    3.271242
2012    3.206944
2013    3.214689
2014    3.197531
2015    3.243750
2016    3.238263
2017    3.364078
2018    3.191111
2019    3.134715
2020    3.256173
2021    3.320000
Name: rating, dtype: float64

In [25]:
chocolate.groupby(['country_of_bean_origin'])['rating'].mean().sort_values(ascending=True)

country_of_bean_origin
Puerto Rico              2.714286
Martinique               2.750000
St.Vincent-Grenadines    2.750000
Sierra Leone             2.750000
Principe                 2.750000
                           ...   
Thailand                 3.300000
Congo                    3.318182
Solomon Islands          3.450000
Sao Tome & Principe      3.500000
China                    3.500000
Name: rating, Length: 62, dtype: float64

In [18]:
# train test split
X = chocolate[['cocoa_percent', 'n_ingredients', 'contains_S', 'contains_S*', 'contains_C',
       'contains_V', 'contains_L', 'contains_Sa']]
y = chocolate['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

reg = KNeighborsRegressor(n_neighbors=40)

reg.fit(X_train, y_train)

y_test_pred = reg.predict(X_test)

In [19]:
from sklearn.metrics import r2_score
r2_score(y_test, y_test_pred)

0.023327648901971476