In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from a CSV file located relative to the notebook's working directory.
choko = pd.read_csv('./flavors_of_cacao.csv')

In [3]:
# Replace the CSV's original headers with short names, paired up by position.
original_colnames = choko.columns
new_colnames = [
    'company',
    'species',
    'REF',
    'review_year',
    'cocoa_p',
    'company_location',
    'rating',
    'bean_typ',
    'country',
]
choko = choko.rename(
    columns={old: new for old, new in zip(original_colnames, new_colnames)}
)

# Convert the percentage text (e.g. "70%") into a float fraction.
choko['cocoa_p'] = choko['cocoa_p'].str.replace('%', '').astype(float).div(100)

# Rows with a missing country fall back to that row's `species` value.
choko['country'] = choko['country'].fillna(value=choko['species'])

In [4]:
# Ordered (compiled pattern, replacement) rules used by `txt_prep`.
# Order matters: the trailing whitespace/comma clean-up rules depend on the
# ", " fragments that the earlier expansion rules leave behind.
# Regex metacharacters are written as raw strings so Python does not treat
# `\(`, `\)` and `\s` as (invalid) string escape sequences.
_TXT_PREP_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        # Separators -> ", "
        ['-', ', '], ['/ ', ', '], ['/', ', '], [r'\(', ', '],
        [' and', ', '], [' &', ', '], [r'\)', ''],
        # Abbreviations and misspellings -> full names.
        # NOTE(review): 'Dominican Rep,' consumes the comma and 'DR' matches
        # anywhere in the text; kept as-is to preserve existing output.
        ['Dom Rep|DR|Domin Rep|Dominican Rep,|Domincan Republic', 'Dominican Republic'],
        ['Mad,|Mad$', 'Madagascar, '],
        ['PNG', 'Papua New Guinea, '],
        ['Guat,|Guat$', 'Guatemala, '],
        ['Ven,|Ven$|Venez,|Venez$', 'Venezuela, '],
        ['Ecu,|Ecu$|Ecuad,|Ecuad$', 'Ecuador, '],
        ['Nic,|Nic$', 'Nicaragua, '],
        ['Cost Rica', 'Costa Rica'],
        ['Mex,|Mex$', 'Mexico, '],
        ['Jam,|Jam$', 'Jamaica, '],
        ['Haw,|Haw$', 'Hawaii, '],
        ['Gre,|Gre$', 'Grenada, '],
        ['Tri,|Tri$', 'Trinidad, '],
        ['C Am', 'Central America'],
        ['S America', 'South America'],
        # Clean-up: drop a trailing ", ", collapse doubled separators, turn
        # non-breaking spaces into plain spaces, strip whitespace after commas.
        [', $', ''], [',  ', ', '], [', ,', ', '], ['\xa0', ' '], [r',\s+', ','],
        [' Bali', ',Bali'],
    ]
)


def txt_prep(text):
    """Normalise a free-text origin string into a comma-separated list.

    Applies every rule in ``_TXT_PREP_RULES`` in order with ``re.sub``
    semantics: separators become commas, known abbreviations are expanded,
    and leftover separator noise is removed.

    Parameters
    ----------
    text : str
        The raw string to clean.

    Returns
    -------
    str
        The cleaned string, with names separated by "," and no space after
        the comma.
    """
    for pattern, replacement in _TXT_PREP_RULES:
        text = pattern.sub(replacement, text)
    return text

In [5]:
# Strip literal dots (e.g. "Dom. Rep.") before normalising with txt_prep.
# regex=False is explicit because '.' is a regex wildcard and the default value
# of `regex` in Series.str.replace differs between pandas versions.
choko['country'] = choko['country'].str.replace('.', '', regex=False).apply(txt_prep)

In [6]:
# Harmonise company locations: map a city and an abbreviation to the country
# names used here and fix two misspellings.
# regex=False makes every pattern a literal string; otherwise the dots in
# 'U.K.' act as wildcards on pandas versions where `regex` defaults to True.
choko['company_location'] = (
    choko['company_location']
    .str.replace('Amsterdam', 'Holland', regex=False)
    .str.replace('U.K.', 'England', regex=False)
    .str.replace('Niacragua', 'Nicaragua', regex=False)
    .str.replace('Domincan Republic', 'Dominican Republic', regex=False)
)

In [7]:
## Let's define blend feature
# A row is flagged as a blend (1) when any of these holds:
#   * `species` contains a comma, a semicolon or the word "blend" (case-insensitive);
#   * `country` is exactly one character long (presumably a blank placeholder
#     left over from cleaning - TODO confirm);
#   * `country` lists more than one comma-separated name.
# The pattern has no capture group, so pandas does not emit its
# "pattern has match groups" UserWarning.
choko['is_blend'] = np.where(
    np.logical_or(
        np.logical_or(choko['species'].str.lower().str.contains(',|blend|;'),
                      choko['country'].str.len() == 1),
        choko['country'].str.contains(',', regex=False)
    )
    , 1
    , 0
)
## How many blends/pure cocoa?
choko['is_blend'].value_counts()

  after removing the cwd from sys.path.


0    1096
1     699
Name: is_blend, dtype: int64

In [8]:
# 1 when the cleaned `country` string is identical to `company_location`, else 0.
choko['is_domestic'] = np.where(
    choko['country'].eq(choko['company_location']),
    1,
    0,
)
# Tally of both classes.
choko['is_domestic'].value_counts()

0    1590
1     205
Name: is_domestic, dtype: int64

In [9]:
# Show the current column names, including the two engineered flags.
choko.columns

Index(['company', 'species', 'REF', 'review_year', 'cocoa_p',
       'company_location', 'rating', 'bean_typ', 'country', 'is_blend',
       'is_domestic'],
      dtype='object')

In [10]:
# Remove the `bean_typ` column from the working frame.
choko = choko.drop('bean_typ', axis='columns')

In [11]:
# Show the dtype of every remaining column.
choko.dtypes

company              object
species              object
REF                   int64
review_year           int64
cocoa_p             float64
company_location     object
rating              float64
country              object
is_blend              int64
is_domestic           int64
dtype: object

In [12]:
# Target column name, held in a list so the selection below yields a DataFrame.
# `y` is reassigned to a 1-D array in a later cell.
label = ['rating']
y = choko.loc[:, label]


In [13]:
# Store the two integer columns as object dtype; both are listed among the
# categorical columns that are one-hot encoded further down.
choko = choko.astype(dict.fromkeys(['REF', 'review_year'], object))

In [14]:
# Columns passed to one-hot encoding.
cat_columns = [
    'company',
    'species',
    'REF',
    'review_year',
    'company_location',
    'country',
    'is_blend',
    'is_domestic',
]
# Columns used as numeric features without encoding.
num_columns = ['cocoa_p']

In [15]:
# One-hot encode the categorical feature columns.
one_hot_cat_df = pd.get_dummies(data=choko.loc[:, cat_columns])

In [16]:
# Feature matrix: encoded categoricals followed by the numeric columns.
choko_df = pd.concat(
    objs=[one_hot_cat_df, choko.loc[:, num_columns]],
    axis='columns',
)
# Target as a plain 1-D NumPy array.
y = choko['rating'].to_numpy()

In [17]:
# Sequential (unshuffled) hold-out split: the leading 80% of rows are used for
# training and the trailing rows for testing.
test_size = 0.2
train_size = 1 - test_size
split_row = int(train_size * choko.shape[0])
# Slice by position with .iloc. A label-based `.loc[0:split_row]` includes its
# end label, which gave x_train one row more than the `y[0:split_row]` labels
# taken in the label cell below.
x_train = choko_df.iloc[:split_row, :]
# NOTE(review): the row at position `split_row` lands in neither set, matching
# the `y[split_row + 1:]` label slice; x_test is an ndarray while x_train stays
# a DataFrame. Both are kept to avoid changing what later cells receive.
x_test = choko_df.iloc[split_row + 1:, :].values


In [24]:
# Positional label slices: the first `split_row` labels for training and
# everything after index `split_row` for testing (index `split_row` itself is
# in neither slice).
y_train, y_test = y[:split_row], y[split_row + 1:]

In [26]:
# Display the held-out labels.
y_test

array([3.75, 3.  , 2.  , 3.5 , 2.  , 2.  , 3.  , 2.75, 2.75, 3.25, 3.5 ,
       2.75, 3.5 , 3.  , 3.25, 2.  , 2.5 , 3.25, 3.5 , 3.25, 3.5 , 2.5 ,
       3.  , 3.25, 3.25, 3.  , 3.5 , 3.5 , 3.75, 3.  , 3.  , 3.  , 3.25,
       3.25, 3.75, 3.75, 3.75, 2.75, 3.  , 3.5 , 3.75, 4.  , 3.75, 3.  ,
       3.75, 3.25, 3.25, 3.5 , 3.75, 3.75, 4.  , 2.75, 3.5 , 3.  , 3.75,
       1.5 , 3.  , 3.25, 3.5 , 3.  , 2.75, 2.75, 3.25, 3.25, 3.  , 3.  ,
       3.  , 3.25, 3.25, 3.5 , 3.5 , 3.5 , 3.5 , 3.75, 3.75, 4.  , 3.25,
       3.5 , 4.  , 3.5 , 3.75, 3.25, 3.5 , 3.75, 3.25, 3.75, 3.25, 2.75,
       3.75, 3.25, 3.75, 4.  , 4.  , 4.  , 3.25, 4.  , 3.25, 3.75, 4.  ,
       4.  , 3.5 , 3.75, 3.5 , 3.5 , 2.75, 3.5 , 4.  , 4.  , 3.75, 3.5 ,
       3.75, 3.  , 3.5 , 3.  , 3.5 , 3.75, 3.75, 3.25, 3.5 , 3.25, 3.5 ,
       3.5 , 3.25, 3.25, 3.5 , 3.  , 3.  , 3.25, 3.75, 3.5 , 2.75, 3.5 ,
       3.75, 3.75, 2.75, 3.  , 3.  , 2.25, 3.  , 3.  , 3.  , 3.  , 3.  ,
       2.75, 3.25, 3.5 , 3.5 , 2.5 , 2.5 , 2.75, 2.