In [67]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [68]:
ingr = pd.read_csv('/data/foodboost/ingredients.csv', index_col=0)
nutr = pd.read_csv('/data/foodboost/nutritions.csv', index_col=0)
recp = pd.read_csv('/data/foodboost/recipes.csv', index_col=0)
tags = pd.read_csv('/data/foodboost/tags.csv', index_col=0)

nutr_cal = nutr.loc[nutr['nutrition'] == 'energie']
nutr_eiw = nutr.loc[nutr['nutrition'] == 'eiwit']
nutr_kol = nutr.loc[nutr['nutrition'] == 'koolhydraten']
nutr_sui = nutr.loc[nutr['nutrition'] == 'waarvan suikers']
nutr_nat = nutr.loc[nutr['nutrition'] == 'natrium']
nutr_vet = nutr.loc[nutr['nutrition'] == 'vet']
nutr_zad = nutr.loc[nutr['nutrition'] == 'waarvan verzadigd']
nutr_vez = nutr.loc[nutr['nutrition'] == 'vezels']

In [69]:
nutr_new = pd.merge(nutr_cal, nutr_eiw[['recipe', 'value']], how = 'outer', left_on = 'recipe', right_on = 'recipe', suffixes = ('_calorieën','_eiwitten'))
nutr_new.fillna('0 g', inplace=True)
nutr_new = pd.merge(nutr_new, nutr_kol[['recipe', 'value']], how = 'outer', left_on = 'recipe', right_on = 'recipe')
nutr_new.fillna('0 g', inplace=True)
nutr_new = pd.merge(nutr_new, nutr_sui[['recipe', 'value']], how = 'outer', left_on = 'recipe', right_on = 'recipe', suffixes = ('_koolhydraten','_suikers'))
nutr_new.fillna('0 g', inplace=True)
nutr_new = pd.merge(nutr_new, nutr_nat[['recipe', 'value']], how = 'outer', left_on = 'recipe', right_on = 'recipe')
nutr_new.fillna('0 mg', inplace=True)
nutr_new = pd.merge(nutr_new, nutr_vet[['recipe', 'value']], how = 'outer', left_on = 'recipe', right_on = 'recipe', suffixes = ('_natrium','_vet'))
nutr_new.fillna('0 g', inplace=True)
nutr_new = pd.merge(nutr_new, nutr_zad[['recipe', 'value']], how = 'outer', left_on = 'recipe', right_on = 'recipe')
nutr_new.fillna('0 g', inplace=True)
nutr_new = pd.merge(nutr_new, nutr_vez[['recipe', 'value']], how = 'outer', left_on = 'recipe', right_on = 'recipe', suffixes = ('_verzadigd','_vezels'))
nutr_new.fillna('0 g', inplace=True)

nutr_new['calorieën'] = nutr_new['value_calorieën'].map(lambda x: int(x.rstrip(' kcal')))
nutr_new['eiwitten'] = nutr_new['value_eiwitten'].map(lambda x: float(x.rstrip(' g')))
nutr_new['koolhydraten'] = nutr_new['value_koolhydraten'].map(lambda x: float(x.rstrip(' g')))
nutr_new['suikers'] = nutr_new['value_suikers'].map(lambda x: float(x.rstrip(' g')))
nutr_new['natrium'] = nutr_new['value_natrium'].map(lambda x: float(x.rstrip(' mg')))
nutr_new['vet'] = nutr_new['value_vet'].map(lambda x: float(x.rstrip(' g')))
nutr_new['verzadigd'] = nutr_new['value_verzadigd'].map(lambda x: float(x.rstrip(' g')))
nutr_new['vezels'] = nutr_new['value_vezels'].map(lambda x: float(x.rstrip(' g')))
nutr_new = nutr_new.drop(columns=['value_calorieën', 'value_eiwitten', 'value_koolhydraten','value_suikers','value_natrium','value_vet','value_verzadigd','value_vezels','nutrition'])

nutr_new['WelofGeenNoot?'] = np.random.randint(0, 2, nutr_new.shape[0])
nutr_new['index'] = nutr_new.index
nutr_new

Unnamed: 0,recipe,calorieën,eiwitten,koolhydraten,suikers,natrium,vet,verzadigd,vezels,WelofGeenNoot?,index
0,Kruidnoten met choco-discodip,260,3.0,34.0,22.0,200.0,12.0,7.0,1.0,0,0
1,Kruidnoten in marsepein,265,3.0,43.0,34.0,120.0,9.0,4.0,1.0,0,1
2,Kruidnoten met chocodips,335,5.0,35.0,23.0,160.0,18.0,10.0,6.0,1,2
3,Pepernotentaart met marsepeinstrik,560,5.0,60.0,39.0,240.0,33.0,21.0,1.0,0,3
4,Perencake,265,5.0,36.0,21.0,120.0,11.0,2.0,1.0,1,4
...,...,...,...,...,...,...,...,...,...,...,...
8701,Paddenstoelen en courgettegratin,285,11.0,13.0,0.0,0.0,20.0,0.0,0.0,0,8701
8702,Peren-amandelcoupe,160,1.0,29.0,0.0,0.0,1.0,0.0,0.0,0,8702
8703,Ceviche met sint-jakobsschelpen,210,14.0,7.0,0.0,0.0,13.0,0.0,0.0,0,8703
8704,Pittige truffels,65,1.0,3.0,0.0,0.0,6.0,0.0,0.0,0,8704


In [62]:
pivottab = nutr_new.pivot(index='index', columns='WelofGeenNoot?', values=['calorieën','eiwitten'])
pivottab[np.isnan(pivottab)] = 0
pivottab

Unnamed: 0_level_0,calorieën,calorieën,eiwitten,eiwitten
WelofGeenNoot?,0,1,0,1
index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,260.0,0.0,3.0,0.0
1,0.0,265.0,0.0,3.0
2,0.0,335.0,0.0,5.0
3,0.0,560.0,0.0,5.0
4,0.0,265.0,0.0,5.0
...,...,...,...,...
8701,0.0,285.0,0.0,11.0
8702,160.0,0.0,1.0,0.0
8703,0.0,210.0,0.0,14.0
8704,0.0,65.0,0.0,1.0


In [63]:
pivsplit = pivottab
Xvars = ['calorieën', 'eiwitten']
X = pivsplit[Xvars]
y = pivsplit.iloc[:,0]

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [65]:
dtc = DecisionTreeClassifier()
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [66]:
print("accuraatheid: ", metrics.accuracy_score(y_test, y_pred))

accuraatheid:  0.9919632606199771
