In [1]:
import ontospy
import pandas as pd
from ontodocs.viz.viz_html_single import *
import rdflib
import pronto
import inflection
import re
import numpy as npTrain
from tqdm import *
import json
import os
import sys
import codecs
import collections
import nltk
from lxml import etree
from lxml.html.clean import Cleaner
from pymystem3 import Mystem
from nltk.corpus import stopwords

# Анализ онтологии

Получаем элементы онтологии, которые являются терминальными и исходят из класса Wine

In [2]:
import rdflib
g = rdflib.Graph()
g.load('./wine.rdf')

Составляем фрагмент онтологии

In [None]:
classes = ["AlsatianWine", "AmericanWine", "Beaujolais", "Bordeaux", "Medoc", "Margaux", "Pauillac", "RedBordeaux", "Sauternes", "StEmilion", "WhiteBordeaux", "Burgundy", "RedBurgundy", "CotesDOr", "WhiteBurgundy", "Meursault", "CabernetFranc", "CabernetSauvignon", "CaliforniaWine", "Chardonnay", "CheninBlanc", "DessertWine", "IceWine", "SweetRiesling", "DryWine", "DryRedWine", "DryWhiteWine", "EarlyHarvest", "FrenchWine", "FullBodiedWine", "Gamay", "GermanWine", "ItalianWine", "Chianti", "LateHarvest", "IceWine", "Sauternes", "Loire", "Anjou", "Muscadet", "Sancerre", "Tours", "WhiteLoire", "Meritage", "Merlot", "PetiteSyrah", "PinotBlanc", "PinotNoir", "RedWine", "DryRedWine", "Port", "RedBordeaux", "RedBurgundy", "CotesDOr", "Riesling", "DryRiesling", "SweetRiesling", "RoseWine", "SemillonOrSauvignonBlanc", "SauvignonBlanc", "Semillon", "SweetWine", "TableWine", "RedTableWine", "WhiteTableWine", "TexasWine", "WhiteWine", "DryWhiteWine", "WhiteBordeaux", "WhiteBurgundy", "Meursault", "WhiteLoire", "WhiteNonSweetWine", "Zinfandel"]

In [None]:
whitelist_features = ["hasBody", "hasSugar", "hasFlavor", "hasColor", "madeFromGrape"]
def find_features(graph, class_name):
    features = {}
    for s, p, o in graph:
        selected_class = str(s).split("#")[-1]
        selected_feature = str(p).split("#")[-1]
        if ((class_name == selected_class) and (selected_feature in whitelist_features)):
            features[selected_feature] = str(o).split("#")[-1]
    return features

def find_instances(graph, class_name):
    instances = []
    for s, p, o in graph:
        selected_class = str(o).split("#")[-1]
        selected_feature = str(p).split("#")[-1]
        if ((class_name == selected_class) and (selected_feature == "type")):
            instances.append(str(s).split("#")[-1])
    return instances

In [None]:
instances = [find_instances(g, c) + [c] for c in classes]
instances = [i for l in instances for i in l]

In [None]:
instances

# Анализ отзывов

In [None]:
reviews = pd.read_csv("./wine-reviews/winemag-data-130k-v2.csv")

In [None]:
reviews

Какие сомелье участвовали

In [None]:
reviews.groupby("taster_name").describe()["points"][["count"]]

Какие вина описывались и как они соотносятся с онтологией

In [None]:
reviews["title"]

In [None]:
reviews.groupby("variety").describe()["points"][["count"]]

In [None]:
types = [inflection.underscore(k).split("_") for k in ontology.keys()]

In [None]:
reviews.groupby("variety").describe()["points"][["count"]].sort_values(by=["count"])

Сопоставляем виноград по названию

Сопоставляем вина по названию

In [None]:
reviews["title_array"] = reviews["title"].apply(lambda x: re.sub("[\(\)\.-]", "", x)).str.lower().str.split()

In [None]:
def jaccard(x, y):
    x = list(set(x))
    y = list(set(y))
    intersection = list(set([a for a in x if a in y]))
    union = list(set(x + y))
    return len(intersection)/len(union)

In [None]:
reviews["max_jaccard"] = reviews["title_array"].apply(lambda x: np.max([jaccard(x, y) for y in types]))
reviews["closest_type"] = reviews["title_array"].apply(lambda x: types[np.argmax([jaccard(x, y) for y in types])])

In [None]:
reviews[reviews["max_jaccard"] > 0.4][["title_array", "closest_type", "max_jaccard"]].sort_values(by=["max_jaccard"], ascending=[0])

Составляем датасет

In [78]:
ontology = instances

In [79]:
def get_features(x):
    features = []
    for k, v in x.items():
        features.append(k.replace("has", "") + "Is" + v)
    return features

In [None]:
features = list(set(f for k, v in ontology.items() for f in get_features(v)))

In [None]:
wines = list(ontology.keys())

In [None]:
features

In [None]:
links = [(k, f) for k, v in ontology.items() for f in get_features(v)]

In [None]:
wine_nodes = [{"id": wine, "group": "wine"} for wine in wines]

In [None]:
feature_nodes = [{"id": feature, "group": "feature"} for feature in features]

In [None]:
nodes = wine_nodes + feature_nodes

In [None]:
graph_links = [{"source": wine, "target": feature} for wine, feature in links]

In [None]:
graph = {"nodes": nodes, "links": graph_links}

In [None]:
json.dumps(graph)

# Чистка отзывов

In [None]:
reviews["filtered_description"] = reviews["description"].str.lower().apply(lambda x: re.sub("[^\w\s]", "", x))

In [None]:
stopwords_list = set(stopwords.words('english'))

In [None]:
reviews["filtered_description"] = reviews["filtered_description"].str.lower().str.split().apply(lambda x: [w for w in x if w not in stopwords_list])

In [None]:
reviews["filtered_description"] = reviews["filtered_description"].apply(lambda x: [w for w in x if len(w) > 2])

In [27]:
frequencies = nltk.FreqDist([w for r in reviews["filtered_description"] for w in r])

In [28]:
low_frequency = np.percentile(list(frequencies.values()), 50)

In [29]:
reviews["filtered_description"] = reviews["filtered_description"].apply(lambda x: [w for w in x if frequencies[w] > low_frequency])

In [None]:
mystem = Mystem()

In [None]:
def lemmatize_words(word_list):
    processed_word_list = []
    for word in word_list:
        word = mystem.lemmatize(word)[0]
        processed_word_list.append(word)
    return processed_word_list    

In [None]:
lemmas = []
for r in tqdm(reviews["filtered_description"]):
    lemmas.append(lemmatize_words(r))
reviews["filtered_description"] = lemmas

# Формирование выборки и классификация

In [210]:
from sklearn.model_selection import train_test_split

In [211]:
selected_reviews = reviews

In [212]:
from keras.preprocessing import text
texts = selected_reviews["filtered_description"].tolist()

In [236]:
import pdb
words = {}

def get_word_index(w):
    if w not in words.keys():
        words[w] = len(words.keys()) + 1
    return words[w]

In [215]:
def extract_feature(x, feature):
    if feature not in instances[x].keys():
        return None
    if type(instances[x][feature]) is list:
        return np.random.choice(instances[x][feature])
    return instances[x][feature]
    
def extract_flavor(x):
    return extract_feature(x, "hasColor")

selected_reviews["color"] = selected_reviews["wine_from_ontology"].apply(extract_flavor)

In [216]:
selected_reviews["color"].value_counts()

Red      61182
White    23891
Rose        38
Name: color, dtype: int64

In [217]:
y = pd.get_dummies(selected_reviews, columns=["color"])[["color_Red", "color_White", "color_Rose"]].as_matrix()

In [229]:
all_wines = selected_reviews["wine_from_ontology"].unique()
train_wines = np.random.choice(selected_reviews["wine_from_ontology"].unique(), size=int(all_wines.shape[0]*0.5))

In [231]:
train = selected_reviews[selected_reviews["wine_from_ontology"].apply(lambda x: x in train_wines)]

In [234]:
test = selected_reviews.drop(train.index)

In [237]:
X_train = [[get_word_index(w) for w in text] for text in tqdm(train["filtered_description"])]

100%|██████████| 63837/63837 [00:03<00:00, 18768.93it/s]


In [239]:
y_train = y[train.index]

In [238]:
X_test = [[get_word_index(w) for w in text] for text in tqdm(test["filtered_description"])]

100%|██████████| 66134/66134 [00:03<00:00, 19139.76it/s]


In [240]:
y_test = y[test.index]

In [241]:
# set parameters:
max_features = 20000  # vocabulary size
maxlen = 100  # maximum length of the review
batch_size = 32
embedding_dims = 20
ngram_filters = [3, 5, 7]
nb_filter = 1200  # number of filters for each ngram_filter
nb_epoch = 5

Преобразовать к bag of words каждое предложение

In [242]:
'''This scripts implements Kim's paper "Convolutional Neural Networks for Sentence Classification"
with a very small embedding size (20) than the commonly used values (100 - 300) as it gives better
result with much less parameters.

Run on GPU: THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_cnn.py

Get to 0.853 test accuracy after 5 epochs. 13s/epoch on Nvidia GTX980 GPU.
'''

from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Input, Add
from keras.layers.core import Dropout, Activation, Flatten, Dense
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D

# prepare data
print('Loading data...')
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Loading data...
63837 train sequences
66134 test sequences
Pad sequences (samples x time)
X_train shape: (63837, 100)
X_test shape: (66134, 100)


In [243]:
input_layer =  Input(shape=(maxlen,))
embeddings = Embedding(max_features, embedding_dims, input_length=maxlen)(input_layer)
dropout = Dropout(0.)(embeddings)
outs = []
for n_gram in ngram_filters:
    convolution = Convolution1D(filters=nb_filter,
                                 kernel_size=n_gram,
                                 padding='valid',
                                 activation='relu',
                                 strides=1,
                                 input_shape=(embedding_dims, maxlen))(dropout)
    pooling = MaxPooling1D(pool_size=maxlen - n_gram + 1)(convolution)
    flatten = Flatten()(pooling)
    outs.append(flatten)
added = Add()(outs) 
dropout = Dropout(0.)(added)
dense = Dense(3, input_dim=nb_filter * len(ngram_filters))(dropout)
activation = Activation('sigmoid')(dense)

model = Model(inputs=input_layer, outputs=activation)

In [244]:
# train model
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=nb_epoch,
          validation_data=(X_test, y_test),)
# acc = accuracy(y_test,
#                np.round(np.array(model.predict({'input': X_test},
#                                                batch_size=batch_size)['output'])))
# print('Test accuracy:', acc)

Train on 63837 samples, validate on 66134 samples
Epoch 1/5
Epoch 2/5
  800/63837 [..............................] - ETA: 43:04 - loss: 0.1669 - acc: 0.9412

KeyboardInterrupt: 

Можно взять несколько отзывов

In [263]:
train_wine_colors = {}

In [267]:
train_wine_colors

{'Anjou': 'Red',
 'CabernetSauvignon': 'Red',
 'Chardonnay': 'White',
 'CheninBlanc': 'White',
 'Chianti': 'Red',
 'DessertWine': 'White',
 'DryRedWine': 'Red',
 'EarlyHarvest': 'Red',
 'FrenchWine': 'Red',
 'Gamay': 'Red',
 'ItalianWine': 'Red',
 'Loire': 'White',
 'Margaux': 'Red',
 'Muscadet': 'White',
 'PetiteSyrah': 'Red',
 'RedBordeaux': 'Red',
 'Riesling': 'White',
 'SemillonOrSauvignonBlanc': 'White',
 'StEmilion': 'Red',
 'SweetRiesling': 'White',
 'TexasWine': 'Red',
 'Tours': 'Red',
 'WhiteBordeaux': 'White'}

In [264]:
for w in train_wines:
    wine_reviews = reviews[reviews["wine_from_ontology"] == w]["filtered_description"][0:5]
    wine_reviews_texts = [[get_word_index(w) for w in text] for text in wine_reviews]
    wine_reviews_padded_texts = sequence.pad_sequences(wine_reviews_texts, maxlen=maxlen)
    train_wine_colors[w] = ["Red", "White", "Rose"][np.argmax(model.predict(wine_reviews_padded_texts).mean(axis=0))]

In [265]:
test_wine_colors = {}

In [268]:
test_wine_colors

{'AlsatianWine': 'White',
 'AmericanWine': 'Red',
 'Beaujolais': 'Red',
 'Bordeaux': 'White',
 'Burgundy': 'White',
 'CabernetFranc': 'Red',
 'CaliforniaWine': 'Red',
 'CotesDOr': 'White',
 'DryRiesling': 'White',
 'DryWhiteWine': 'White',
 'DryWine': 'White',
 'GermanWine': 'White',
 'IceWine': 'White',
 'LateHarvest': 'White',
 'Meritage': 'Red',
 'Merlot': 'Red',
 'Meursault': 'White',
 'Pauillac': 'Red',
 'PinotBlanc': 'White',
 'PinotNoir': 'Red',
 'Port': 'Red',
 'RedBurgundy': 'Red',
 'RedTableWine': 'Red',
 'RedWine': 'Red',
 'RoseWine': 'Red',
 'Sancerre': 'White',
 'Sauternes': 'White',
 'SauvignonBlanc': 'White',
 'Semillon': 'White',
 'SweetWine': 'White',
 'TableWine': 'Red',
 'WhiteLoire': 'White',
 'WhiteNonSweetWine': 'White',
 'WhiteWine': 'Red',
 'Zinfandel': 'Red'}

In [266]:
for w in test["wine_from_ontology"].unique():
    wine_reviews = reviews[reviews["wine_from_ontology"] == w]["filtered_description"][0:5]
    wine_reviews_texts = [[get_word_index(w) for w in text] for text in wine_reviews]
    wine_reviews_padded_texts = sequence.pad_sequences(wine_reviews_texts, maxlen=maxlen)
    test_wine_colors[w] = ["Red", "White", "Rose"][np.argmax(model.predict(wine_reviews_padded_texts).mean(axis=0))]

In [245]:
train_answers = model.predict(X_train)

KeyboardInterrupt: 

In [None]:
test_answers = model.predict(X_test)

In [98]:
padded_X = sequence.pad_sequences(X, maxlen=maxlen)

In [None]:
predicted_y = model.predict(X_test)

In [None]:
answers = np.array(["Red", "White", "Rose"])[np.argmax(predicted_y, axis=1)]

In [None]:
answers

In [246]:
model.save('color_classification2.h5')

Признаки, связанные с ароматом

# Создать онтологию на основе полученной модели

In [54]:
whitelist_features = ["hasBody", "hasSugar", "hasFlavor", "hasColor", "madeFromGrape", 'locatedIn', 'hasMaker']
def find_features(graph, class_name, parent_name):
    features = {}
    for s, p, o in graph:
        selected_class = str(s).split("#")[-1]
        selected_feature = str(p).split("#")[-1]
        if ((class_name == selected_class) and (selected_feature in whitelist_features)):
            features[selected_feature] = str(o).split("#")[-1]
    features['parent'] = parent_name
    features['new'] = True
    return features

def find_instances(graph, class_name):
    instances = []
    for s, p, o in graph:
        selected_class = str(o).split("#")[-1]
        selected_feature = str(p).split("#")[-1]
        if ((class_name == selected_class) and (selected_feature == "type")):
            instances.append(str(s).split("#")[-1])
    return instances

In [55]:
instances = {}

In [56]:
for k, v in start_ontology.items():
    instances[k] = v

In [57]:
error_features = ['madeFromGrape', 'locatedIn', 'hasMaker']
def preprocess_instance(instance):
    if "new" in instance.keys():
        for feature in error_features:
            if feature in instance.keys():
                instance[feature] = " ".join([inflection.titleize(w) for w in inflection.underscore(instance[feature].replace("Region", "")).split("_")])

In [58]:
for k, i in instances.items():
    preprocess_instance(i)

- Обработка массива родителей
    - Пропагейт по каждому
    - Каждому увеличиваем вложенность
- Обработка массива признаков
    - Матчинг: 
        - Сорт винограда - обычный матчинг
        - Регион - страна, провинция, регион1, регион2
        - Винодельная - обычный матчинг
    - Параметры выборки: выбираем случайный признак

In [59]:
# Проп
    # Если есть уровень вложенности - пропускаем
    # Для каждого родителя
        # Вызываем проп
        # Копируем свойства
        # Увеличиваем уровень вложенности

def propagate_features(instance, instances):
#     if not "level" in instance.keys():
    instance["level"] = 0
    parents = []
    if 'parent' in instance.keys():
        parents = instance['parent']
    if type(parents) is not list:
        parents = [parents]
    for p in parents:
        if p is not "wine":
            propagate_features(instances[p], instances)
            for k, v in instances[p].items():
                if k not in instance.keys():
                    instance[k] = instances[p][k]
            instance['level'] = max(instance['level'], instances[p]['level'] + 1)

In [60]:
# Для каждого узла
    # Вызываем проп
for k, v in instances.items():
    propagate_features(v, instances)

In [61]:
for k, v in instances.items():
    if 'madeFromGrape' in v.keys():
        if type(v['madeFromGrape']) is list:
            v['madeFromGrape'] = set(v['madeFromGrape'])

In [62]:
for k, v in instances.items():
    v['name_array'] = inflection.underscore(k).split("_")

In [63]:
def compare_variety(instance, row):
    if 'madeFromGrape' not in instance.keys():
        return 0
    if type(instance["madeFromGrape"]) is list:
        return int(row["variety"] in instance["madeFromGrape"]) / len(instance["madeFromGrape"])
    else:
        return int(row["variety"] == instance["madeFromGrape"])

def compare_region(instance, row):
    if 'locatedIn' not in instance.keys():
        return 0
    return int((row["region_1"] == instance["locatedIn"]) or \
            (row["region_2"] == instance["locatedIn"]) or \
            (row["country"] == instance["locatedIn"]) or \
            (row["province"] == instance["locatedIn"]))

def compare_maker(instance, row):
    if 'hasMaker' not in instance.keys():
        return 0
    return int(row['winery'] == instance['hasMaker'])

def compare_name(instance, row):
    return jaccard(instance["name_array"], row["title_array"])

# Для каждого узла
    # Сравниваем сорт
    # Сравниваем регион
    # Сравниваем винодельную
    # Сравниваем название
    # Выбираем самый вложенный, если совпадают
    
wine_from_ontology = []
for i in tqdm(reviews.index):
    r = reviews.loc[i]
    wine_points = []
    wines = []
    levels = []
    for k, v in instances.items():
        points = compare_variety(v, r) + compare_region(v, r) + compare_maker(v, r) + compare_name(v, r)
        wine_points.append(points)
        wines.append(k)
        levels.append(v['level'])
    
    wine_from_ontology.append(wines[np.argmax(wine_points)])

  0%|          | 0/129971 [00:00<?, ?it/s]


NameError: name 'jaccard' is not defined

In [None]:
reviews["wine_from_ontology"] = wine_from_ontology
reviews["points"] = points

In [None]:
reviews.to_csv("reviews_with_matching.csv")

# Использование модели для получения признаков

In [23]:
reviews = pd.read_csv("reviews_with_matching.csv")

In [24]:
from keras.models import load_model
model = load_model('color_classification.h5')

In [25]:
reviews["filtered_description"] = reviews["filtered_description"].apply(lambda x: eval(x))

In [30]:
texts = reviews["filtered_description"].tolist()

In [31]:
import pdb
words = {}

def get_word_index(w):
    if w not in words.keys():
        words[w] = len(words.keys()) + 1
    return words[w]

In [32]:
X = [[get_word_index(w) for w in text] for text in tqdm(texts)]

100%|██████████| 129971/129971 [00:06<00:00, 18793.19it/s]


In [35]:
from keras.preprocessing import sequence
X_padded = sequence.pad_sequences(X, maxlen=maxlen)

In [36]:
batches = []

In [37]:
for i in tqdm(range(int(len(X_padded) / batch_size))):
    batches.append(model.predict(X_padded[i*batch_size:(i+1)*batch_size]))

100%|██████████| 4061/4061 [30:38<00:00,  2.21it/s]


In [None]:
answers = np.array(["Red", "White", "Rose"])[np.argmax(predicted_y, axis=1)]

In [40]:
np.save('batches.npy', batches)

In [45]:
batches.append(model.predict(X_padded[len(batches) * batch_size:]))

In [46]:
batches = [x for l in batches for x in l]

In [48]:
predicted_y = batches

In [49]:
np.save('predicted_y.npy', predicted_y)

In [67]:
reviews["guessed_color"] = np.array(["Red", "White", "Rose"])[np.argmax(predicted_y, axis=1)]

In [68]:
reviews["guessed_color"]

0           Red
1           Red
2         White
3         White
4           Red
5           Red
6           Red
7           Red
8         White
9           Red
10          Red
11          Red
12          Red
13          Red
14        White
15        White
16          Red
17          Red
18          Red
19          Red
20          Red
21          Red
22        White
23          Red
24          Red
25          Red
26        White
27          Red
28          Red
29        White
          ...  
129941    White
129942      Red
129943      Red
129944      Red
129945      Red
129946    White
129947      Red
129948      Red
129949    White
129950      Red
129951      Red
129952      Red
129953      Red
129954      Red
129955    White
129956      Red
129957      Red
129958      Red
129959    White
129960      Red
129961      Red
129962      Red
129963      Red
129964    White
129965      Red
129966    White
129967      Red
129968      Red
129969      Red
129970      Red
Name: guessed_color, Len

In [73]:
matching = pd.get_dummies(reviews, columns=["guessed_color"])[["wine_from_ontology", "guessed_color_Red", "guessed_color_White"]]

In [77]:
matching = matching.groupby("wine_from_ontology").mean()

In [65]:
def extract_feature(x, feature):
    if feature not in instances[x].keys():
        return None
    if type(instances[x][feature]) is list:
        return np.random.choice(instances[x][feature])
    return instances[x][feature]
    
def extract_flavor(x):
    return extract_feature(x, "hasColor")

reviews["color"] = reviews["wine_from_ontology"].apply(extract_flavor)

# Визуализация для выборки

In [208]:
set_wines = list(reviews.sample(frac=0.10)["wine_from_ontology"].unique())
all_wines = list(reviews.sample(frac=0.10)["wine_from_ontology"].unique())
difference = [w for w in set_wines if w not in all_wines]
len(difference)

7

In [209]:
difference

['RedBordeaux',
 'SemillonOrSauvignonBlanc',
 'IceWine',
 'SweetWine',
 'WhiteLoire',
 'WhiteNonSweetWine',
 'DryRedWine']

In [97]:
guessed_color_matching = pd.get_dummies(reviews, columns=["guessed_color"])[["wine_from_ontology", "guessed_color_Red", "guessed_color_White"]]
guessed_color_matching = guessed_color_matching.groupby("wine_from_ontology").mean()

In [98]:
color_matching = pd.get_dummies(reviews, columns=["color"])[["wine_from_ontology", "color_Red", "color_White"]]
color_matching = color_matching.groupby("wine_from_ontology").mean()

In [127]:
objects = [{"id": c, "type": "wine"} for c in guessed_color_matching.index]

In [128]:
features = [{"id": "Red", "type": "feature"}, {"id": "White", "type": "feature"}]

In [129]:
links = []

In [130]:
for i, r in color_matching.iterrows():
    if (r.sum() > 0):
        links.append({
            "source": i,
            "target": np.argmax(r).replace("color_", ""),
            "type": "original"
        })

In [131]:
for i, r in guessed_color_matching.iterrows():
    links.append({
        "source": i,
        "target": np.argmax(r).replace("guessed_color_", ""),
        "type": "guessed"
    })

In [132]:
graph = {
    "nodes": list(objects) + list(features),
    "links": links
}

In [133]:
import json
json.dumps(graph)

'{"links": [{"target": "Red", "type": "original", "source": "Beaujolais"}, {"target": "Red", "type": "original", "source": "CabernetFranc"}, {"target": "Red", "type": "original", "source": "CabernetSauvignon"}, {"target": "White", "type": "original", "source": "Chardonnay"}, {"target": "White", "type": "original", "source": "CheninBlanc"}, {"target": "Red", "type": "original", "source": "Chianti"}, {"target": "Red", "type": "original", "source": "CotesDOr"}, {"target": "Red", "type": "original", "source": "DryRedWine"}, {"target": "White", "type": "original", "source": "DryRiesling"}, {"target": "White", "type": "original", "source": "DryWhiteWine"}, {"target": "White", "type": "original", "source": "IceWine"}, {"target": "Red", "type": "original", "source": "Margaux"}, {"target": "Red", "type": "original", "source": "Meritage"}, {"target": "Red", "type": "original", "source": "Merlot"}, {"target": "White", "type": "original", "source": "Meursault"}, {"target": "Red", "type": "original

In [124]:
links

[{'source': 'Beaujolais', 'target': 'Red', 'type': 'original'},
 {'source': 'CabernetFranc', 'target': 'Red', 'type': 'original'},
 {'source': 'CabernetSauvignon', 'target': 'Red', 'type': 'original'},
 {'source': 'Chardonnay', 'target': 'White', 'type': 'original'},
 {'source': 'CheninBlanc', 'target': 'White', 'type': 'original'},
 {'source': 'Chianti', 'target': 'Red', 'type': 'original'},
 {'source': 'CotesDOr', 'target': 'Red', 'type': 'original'},
 {'source': 'DryRedWine', 'target': 'Red', 'type': 'original'},
 {'source': 'DryRiesling', 'target': 'White', 'type': 'original'},
 {'source': 'DryWhiteWine', 'target': 'White', 'type': 'original'},
 {'source': 'IceWine', 'target': 'White', 'type': 'original'},
 {'source': 'Margaux', 'target': 'Red', 'type': 'original'},
 {'source': 'Meritage', 'target': 'Red', 'type': 'original'},
 {'source': 'Merlot', 'target': 'Red', 'type': 'original'},
 {'source': 'Meursault', 'target': 'White', 'type': 'original'},
 {'source': 'Pauillac', 'target'

In [295]:
wines = []

In [296]:
for w in train_wine_colors.keys():
    wines.append({
        "id": w,
        "group": "wine"
    })

In [297]:
for w in test_wine_colors.keys():
    wines.append({
        "id": w,
        "group": "wine",
        "new": True
    })

In [298]:
wines.append({
    "id": "Red",
    "group": "feature"
})

In [299]:
wines.append({
    "id": "White",
    "group": "feature"
})

In [306]:
links = []

In [307]:
# for k, v in train_wine_colors.items():
#     links.append({
#         "source": k,
#         "target": v,
#         "group": "original"
#     })

In [308]:
for i, r in reviews[["wine_from_ontology", "color"]].drop_duplicates().iterrows():
    if r["color"]:
        links.append({
            "source": r["wine_from_ontology"],
            "target": r["color"],
            "group": "original"
        })

In [309]:
for k, v in test_wine_colors.items():
    links.append({
        "source": k,
        "target": v,
        "group": "guessed"
    })

In [310]:
graph = {
    "nodes": wines,
    "links": links
}

In [311]:
json.dumps(graph)

'{"links": [{"target": "Red", "source": "Port", "group": "original"}, {"target": "White", "source": "Riesling", "group": "original"}, {"target": "Red", "source": "PinotNoir", "group": "original"}, {"target": "Red", "source": "Margaux", "group": "original"}, {"target": "Red", "source": "CabernetSauvignon", "group": "original"}, {"target": "White", "source": "Chardonnay", "group": "original"}, {"target": "Red", "source": "Merlot", "group": "original"}, {"target": "White", "source": "CheninBlanc", "group": "original"}, {"target": "Red", "source": "Beaujolais", "group": "original"}, {"target": "White", "source": "SauvignonBlanc", "group": "original"}, {"target": "Red", "source": "Chianti", "group": "original"}, {"target": "Red", "source": "CabernetFranc", "group": "original"}, {"target": "Red", "source": "Zinfandel", "group": "original"}, {"target": "White", "source": "DryRiesling", "group": "original"}, {"target": "White", "source": "PinotBlanc", "group": "original"}, {"target": "Red", "s