In [None]:
import ontospy
import pandas as pd
import numpy as np
from ontodocs.viz.viz_html_single import *
import rdflib
import pronto
import inflection
import re
import numpy as npTrain
from tqdm import *
import json
import os
import sys
import codecs
import collections
import nltk
from lxml import etree
from lxml.html.clean import Cleaner
from pymystem3 import Mystem
from nltk.corpus import stopwords
import pdb
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Input, Add
from keras.layers.core import Dropout, Activation, Flatten, Dense
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.models import load_model

- Читаем онтологию из файла и расширяем ее
- Матчим онтологию и датасет
- Подготавливаем датасет
- Строим модель классификации текста
- Сохраняем результаты

# Анализ данных

In [None]:
# Read the wine review dataset from CSV (path relative to the working directory) into `reviews`.
reviews = pd.read_csv("./wine-reviews/winemag-data-130k-v2.csv")

In [None]:
# Tokenise every title: drop parentheses, dots and hyphens, lower-case, split on whitespace.
# The pattern is a raw string so the backslashes reach the regex engine verbatim instead of
# being parsed as (invalid) string escape sequences, which newer Python versions warn about.
reviews["title_array"] = reviews["title"].apply(lambda x: re.sub(r"[\(\)\.-]", "", x)).str.lower().str.split()

# Анализ онтологии

In [None]:
# Load the ontology (wine name -> dict of properties) from JSON.
# The context manager closes the file handle as soon as parsing finishes.
with open('wine_ontology.json') as ontology_file:
    instances = json.load(ontology_file)

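Копируем свойства родителей в каждого потомка:
    - Сбрасываем уровень вложенности узла в 0 (проверка «уровень уже вычислен — пропускаем» в коде сейчас отключена)
    - Для каждого родителя, кроме корня `wine`:
        - Рекурсивно вызываем `propagate_features` для родителя
        - Копируем свойства родителя, которых еще нет у потомка
        - Увеличиваем уровень вложенности: максимум из текущего уровня и уровня родителя + 1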

In [None]:
def propagate_features(instance, instances):
    """Inherit missing properties from ancestors and compute the nesting level.

    The node's ``level`` is reset to 0, then every parent other than the root
    ``"wine"`` is processed recursively first. Each property the node does not
    already have is copied from the parent, and ``level`` becomes the largest
    ``parent level + 1`` seen.

    Args:
        instance: property dict of the node to update (mutated in place).
        instances: mapping of node name -> property dict, used to resolve parents.

    Raises:
        KeyError: if a parent name (other than ``"wine"``) is missing from ``instances``.
    """
    instance["level"] = 0
    parents = instance['parent'] if 'parent' in instance else []
    # A single parent may be stored as a bare value; normalise it to a list.
    if type(parents) is not list:
        parents = [parents]
    for parent_name in parents:
        if parent_name == "wine":
            # The root carries nothing to inherit and adds no depth.
            continue
        parent = instances[parent_name]
        propagate_features(parent, instances)
        for key, value in parent.items():
            # Properties already present on the node win over inherited ones.
            instance.setdefault(key, value)
        instance['level'] = max(instance['level'], parent['level'] + 1)

In [None]:
# Run the inheritance pass for every ontology node.
for node in instances.values():
    propagate_features(node, instances)

Преобразовать признак madeFromGrape (для ускорения обработки)

In [None]:
# Turn list-valued madeFromGrape properties into sets; other values are left untouched.
for node in instances.values():
    grapes = node.get('madeFromGrape')
    if type(grapes) is list:
        node['madeFromGrape'] = set(grapes)

Сформировать фичу для дальнейшего матчинга имени

In [None]:
# Split each ontology name into lower-case tokens (via inflection.underscore) for title matching.
for wine_name, node in instances.items():
    snake_name = inflection.underscore(wine_name)
    node['name_array'] = snake_name.split("_")

Сматчить вина из онтологии и вина из отзывов

In [None]:
def jaccard(x, y):
    """Return the Jaccard similarity |x ∩ y| / |x ∪ y| of two token collections.

    Duplicates are ignored. Two empty inputs have similarity 0.0 rather than
    raising ZeroDivisionError.

    Args:
        x: iterable of hashable items.
        y: iterable of hashable items.

    Returns:
        A float between 0.0 and 1.0.
    """
    left, right = set(x), set(y)
    union = left | right
    if not union:
        return 0.0
    # float() keeps true division even without `from __future__ import division`.
    return len(left & right) / float(len(union))

In [None]:
def compare_variety(instance, row):
    """Score how well the review's grape variety matches the ontology wine.

    Args:
        instance: ontology property dict; may contain ``madeFromGrape`` as a
            single value or as a collection (list, tuple, set or frozenset).
        row: review record supporting ``row["variety"]``.

    Returns:
        0 when the wine has no ``madeFromGrape`` or the collection is empty;
        ``1 / len(grapes)`` when the variety is one of several grapes (0.0 if
        it is not); 1 or 0 for a single-valued property.
    """
    if 'madeFromGrape' not in instance:
        return 0
    grapes = instance["madeFromGrape"]
    # Elsewhere in this notebook the lists are converted to sets, so every collection
    # type has to be accepted here; a plain `is list` check never matches a set.
    if isinstance(grapes, (list, tuple, set, frozenset)):
        if not grapes:
            return 0
        # float() keeps true division even without `from __future__ import division`.
        return int(row["variety"] in grapes) / float(len(grapes))
    return int(row["variety"] == grapes)

def compare_region(instance, row):
    """Return 1 if the wine's ``locatedIn`` equals any geographic field of the review.

    The fields checked, in order, are ``region_1``, ``region_2``, ``country``
    and ``province``. Returns 0 when there is no match or the wine has no
    ``locatedIn`` property.
    """
    if 'locatedIn' not in instance:
        return 0
    geo_fields = ("region_1", "region_2", "country", "province")
    matched = any(row[field] == instance["locatedIn"] for field in geo_fields)
    return int(matched)

def compare_maker(instance, row):
    """Return 1 if the review's winery equals the wine's ``hasMaker``, otherwise 0.

    A wine without a ``hasMaker`` property always scores 0.
    """
    if 'hasMaker' in instance:
        same_maker = row['winery'] == instance['hasMaker']
        return int(same_maker)
    return 0

def compare_name(instance, row):
    """Return the Jaccard similarity between ontology name tokens and review title tokens."""
    ontology_tokens = instance["name_array"]
    title_tokens = row["title_array"]
    return jaccard(ontology_tokens, title_tokens)

In [None]:
# For every review, pick the ontology wine with the highest total similarity
# (variety + region + maker + name).
# The list of wine names does not change between reviews, so it is built once.
wine_names = list(instances.keys())
wine_from_ontology = []
for i in tqdm(reviews.index):
    r = reviews.loc[i]
    # Scores are produced in the same order as `wine_names`.
    wine_points = [
        compare_variety(v, r) + compare_region(v, r) + compare_maker(v, r) + compare_name(v, r)
        for v in instances.values()
    ]
    # np.argmax returns the first maximum, so ties go to the wine listed first in `instances`.
    wine_from_ontology.append(wine_names[np.argmax(wine_points)])

In [None]:
# Store the best-matching ontology wine name for each review as a new column.
reviews["wine_from_ontology"] = wine_from_ontology

In [None]:
# Write the reviews together with their ontology match to CSV in the working directory.
reviews.to_csv("reviews_with_matching.csv")

TODO Улучшить качество матчинга

# Чистка отзывов

In [None]:
# Lower-case each description and remove everything that is neither a word character nor whitespace.
# The pattern is a raw string so `\w` and `\s` are not parsed as (invalid) string escape sequences.
reviews["filtered_description"] = reviews["description"].str.lower().apply(lambda x: re.sub(r"[^\w\s]", "", x))

In [None]:
# English stop words from NLTK, stored as a set for fast membership checks in the filter below.
stopwords_list = set(stopwords.words('english'))

In [None]:
# Split each description into tokens and drop the stop words.
reviews["filtered_description"] = (
    reviews["filtered_description"]
    .str.lower()
    .str.split()
    .apply(lambda tokens: [token for token in tokens if token not in stopwords_list])
)

In [None]:
# Keep only tokens longer than two characters.
reviews["filtered_description"] = reviews["filtered_description"].apply(
    lambda tokens: [token for token in tokens if len(token) > 2]
)

In [None]:
# Corpus-wide occurrence count of every remaining token.
frequencies = nltk.FreqDist(
    [token for tokens in reviews["filtered_description"] for token in tokens]
)

In [None]:
# 50th percentile (median) of the per-token counts; used below as the rarity cut-off.
low_frequency = np.percentile(list(frequencies.values()), 50)

In [None]:
# Drop tokens whose corpus count does not exceed the cut-off.
reviews["filtered_description"] = reviews["filtered_description"].apply(
    lambda tokens: [token for token in tokens if frequencies[token] > low_frequency]
)

In [None]:
# One shared Mystem lemmatizer instance, reused by lemmatize_words.
# NOTE(review): Mystem is, as far as I know, built for Russian morphology, while the text here is
# filtered with English stop words — confirm it lemmatizes these reviews usefully.
mystem = Mystem()

In [None]:
def lemmatize_words(word_list):
    """Lemmatize each word separately with the shared ``mystem`` instance.

    Only the first element returned by ``mystem.lemmatize`` is kept for each
    word, so the result has the same length as ``word_list``.
    """
    return [mystem.lemmatize(word)[0] for word in word_list]

In [None]:
# Lemmatize the token list of every review, with a progress bar, and write the result back.
lemmas = [lemmatize_words(tokens) for tokens in tqdm(reviews["filtered_description"])]
reviews["filtered_description"] = lemmas

# Формирование выборки
### (Из подмножества вин)

In [None]:
# Plain Python list holding the cleaned token list of every review.
texts = reviews["filtered_description"].tolist()

In [None]:
def get_word_index(w, words):
    """Return the integer id of ``w``, registering it in ``words`` on first sight.

    Ids are handed out consecutively starting at 1, so 0 is never assigned to a word.

    Args:
        w: the token to look up.
        words: token -> id mapping, mutated in place when ``w`` is new.
    """
    if w in words:
        return words[w]
    next_index = len(words) + 1
    words[w] = next_index
    return next_index

In [None]:
def extract_feature(x, feature):
    """Return one value of ``feature`` for the ontology wine named ``x``.

    Args:
        x: key into the global ``instances`` mapping.
        feature: property name to read.

    Returns:
        None when the wine lacks the property or the property is an empty
        collection; a randomly chosen element when the property is a list,
        tuple, set or frozenset; otherwise the stored value itself.
    """
    wine = instances[x]
    if feature not in wine:
        return None
    value = wine[feature]
    # madeFromGrape lists are converted to sets elsewhere in this notebook, so sets must be
    # sampled as well; np.random.choice needs a sequence, hence the list() conversion.
    if isinstance(value, (list, tuple, set, frozenset)):
        candidates = list(value)
        if not candidates:
            return None
        return np.random.choice(candidates)
    return value
    
# Add one "<property>_feature" column per ontology property of interest.
for f in ["locatedIn", "madeFromGrape", "hasSugar", "hasBody", "hasFlavor", "hasColor"]:
    column_name = f + "_feature"
    reviews[column_name] = reviews["wine_from_ontology"].apply(
        lambda wine_name: extract_feature(wine_name, f)
    )

In [None]:
# Use the colour taken from the ontology as the classification label.
reviews["color"] = reviews["hasColor_feature"]

In [None]:
# Write the reviews with the extracted ontology features to CSV in the working directory.
reviews.to_csv("reviews_with_wine_features.csv")

In [None]:
# Show how many reviews fall into each colour class.
reviews["color"].value_counts()

In [None]:
# One-hot encode the colour label and keep the three target columns as a NumPy array.
# `.values` is used because `DataFrame.as_matrix()` was removed from pandas.
y = pd.get_dummies(reviews, columns=["color"])[["color_Red", "color_White", "color_Rose"]].values

Выбрать случайным образом стартовые узлы онтологии

In [None]:
# Share of distinct ontology wines whose reviews go into the training split.
ratio = 0.5
all_wines = reviews["wine_from_ontology"].unique()
# Sample without replacement: with the default replace=True the same wine can be drawn
# several times, which leaves fewer distinct training wines than `ratio` asks for.
train_wines = np.random.choice(all_wines, size=int(all_wines.shape[0] * ratio), replace=False)

In [None]:
# Training split: all reviews whose matched wine was drawn into train_wines.
is_train_wine = reviews["wine_from_ontology"].apply(lambda wine_name: wine_name in train_wines)
train = reviews[is_train_wine]

In [None]:
# Test split: every review that is not in the training split.
test = reviews.drop(train.index)

In [None]:
# Start a fresh vocabulary and encode each training review as a list of word ids.
words = {}
X_train = []
for tokens in tqdm(train["filtered_description"]):
    X_train.append([get_word_index(token, words) for token in tokens])

In [None]:
# Labels of the training rows.
# NOTE(review): `y` is indexed with the DataFrame index labels, which presumably coincide with
# row positions because `reviews` keeps its default index — confirm.
y_train = y[train.index]

In [None]:
# Encode the test reviews with the same vocabulary; unseen words receive new ids.
X_test = []
for tokens in tqdm(test["filtered_description"]):
    X_test.append([get_word_index(token, words) for token in tokens])

In [None]:
# Labels of the test rows.
# NOTE(review): `y` is indexed with the DataFrame index labels, which presumably coincide with
# row positions because `reviews` keeps its default index — confirm.
y_test = y[test.index]

In [None]:
# Flag every review with the split it belongs to.
# `.loc[rows, column]` assigns into `reviews` itself; `reviews[train.index]["train"] = ...`
# would look the row labels up as column names and assign into a temporary object.
reviews.loc[train.index, "train"] = True
reviews.loc[test.index, "train"] = False

# Классификация логистической регрессией

TODO имплементировать логистическую регрессию

# Классификация моделью Yoon Kim

In [None]:
# set parameters:
max_features = 20000  # vocabulary size
maxlen = 100  # maximum length of the review
batch_size = 32  # mini-batch size for training and for the batched prediction loop
embedding_dims = 20  # width of each word embedding vector
ngram_filters = [3, 5, 7]  # convolution kernel sizes, one branch per size
nb_filter = 1200  # number of filters for each ngram_filter
nb_epoch = 5  # number of training epochs

Привести каждую последовательность индексов слов к фиксированной длине `maxlen` (дополнение нулями или обрезка)

In [None]:
# prepare data
print('Loading data...')
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Bring every word-id sequence to exactly `maxlen` entries. X_train and X_test are rebound
# here, so this cell expects the un-padded lists built earlier as its input.
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

In [None]:
# Convolutional text classifier: shared embedding -> one Conv1D + max-pooling branch per
# n-gram size -> merged branches -> 3-unit sigmoid output (Red / White / Rose).

# get_word_index hands out ids 1..len(words) with no upper bound, so the embedding table needs
# at least len(words) + 1 rows; with only `max_features` rows larger ids would be out of range.
vocabulary_size = max(max_features, len(words) + 1)

input_layer = Input(shape=(maxlen,))
embeddings = Embedding(vocabulary_size, embedding_dims, input_length=maxlen)(input_layer)
dropout = Dropout(0.)(embeddings)  # rate 0. -> currently no units are dropped
outs = []
for n_gram in ngram_filters:
    convolution = Convolution1D(filters=nb_filter,
                                 kernel_size=n_gram,
                                 padding='valid',
                                 activation='relu',
                                 strides=1,
                                 input_shape=(embedding_dims, maxlen))(dropout)
    # With 'valid' padding and stride 1 the convolution yields maxlen - n_gram + 1 steps,
    # so this pool size collapses the whole sequence into one value per filter.
    pooling = MaxPooling1D(pool_size=maxlen - n_gram + 1)(convolution)
    flatten = Flatten()(pooling)
    outs.append(flatten)
# NOTE(review): Add() sums the branches element-wise (nb_filter features), while the Dense
# layer below declares input_dim=nb_filter * len(ngram_filters), which would match a
# concatenation of the branches — confirm which merge is intended.
added = Add()(outs)
dropout = Dropout(0.)(added)
dense = Dense(3, input_dim=nb_filter * len(ngram_filters))(dropout)
activation = Activation('sigmoid')(dense)

model = Model(inputs=input_layer, outputs=activation)

In [None]:
# Train with an independent binary cross-entropy per output unit (matching the sigmoid outputs).
# The test split is passed as validation data, so the reported validation metrics are test metrics.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=nb_epoch,
          validation_data=(X_test, y_test),)

In [None]:
# Save the trained model to a file in the working directory.
model.save('color_classification.h5')

In [None]:
# for w in train_wines:
#     wine_reviews = reviews[reviews["wine_from_ontology"] == w]["filtered_description"][0:5]
#     wine_reviews_texts = [[get_word_index(w) for w in text] for text in wine_reviews]
#     wine_reviews_padded_texts = sequence.pad_sequences(wine_reviews_texts, maxlen=maxlen)
#     train_wine_colors[w] = ["Red", "White", "Rose"][np.argmax(model.predict(wine_reviews_padded_texts).mean(axis=0))]

In [None]:
# for w in test["wine_from_ontology"].unique():
#     wine_reviews = reviews[reviews["wine_from_ontology"] == w]["filtered_description"][0:5]
#     wine_reviews_texts = [[get_word_index(w) for w in text] for text in wine_reviews]
#     wine_reviews_padded_texts = sequence.pad_sequences(wine_reviews_texts, maxlen=maxlen)
#     test_wine_colors[w] = ["Red", "White", "Rose"][np.argmax(model.predict(wine_reviews_padded_texts).mean(axis=0))]

In [None]:
# train_answers = model.predict(X_train)

In [None]:
# test_answers = model.predict(X_test)

In [None]:
# padded_X = sequence.pad_sequences(X, maxlen=maxlen)

In [None]:
# predicted_y = model.predict(X_test)

In [None]:
# answers = np.array(["Red", "White", "Rose"])[np.argmax(predicted_y, axis=1)]

# Сохранение восстановленного фрагмента онтологии

In [None]:
# Encode every review (train and test together) with the shared vocabulary.
X = []
for tokens in tqdm(texts):
    X.append([get_word_index(token, words) for token in tokens])

In [None]:
# Bring every word-id sequence to exactly `maxlen` entries, as was done for the training data.
X_padded = sequence.pad_sequences(X, maxlen=maxlen)

In [None]:
# Accumulator for the per-batch prediction arrays; re-run this cell before re-running the
# prediction cells, otherwise the new batches are appended to the old ones.
batches = []

In [None]:
# Predict all complete batches; a trailing partial batch is not covered by this loop.
full_batch_count = len(X_padded) // batch_size
for batch_no in tqdm(range(full_batch_count)):
    start = batch_no * batch_size
    batches.append(model.predict(X_padded[start:start + batch_size]))

In [None]:
# Predict the rows left over after the complete batches.
# When the number of rows is an exact multiple of batch_size there is nothing left, and
# calling model.predict on an empty array is avoided.
remaining_rows = X_padded[len(batches) * batch_size:]
if len(remaining_rows) > 0:
    batches.append(model.predict(remaining_rows))

In [None]:
# Flatten the per-batch arrays into one list with a single prediction vector per review.
predicted_y = [x for l in batches for x in l]

In [None]:
# Translate each prediction vector into the colour name of its highest-scoring output;
# the name order matches the column order used to build `y`.
reviews["guessed_color"] = np.array(["Red", "White", "Rose"])[np.argmax(predicted_y, axis=1)]

# Визуализация исходной и восстановленной онтологии

- Определить исходный цвет 
- Определить угаданный цвет

In [74]:
# Distinct ontology wines that occur in the training split.
train_wines = train["wine_from_ontology"].unique()

In [76]:
# Distinct ontology wines that occur in the test split.
test_wines = test["wine_from_ontology"].unique()

In [None]:
# Per ontology wine: share of its reviews that were predicted Red and predicted White.
guessed_color_matching = (
    pd.get_dummies(reviews, columns=["guessed_color"])
    [["wine_from_ontology", "guessed_color_Red", "guessed_color_White"]]
    .groupby("wine_from_ontology")
    .mean()
)

In [None]:
# Per ontology wine: share of its reviews whose ontology colour is Red and White.
color_matching = (
    pd.get_dummies(reviews, columns=["color"])
    [["wine_from_ontology", "color_Red", "color_White"]]
    .groupby("wine_from_ontology")
    .mean()
)

In [None]:
# Graph nodes: one per wine. Wines from the test split are additionally marked "new".
# The comprehensions iterate over train_wines / test_wines; the name `wine` used before
# is not defined anywhere in the notebook.
train_nodes = [{"id": w, "type": "wine"} for w in train_wines]
test_nodes = [{"id": w, "type": "wine", "new": True} for w in test_wines]
nodes = train_nodes + test_nodes

In [None]:
# Add the two colour values as feature nodes. Re-running this cell appends them again.
nodes += [{"id": "Red", "type": "feature"}, {"id": "White", "type": "feature"}]

In [None]:
# Edge list of the graph, filled by the two loops below.
links = []

In [None]:
# Link every wine to its dominant ontology colour; wines with neither Red nor White are skipped.
for i, r in color_matching.iterrows():
    if (r.sum() > 0):
        links.append({
            "source": i,
            # idxmax() returns the column label of the largest share; np.argmax on a Series
            # yields an integer position in current pandas, which has no .replace().
            "target": r.idxmax().replace("color_", ""),
            "type": "original"
        })

In [None]:
# Link every wine to the colour predicted most often for its reviews (Red or White only).
# NOTE(review): a wine with zero Red and zero White predictions still gets a link to the
# first column (Red); the loop over the ontology colours skips such rows — confirm intent.
for i, r in guessed_color_matching.iterrows():
    links.append({
        "source": i,
        # idxmax() returns the column label of the largest share; np.argmax on a Series
        # yields an integer position in current pandas, which has no .replace().
        "target": r.idxmax().replace("guessed_color_", ""),
        "type": "guessed"
    })

In [None]:
# Node-link representation of the original and the reconstructed colour relations.
graph = {
    "nodes": nodes,
    "links": links
}

In [None]:
# Serialise the graph to a JSON string; it is shown as the cell output and not written to disk.
json.dumps(graph)