Добавление нового признака:
- Составить выборку bag-of-words
- Составить классификаторы для каждого признака
- Получить векторы признаков заданных изначально классификаторов
- Привести матрицу к треугольному виду
- Выделить часть, которая не объясняется в базисе признаков
- Применить однокомпонентный PCA
- Получить классификатор для него
- Сравнить с тем, который реально есть для отсутствующего признака

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import numpy as np
from tqdm import *
from sklearn.externals import joblib
import scipy.sparse as sps

# Формирование выборки

In [2]:
# Load the reviews table from a CSV file located in the working directory.
reviews = pd.read_csv("reviews_with_wine_features.csv")

In [3]:
import ast

# Each cell is parsed from its text form and the resulting items are joined
# into one space-separated string (presumably the cell is the repr of a list
# of tokens -- confirm against the CSV).
# ast.literal_eval accepts Python literals only, so a crafted CSV cell cannot
# execute arbitrary code the way the previous eval() call could.
reviews["filtered_description"] = reviews["filtered_description"].apply(
    lambda tokens_repr: " ".join(ast.literal_eval(tokens_repr))
)

In [4]:
def get_bag_of_words(texts, vectorizer=None):
    """Turn texts into a bag-of-words matrix.

    Args:
        texts: Collection of documents handed to the vectorizer.
        vectorizer: Already fitted vectorizer to reuse.  When ``None`` a new
            ``CountVectorizer`` is created and fitted on ``texts``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the transformed texts and the
        vectorizer that produced them.
    """
    # Compare with None explicitly: a valid vectorizer object may be falsy
    # (e.g. anything defining __len__) and must not be silently replaced.
    if vectorizer is None:
        vectorizer = CountVectorizer()
        # fit_transform learns the vocabulary and builds the matrix in a
        # single pass instead of tokenising the whole corpus twice.
        return vectorizer.fit_transform(texts), vectorizer
    return vectorizer.transform(texts), vectorizer

In [5]:
# One-hot encode every column whose name contains "_feature".
features = [column for column in reviews.columns if "_feature" in column]
reviews = pd.get_dummies(data=reviews, columns=features)

In [6]:
# Re-collect the "_feature" columns of the current frame, leaving out every
# column whose name mentions "madeFromGrape".
features = [
    column
    for column in reviews.columns
    if "_feature" in column and "madeFromGrape" not in column
]

In [7]:
# Keep the feature columns whose mean value exceeds 0.01, selected with a
# boolean mask over the column names.
big_enough_features = np.array(features)[
    (reviews[features].mean() > 0.01).values
]

In [8]:
# Display the names of the features that passed the frequency threshold.
big_enough_features

array(['locatedIn_feature_Alsace', 'locatedIn_feature_Bordeaux',
       'locatedIn_feature_Bourgogne', 'locatedIn_feature_California',
       'locatedIn_feature_Italy', 'locatedIn_feature_Portugal',
       'locatedIn_feature_US', 'hasSugar_feature_Dry',
       'hasSugar_feature_OffDry', 'hasSugar_feature_Sweet',
       'hasBody_feature_Full', 'hasBody_feature_Light',
       'hasBody_feature_Medium', 'hasFlavor_feature_Delicate',
       'hasFlavor_feature_Moderate', 'hasFlavor_feature_Strong',
       'hasColor_feature_Red', 'hasColor_feature_White'],
      dtype='<U32')

In [9]:
# Hand-picked target features.  The order is significant: classifiers are
# later addressed by position, with the two colour features kept last.
selected_features = [
    # sugar
    'hasSugar_feature_Dry',
    'hasSugar_feature_OffDry',
    'hasSugar_feature_Sweet',
    # body
    'hasBody_feature_Full',
    'hasBody_feature_Light',
    'hasBody_feature_Medium',
    # flavour
    'hasFlavor_feature_Delicate',
    'hasFlavor_feature_Moderate',
    'hasFlavor_feature_Strong',
    # colour
    'hasColor_feature_Red',
    'hasColor_feature_White',
]

In [10]:
# One target column per selected feature, in the same order.
valid_classes = [reviews[feature] for feature in selected_features]

# Подготовка классификаторов

In [11]:
def train_classifier(X, y):
    """Grid-search the ``C`` parameter of a logistic regression.

    Five evenly spaced values of ``C`` between 0.01 and 1 are compared using
    3-fold cross-validation scored by ROC AUC.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid={'C': np.linspace(0.01, 1, 5)},
        cv=3,
        scoring='roc_auc',
        verbose=True,
    )
    search.fit(X, y)
    return search

In [12]:
# Bag-of-words matrix of all review texts together with its vectorizer.
X, vectorizer = get_bag_of_words(texts=reviews["filtered_description"])
# Collects one fitted estimator per target.
classifiers = []

In [None]:
# Run one grid search per target column, print its best cross-validated
# score and keep the corresponding best estimator.
# NOTE(review): `classifiers` is created in a different cell, so re-running
# only this cell appends a second set of estimators to the same list.
for y in tqdm(valid_classes):
    grid_search = train_classifier(X, y)
    print(grid_search.best_score_)
    classifiers.append(grid_search.best_estimator_)

In [26]:
# Save the list of fitted classifiers to 'classifiers.pkl'.
joblib.dump(classifiers, 'classifiers.pkl')

['classifiers.pkl']

In [13]:
# Replace `classifiers` with the list stored in 'classifiers.pkl'.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
classifiers = joblib.load('classifiers.pkl')

# Составление базиса

Исключение признаков «красный/белый» из базиса: их классификаторы (последние два в списке) играют роль отсутствующего признака

In [14]:
# Keep every classifier except the last two.
# NOTE(review): this relies on the two colour features
# ('hasColor_feature_Red', 'hasColor_feature_White') being the last entries of
# selected_features -- confirm the loaded list was trained in that order.
selected_classifiers = classifiers[:-2]

In [15]:
# One row per selected classifier: the first row of its coefficient matrix.
features_vectors = np.array(
    [model.coef_[0] for model in selected_classifiers]
)

In [16]:
# np.triu only zeroes the entries below the diagonal: it is not a reduction to
# triangular form and it changes the subspace spanned by the rows, so the
# projection computed from it no longer describes the classifier vectors.
# A QR factorisation of the transposed matrix instead yields orthonormal
# columns spanning the row space of features_vectors (assuming its rows are
# linearly independent); the triangular factor is not needed afterwards.
# The result keeps the same (n_vectors, n_columns) layout as before.
orthonormal_columns, _triangular_factor = np.linalg.qr(features_vectors.T)
features_basis = orthonormal_columns.T

In [17]:
# Moore-Penrose pseudo-inverse of the basis matrix.
inverted_features_basis = np.linalg.pinv(features_basis)

In [None]:
# Part of X explained by the basis: X @ B.T @ pinv(B).T, i.e. X multiplied by
# the projector pinv(B) @ B onto the span of the basis rows.
# The result is bound to a name instead of being read back through IPython's
# `_` (last displayed output), which silently picks up the wrong value when
# any other cell is evaluated in between and does not exist outside IPython.
explained_X = (X * sps.csr_matrix(features_basis).T) * sps.csr_matrix(inverted_features_basis).T
# Residual that the existing feature vectors cannot account for.
unexplained_X = X - explained_X

# Составление нового признака и сравнение с настоящими значениями

In [None]:
# Fit a one-component PCA on the residual matrix; the transformed data is
# kept as the values of the new feature and the fitted component as its
# coefficient vector.
# NOTE(review): unexplained_X looks like a SciPy sparse matrix (it is built
# from sparse products) and scikit-learn's PCA rejected sparse input before
# release 1.4 -- confirm the installed version or densify first.
pca = PCA(n_components=1)
new_feature_values = pca.fit_transform(unexplained_X)
new_feature_coefficients = pca.components_

In [None]:
# new_feature_values is a NumPy array, which has no pandas-style .shift(), so
# the consecutive differences are taken with np.diff instead.
# The values are flattened and sorted first so that "consecutive" means
# neighbouring on the projection axis rather than neighbouring rows.
# NOTE(review): sorting is an assumption about the intent -- confirm.
sorted_feature_values = np.sort(np.ravel(new_feature_values))
feature_value_gaps = np.diff(sorted_feature_values)
# Gap i lies between sorted values i and i + 1, which matches the indexing
# used below (the shift-based version pointed at the end of the gap instead).
new_feature_bias_index = int(feature_value_gaps.argmax())
# NOTE(review): as written this evaluates to the upper edge of the widest gap;
# if the midpoint was intended the gap has to be halved -- confirm.
new_feature_bias = sorted_feature_values[new_feature_bias_index] + (
    sorted_feature_values[new_feature_bias_index + 1]
    - sorted_feature_values[new_feature_bias_index]
)

Сравнение осуществляется через классификацию и просмотр результатов

In [None]:
# Build a linear classifier by hand from the PCA direction instead of fitting.
# The variable used to be created as `new_classifer` (typo), so every
# following line raised NameError.
new_classifier = LogisticRegression()
new_classifier.coef_ = new_feature_coefficients
# NOTE(review): the decision function is X @ coef_.T + intercept_, while the
# bias was derived as a threshold on mean-centred PCA projections; the sign
# and the centring offset probably need adjusting -- confirm.
new_classifier.intercept_ = new_feature_bias
# predict() looks the result up in classes_, which is normally set by fit();
# reuse the label set of the real classifier so both outputs are comparable.
new_classifier.classes_ = classifiers[-1].classes_
# NOTE(review): despite the variable names, predict() returns class labels,
# not probabilities.
new_feature_probabilities = new_classifier.predict(X)
# The last classifier in the list serves as the reference for the comparison.
real_feature_probabilities = classifiers[-1].predict(X)

In [None]:
# Per-sample disagreement between the constructed and the real classifier.
# Both sides are cast to a signed integer type first: the labels originate
# from one-hot columns, whose dtype is unsigned 8-bit or boolean depending on
# the pandas version, and subtracting those either wraps around
# (0 - 1 == 255) or raises a TypeError.
new_feature_probabilities.astype(int) - real_feature_probabilities.astype(int)