In [1]:
# imports
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

# Specialised libraries
from sklearn import ensemble
from sklearn.model_selection import train_test_split, cross_val_score, \
    GridSearchCV, RandomizedSearchCV

# Metrics
from sklearn.metrics import precision_score, \
    accuracy_score, \
    f1_score, \
    roc_auc_score, \
    roc_curve, \
    auc, \
    confusion_matrix, \
    mean_squared_error, \
    r2_score

import dill

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

C:\Users\Admin\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\Admin\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
# --- Load the data ----------------------------------------------------------
# Directory with the two wine-review CSV files, relative to this notebook.
path = '../../dataset/'
wine_150k = pd.read_csv(path + 'winemag-data_first150k.csv', index_col=0)
wine_130k = pd.read_csv(path + 'winemag-data-130k-v2.csv', index_col=0)

# Stack both files once. `wine_full` keeps every column and is never modified;
# the modelling frame `wine` is derived from it below, so a second identical
# pd.concat is not needed.
wine_full = pd.concat([wine_150k, wine_130k], axis=0)
print('Dataset loaded')

# --- Feature engineering ----------------------------------------------------
# Columns kept for modelling; 'points' is split off as the target further down.
features = ['country', 'description', 'designation', 'points', 'price', 'province', 'region_1',
            'variety', 'winery']

N_TOP_COUNTRIES = 12   # number of most frequent countries to keep
N_TOP_PROVINCES = 50   # number of most frequent provinces to keep (ranked after the country filter)
# 100,000 rows is possible, but training then takes roughly 1 hour.
N_MODEL_ROWS = 10000

wine = wine_full[features]

# value_counts() orders labels by descending frequency, so the first N index
# entries are the N most common values.
cnt_idx = wine['country'].value_counts().index[:N_TOP_COUNTRIES]
wine = wine.loc[wine['country'].isin(list(cnt_idx))]

# Replace the 'country' column with one indicator column per kept country.
wine = pd.concat([wine.drop('country', axis=1),
                  pd.get_dummies(wine['country'], prefix='country')], axis=1)

# Provinces are ranked on the already country-filtered rows.
pr_idx = wine['province'].value_counts().index[:N_TOP_PROVINCES]
wine = wine.loc[wine['province'].isin(list(pr_idx))]

# Replace the 'province' column with one indicator column per kept province.
wine = pd.concat([wine.drop('province', axis=1),
                  pd.get_dummies(wine['province'], prefix='province')], axis=1)

print('Feature engineering complete')

# --- Train / test split -----------------------------------------------------
# Only the first N_MODEL_ROWS rows are used (head, not a random sample).
wine_short = wine.head(N_MODEL_ROWS)
X_train, X_test, y_train, y_test = train_test_split(wine_short.drop('points', axis=1),
                                                    wine_short['points'], test_size=0.33, random_state=42)


Dataset loaded
Feature engineering complete


In [3]:
# Load the previously trained model that was serialized with dill.
# NOTE(security): dill.load, like pickle.load, can execute arbitrary code while
# deserializing — only load model files that come from a trusted source.
# NOTE(review): the file name suggests a random-forest regressor although the
# variable is called `classifier_load` — confirm.
with open('../../model_rand_for_reg.dill', 'rb') as in_strm:
    classifier_load = dill.load(in_strm)
    


In [4]:
# Run the loaded model on the held-out split and print the predictions.
print('Start prediction')
# `y_score` holds one prediction per row of X_test (presumably the predicted
# 'points' value, since that column was used as the target in the split).
y_score = classifier_load.predict(X_test)
print(y_score)

Start prediction
[91.92 88.59 88.61 ... 85.9  86.92 91.24]


In [5]:
# Sample request used to try the model on a single wine. get_request() reads
# exactly these keys: the six raw fields are passed through as columns, while
# 'country' and 'province' are turned into indicator columns.
# NOTE(review): the country / province / region values look like an arbitrary
# smoke-test combination rather than a real wine — confirm if that matters.
request_line = {'description': 'This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.',
                 'designation': 'Carodorum Selección Especial Reserva',
                 'price': 155, 
                 'region_1': 'Colchagua Valley', 
                 'variety': 'Nerello Mascalese', 
                 'winery': 'Bodega Carmen Rodríguez', 
                 'country': 'Italy', 
                 'province': 'Burgundy'}

In [6]:
# The category lists used to build a request's indicator columns are read from
# CSV files. The commented-out lines are the one-off export that produced them:
# the 12 most frequent countries and 50 most frequent provinces of `wine_full`.
# NOTE(review): the training frame ranks provinces AFTER filtering to the top
# countries, whereas this export ranks them on the unfiltered `wine_full`, so
# the province list may differ from the model's columns — confirm.
# countries = wine_full['country'].value_counts().to_frame()[:12].index
# pd.Series(countries.values).to_csv('countries.csv', index=False)

# Column '0' is presumably the header pandas wrote for the unnamed Series.
countries = pd.read_csv('countries.csv')['0']

# provinces = wine_full['province'].value_counts().to_frame()[:50].index
# pd.Series(provinces.values).to_csv('provinces.csv', index=False)

# Same layout as countries.csv: a single column labelled '0'.
provinces = pd.read_csv('provinces.csv')['0']

In [7]:
def get_request(r_l, country_names=None, province_names=None):
    """Turn one request dict into a single-row feature DataFrame.

    The six raw fields are copied through unchanged, followed by one 0/1
    indicator column per known country (``country_<name>``) and per known
    province (``province_<name>``), in the order the names are supplied.

    Parameters
    ----------
    r_l : dict
        Request with the keys 'description', 'designation', 'price',
        'region_1', 'variety', 'winery', 'country' and 'province'.
    country_names : iterable of str, optional
        Countries to create indicator columns for. Defaults to the
        notebook-level ``countries`` series.
    province_names : iterable of str, optional
        Provinces to create indicator columns for. Defaults to the
        notebook-level ``provinces`` series.

    Returns
    -------
    pandas.DataFrame
        One row (index 0). A country/province that is not in the supplied
        names yields all-zero indicator columns for that group.

    Raises
    ------
    KeyError
        If a required key is missing from ``r_l``.
    """
    # Fall back to the lists loaded at notebook level so existing
    # single-argument calls keep working.
    if country_names is None:
        country_names = countries
    if province_names is None:
        province_names = provinces

    raw_fields = ('description', 'designation', 'price', 'region_1', 'variety', 'winery')

    # Build the whole row as one ordered record and create the frame once,
    # instead of enlarging an empty frame and inserting columns one by one.
    record = {field: r_l[field] for field in raw_fields}
    record.update(
        {'country_' + name: int(r_l['country'] == name) for name in country_names}
    )
    record.update(
        {'province_' + name: int(r_l['province'] == name) for name in province_names}
    )

    # NOTE(review): column order follows the supplied name lists; confirm the
    # loaded model selects features by name rather than by position.
    return pd.DataFrame([record])

In [8]:
# Convert the sample request into a one-row feature frame and predict with the
# loaded model. (`reqest_predict` is the existing spelling; the name is kept.)
reqest_predict = classifier_load.predict(get_request(request_line))
# Bare last expression so the notebook displays the prediction array.
reqest_predict

array([91.75])