In [39]:
import pandas as pd
import spacy
from gensim.utils import simple_preprocess
import numpy
import re
import pprint
import requests
from bs4 import BeautifulSoup
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# NLTK's English stop words, extended with extra words that this notebook also
# filters out of the wine descriptions. The extension previously listed 'fine'
# twice; it now appears once (membership tests are unaffected).
stop_words = stopwords.words('english')
stop_words.extend([
    'good', 'fresh', 'great', 'new', 'ready', 'firm', 'young', 'attractive', 'fine',
    'clean', 'little', 'currant', 'open', 'long', 'tight', 'concentrated', 'lively',
])
from tqdm import tqdm
tqdm.pandas()

wine_data = pd.read_csv("../wine_data_w_clean.csv")

In [2]:
# The "clean" column of the loaded wine data, selected by explicit column label.
clean_desc = wine_data['clean']

In [3]:
clean_desc

0       chapoutiers selections   best parcels  vines  ...
1       aromas lean toward black cherry chocolate  her...
2       certainly one   successes   vintage packed  ca...
3       plush smooth  packed tight  complex flavors bl...
4       even though  42 months  wine  spent  new wood ...
                              ...                        
7185     lick  vanilla precedes red berries   nose   p...
7186    faint notes  red currant  rhubarb appear   nos...
7187    mixed berries cracked pepper  baking spices ma...
7188     copper rose hue    strong phenolic grip   bur...
7189    steely  sharp yet delicately sweet  mediumbodi...
Name: clean, Length: 7190, dtype: object

In [4]:
def remove_stopwords(docs):
    """Drop stop words and tokens shorter than three characters from each document.

    Every item of ``docs`` is passed through ``str()`` and re-tokenized with
    gensim's ``simple_preprocess``; a token list is therefore turned into its
    string representation and tokenized again.

    Parameters
    ----------
    docs : iterable
        Documents (strings or token lists).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Build the lookup set once per call: `stop_words` is a list, so testing each
    # token against it directly is a linear scan per token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc))
         if len(word) >= 3 and word not in blocked]
        for doc in tqdm(docs)
    ]

def tokenization(docs):
    """Lazily tokenize documents with gensim's ``simple_preprocess``.

    Each document is converted with ``str()`` and tokenized with ``deacc=True``.
    This is a generator: one token list is yielded per input document, and a
    tqdm progress bar tracks consumption of ``docs``.
    """
    progress = tqdm(docs)
    yield from (simple_preprocess(str(raw_doc), deacc=True) for raw_doc in progress)

def lemmatize(docs):
    """Lemmatize every token of every document with NLTK's ``WordNetLemmatizer``.

    Takes a list of documents, each of which is a list of tokens, and returns a
    list of the same shape holding the lemmatized tokens.
    """
    wordnet = WordNetLemmatizer()
    lemmatized_docs = []
    for tokens in tqdm(docs):
        lemmatized_docs.append(list(map(wordnet.lemmatize, tokens)))
    return lemmatized_docs

# Loaded spaCy pipelines, keyed by model name, so the model is read from disk once
# per kernel session rather than on every allow_postages() call.
_SPACY_PIPELINES = {}


def _load_spacy(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def allow_postages(docs, allowed_postags=('ADJ',)):
    """Keep only the lemmas of tokens whose part-of-speech tag is allowed.

    Each document's tokens are joined with spaces, parsed with spaCy's
    ``en_core_web_sm`` pipeline, and filtered on ``token.pos_``.

    Parameters
    ----------
    docs : iterable of list of str
        Tokenized documents.
    allowed_postags : collection of str, optional
        POS tags to keep (e.g. 'ADV', 'ADJ', 'PROPN', 'VERB'). Defaults to
        adjectives only. The default is a tuple so no mutable object is shared
        between calls.

    Returns
    -------
    list of list of str
        For every input document, the lemmas of the tokens that were kept.
    """
    nlp = _load_spacy()
    texts = [" ".join(tokens) for tokens in docs]
    texts_out = []
    # nlp.pipe processes the texts as a stream, which spaCy handles in batches
    # instead of one full pipeline invocation per document.
    for parsed in tqdm(nlp.pipe(texts), total=len(texts)):
        texts_out.append([token.lemma_ for token in parsed if token.pos_ in allowed_postags])
    return texts_out

In [5]:
# Plain Python list of the "clean" description strings, one entry per wine.
data = wine_data['clean'].tolist()

In [6]:
# Preprocessing pipeline over the raw description strings in `data`.
# Every stage reassigns `data_words`, so the stages must run in this order and the
# cell must be re-run from the top (with `data` defined) to reproduce the result.

# 1. Tokenize each description (gensim simple_preprocess with deacc=True).
print("Starting Tokenization")
data_words = list(tokenization(data))
# 2. Lemmatize every token with NLTK's WordNetLemmatizer.
print("Starting Lemmatization")
data_words = lemmatize(data_words)
# 3. Drop stop words and tokens shorter than three characters.
print("Starting Remove Stopwards")
data_words = remove_stopwords(data_words)
# 4. Keep only lemmas whose spaCy POS tag is allowed (adjectives by default).
print("Starting Allowed Postages")
data_words = allow_postages(data_words)

# Uncomment to inspect the first 30 processed documents:
#data_words[:30]


Starting Tokenization


100%|██████████| 7190/7190 [00:00<00:00, 12400.70it/s]


Starting Lemmatization


100%|██████████| 7190/7190 [00:02<00:00, 3037.50it/s]


Starting Remove Stopwards


100%|██████████| 7190/7190 [00:00<00:00, 8314.83it/s]


Starting Allowed Postages


100%|██████████| 7190/7190 [00:49<00:00, 146.32it/s]


In [7]:
# Flatten the per-document token lists into one list of words, then wrap it in a
# single-column frame so the words can be counted with groupby.
data_flat = [token for doc_tokens in data_words for token in doc_tokens]
wine_descriptives = pd.DataFrame({"descriptions": data_flat})

In [8]:
# Count how often each descriptive word occurs, most frequent first, as a frame
# with the columns 'descriptions' and 'count'.
descriptions = (
    wine_descriptives
    .groupby('descriptions')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='count')
)

In [9]:
# Keep only descriptors that occur more than MIN_DESCRIPTOR_COUNT times.
# `descriptions` is sorted by count in descending order, so .iloc[1:] drops the
# single most frequent word.
# NOTE(review): the reason for skipping the top word is not recorded here - confirm.
MIN_DESCRIPTOR_COUNT = 273

# .copy() makes code_book an independent frame, so adding a column below does not
# write to a slice of `descriptions` (pandas SettingWithCopyWarning).
code_book = descriptions[descriptions['count'] > MIN_DESCRIPTOR_COUNT].iloc[1:].copy()

# Number the descriptors 0..n-1 in frequency order. The length is derived from the
# frame instead of a hard-coded 20, so the cell still runs if the threshold or the
# data changes.
code_book['index'] = range(len(code_book))

# Mapping descriptor word -> position in the encoded vector.
code_index = code_book.set_index('descriptions').to_dict()['index']
code_index

{'black': 0,
 'rich': 1,
 'dry': 2,
 'sweet': 3,
 'soft': 4,
 'crisp': 5,
 'full': 6,
 'bright': 7,
 'red': 8,
 'dark': 9,
 'juicy': 10,
 'green': 11,
 'light': 12,
 'herbal': 13,
 'tannic': 14,
 'smooth': 15,
 'earthy': 16,
 'dense': 17,
 'elegant': 18,
 'white': 19}

In [10]:
def encoding(x, index=None):
    """Encode a space-separated description as a comma-joined 0/1 indicator string.

    Parameters
    ----------
    x : str
        Words separated by single spaces.
    index : dict, optional
        Mapping word -> slot position. Defaults to the notebook-level
        ``code_index``.

    Returns
    -------
    str
        ``"0"``/``"1"`` flags joined by commas; slot ``index[word]`` is ``"1"``
        when ``word`` occurs in ``x``. The number of slots is the largest
        position in ``index`` plus one (20 for the 20-word ``code_index``),
        rather than a hard-coded 20.
    """
    if index is None:
        index = code_index
    # Size the vector from the mapping so it always has a slot for every position.
    width = max(index.values(), default=-1) + 1
    flags = ['0'] * width
    for word in x.split(" "):
        if word in index:
            flags[int(index[word])] = '1'
    return ",".join(flags)

# Join each document's kept tokens back into one space-separated string, then
# encode which code-book words that string contains.
wine_data['cleaned'] = [" ".join(tokens) for tokens in data_words]
# Apply column-wise: DataFrame.apply(axis=1) would build a Series for every row
# just to read this one field.
wine_data['encoded'] = wine_data['cleaned'].apply(encoding)

In [11]:
# Number of wines per grape variety, as a frame with columns 'variety' and 'count'.
variety = wine_data.groupby('variety').size().reset_index(name="count")
variety



Unnamed: 0,variety,count
0,Agiorgitiko,2
1,Aglianico,22
2,Albana,3
3,Albariño,22
4,Albarossa,1
...,...,...
292,Xarel-lo,1
293,Xinomavro,1
294,Zierfandler,1
295,Zinfandel,166


TODO:
----
OK, so finding the color might be a bit tricky. The best idea so far is to scrape it from a Google search using google.com/search?q=<insertwinehere\>+wine+color,
then build a regex string of all the wine types and pick the type that is mentioned the most.

Expected roadblocks:
Google might not be thrilled about a single IP making requests that quickly, so we might have to add some
wait time in between requests. I'm not great at scraping, so that is an issue too.


In [12]:
def find_color(string, timeout=10):
    """Look up ``string`` with the DuckDuckGo Instant Answer API and return its abstract.

    Despite the name, this returns the free-text ``Abstract`` field of the
    response, not a parsed color.

    Parameters
    ----------
    string : str
        Search query (e.g. a grape variety name).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled connection
        cannot hang the notebook indefinitely.

    Returns
    -------
    str
        The ``Abstract`` text, or ``""`` when the response has no such field.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    r = requests.get(
        "https://api.duckduckgo.com",
        params={
            "q": string,
            "format": "json"
        },
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json().get('Abstract', '')

In [27]:
# Work on a copy: `test = variety` would only alias the frame, so the columns added
# below would silently appear on `variety` as well.
test = variety.copy()
# One web lookup per variety; this is the slow, network-bound step of the cell.
test['abstract'] = [find_color(vary) for vary in tqdm(test['variety'])]

# Run the abstracts through the same text pipeline used for the wine descriptions.
abstract = list(tokenization(test['abstract']))
print("Starting Lemmatization")
abstract = lemmatize(abstract)
print("Starting Remove Stopwards")
abstract = remove_stopwords(abstract)
print("Starting Allowed Postages")
abstract = allow_postages(abstract)

test['abs_cleaned'] = [" ".join(i) for i in abstract]

100%|██████████| 297/297 [01:46<00:00,  2.80it/s]
100%|██████████| 297/297 [00:00<00:00, 6006.31it/s]


Starting Lemmatization


100%|██████████| 297/297 [00:00<00:00, 5558.47it/s]


Starting Remove Stopwards


100%|██████████| 297/297 [00:00<00:00, 5582.91it/s]


Starting Allowed Postages


100%|██████████| 297/297 [00:01<00:00, 165.34it/s]


In [28]:
test

Unnamed: 0,variety,count,abstract,abs_cleaned
0,Agiorgitiko,2,Agiorgitiko is a red Greek wine grape variety ...,red greek red xynomavro attiki important indig...
1,Aglianico,22,Aglianico is a black grape grown in the southe...,black southern sangiovese great italian south ...
2,Albana,3,,
3,Albariño,22,Albariño or Alvarinho is a variety of white wi...,varietal white albarino galician twelfth recent
4,Albarossa,1,Albarossa is a red Italian wine grape variety ...,albarossa red italian nebbiolo nebbiolo old fr...
...,...,...,...,...
292,Xarel-lo,1,Xarel·lo is a white grape variety of Spanish o...,spanish faux spanish traditional spanish aroma...
293,Xinomavro,1,Xinomavro is the principal red wine grape of t...,red amyntaio trikomo less global worldwide ini...
294,Zierfandler,1,Zierfandler is a grape variety used to make wh...,white red single varietal elegant sweet almond...
295,Zinfandel,166,Zinfandel is a variety of black-skinned wine g...,black skinned equivalent croatian primitivo mi...


In [40]:
def find_designation_query(string, query="", retries=3, backoff=1.0, timeout=10):
    """Search DuckDuckGo's Instant Answer API for ``string`` plus optional extra terms.

    Parameters
    ----------
    string : str
        Main search term (e.g. a wine designation).
    query : str, optional
        Extra search terms appended after ``string``, separated by a space.
        Previously the two were concatenated with no separator, and an empty
        ``query`` made the function return ``None`` without searching; an empty
        ``query`` now searches for ``string`` alone.
    retries : int, optional
        Number of additional attempts after a connection error or timeout.
    backoff : float, optional
        Base delay in seconds between attempts; it doubles after every failure.
    timeout : float, optional
        Seconds to wait for the server on each attempt.

    Returns
    -------
    str
        The ``Abstract`` text of the response, or ``""`` when it has none.

    Raises
    ------
    requests.exceptions.RequestException
        If the last attempt still fails, or on an HTTP error status.
    """
    import time  # local import: only needed for the retry delay

    search_terms = f"{string} {query}".strip()
    for attempt in range(retries + 1):
        try:
            r = requests.get(
                "https://api.duckduckgo.com",
                params={
                    "q": search_terms,
                    "format": "json"
                },
                timeout=timeout,
            )
            r.raise_for_status()
            return r.json().get('Abstract', '')
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # A single dropped connection should not abort a multi-thousand-request
            # run; re-raise only once the retry budget is used up.
            if attempt == retries:
                raise
            time.sleep(backoff * 2 ** attempt)
# Number of wines per designation, as a frame with columns 'designation' and 'count'.
test_designation = wine_data.groupby('designation').size().reset_index(name="count")
# Store the looked-up abstracts on the frame: without the assignment, the result of
# this long, network-bound scrape is discarded as soon as the cell finishes.
test_designation['abstract'] = test_designation['designation'].progress_apply(
    lambda designation: find_designation_query(designation, query='wine color')
)
test_designation['abstract']

 45%|████▍     | 2374/5298 [22:41<27:56,  1.74it/s] 


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))