# Machine Learning - Wine Data

In [1]:
# Data handling.
import pandas as pd
import re, datetime
# NLTK provides the tokenizer and the English stop word list applied to the descriptions.
import nltk
from nltk.tokenize import word_tokenize
# One-time setup: uncomment to download the 'punkt' resource (presumably required by word_tokenize on a fresh install).
#nltk.download('punkt')
from nltk.corpus import stopwords
# One-time setup: uncomment to download the stop word corpus read on the next line.
#nltk.download('stopwords')
# Mutable list of English stop words; a later cell appends extra entries to it before filtering tokens.
stop = stopwords.words('english')
# string.punctuation is used when stripping punctuation from the token columns.
import string

### Basic Cleaning

In [2]:
# Load the raw wine review export; the path is relative to the notebook's folder.
raw_reviews_path = '../Data/winemag-data-130k-v2.csv'
all_wines = pd.read_csv(raw_reviews_path)

Dropping the columns we don't need, then dropping every row that still has a missing value in any of the remaining columns.

In [3]:
# Columns that carry no signal for the analysis (CSV row counter, Twitter handle, secondary region).
unused_columns = ['Unnamed: 0', 'taster_twitter_handle', 'region_2']

# Remove those columns, then keep only fully populated rows:
# dropna() with its defaults discards a row if ANY remaining column is missing.
clean_wines = all_wines.drop(columns=unused_columns).dropna()

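Translating regional variety names (German, Spanish, Italian and Portuguese) into the more common international names, so that the same grape is not counted under two labels.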

In [4]:
# Regional variety name -> the name it is merged into.
# A single dict-based replace is equivalent to six chained calls here because
# none of the target names is itself one of the keys.
# NOTE(review): Weissburgunder is the German name for Pinot Blanc, not Chardonnay.
# Confirm the intended target; if it changes, the white list used for the
# `type` column must include the new name as well.
variety_aliases = {
    'Weissburgunder': 'Chardonnay',
    'Spatburgunder': 'Pinot Noir',
    'Grauburgunder': 'Pinot Gris',
    'Garnacha': 'Grenache',
    'Pinot Nero': 'Pinot Noir',
    'Alvarinho': 'Albarino',
}
clean_wines['variety'] = clean_wines['variety'].replace(variety_aliases)

Extracting the vintage from the title, creating a new column for the year, then dropping the title column due to redundant information.

In [5]:
# Take the first run of four consecutive digits in each title as the vintage;
# titles without such a run get a missing value.
# - The pattern is a raw string: in a normal string literal `\d` is an invalid
#   escape sequence, which current Python versions warn about.
# - expand=False returns a Series for the single capture group, which is what
#   a column assignment expects (expand=True returns a one-column DataFrame).
# NOTE(review): the first four-digit run is not guaranteed to be the vintage
# (a number in a winery or cuvée name would be picked up first) — verify on the data.
clean_wines['vintage'] = clean_wines['title'].str.extract(r'(\d{4})', expand=False)

# The title is dropped once the vintage has been extracted from it.
clean_wines = clean_wines.drop(columns='title')

Creating new column for type of wine (i.e., white and red).

In [6]:
# Varieties labelled as white. Every variety NOT in this list is labelled
# 'red', which includes rosé and any white variety that is not listed here.
# The names with umlauts are written as real Unicode text: the previously
# mis-encoded spellings ('Ã¼' in place of 'ü') can never equal values read
# from a UTF-8 file, so those wines were being labelled red.
white_varieties = [
    'Chardonnay', 'Riesling', 'Sauvignon Blanc', 'White Blend', 'Sparkling Blend', 'Pinot Gris',
    'Champagne Blend', 'Grüner Veltliner', 'Pinot Grigio', 'Portuguese White', 'Viognier',
    'Gewürztraminer',
]

# Vectorised membership test instead of a Python-level loop over the column.
is_white = clean_wines['variety'].isin(white_varieties)
clean_wines['type'] = is_white.map({True: 'white', False: 'red'})

Creating a new column with the word count of each description; it may turn out to correlate with other features later on.

In [7]:
# Split each description on whitespace and store how many pieces it yields.
# Letter case has no effect on the count, so the text is not lower-cased first.
description_words = clean_wines['description'].str.split()
clean_wines['word count'] = description_words.str.len()

Creating a new feature that is easy to feed to the model: a column holding the age of each wine in years, measured from 2020. Rows whose title contained no vintage are dropped first.

In [8]:
# Year that wine ages are measured against.
REFERENCE_YEAR = 2020

# Keep only rows where a vintage was found; .copy() gives an independent frame
# so the column assignments below do not write into a slice of the old one.
has_vintage = clean_wines['vintage'].notna()
clean_wines = clean_wines.loc[has_vintage].copy()

# The extracted vintage is text; convert it to an integer before subtracting.
clean_wines['vintage'] = clean_wines['vintage'].astype(int)
clean_wines['age'] = REFERENCE_YEAR - clean_wines['vintage']

Dropping rows that repeat an earlier description (the first occurrence is kept), and rows that have no price.

In [9]:
# Two filters in one chain:
#   1. keep the first row for each distinct description,
#   2. discard rows whose price is missing.
clean_wines = (
    clean_wines
    .drop_duplicates(subset='description')
    .dropna(subset=['price'])
)

Tokenizing the description column to break apart the sentence into individual words, which will then be parsed.

In [10]:
# Run NLTK's word_tokenize on every description; each cell of the new column
# holds the list of tokens for that row.
clean_wines['tokenized_text'] = clean_wines['description'].map(word_tokenize)

NLTK's English stop word list is all lower-case, so we lower-case every token in this column; otherwise capitalised stop words (e.g. "The") would slip past the filter.

In [11]:
def _lowercase_tokens(tokens):
    """Return a new list containing every token of `tokens` in lower case."""
    return [token.lower() for token in tokens]


# Normalise the case of every token list so later comparisons are case-insensitive.
clean_wines['tokenized_text'] = clean_wines['tokenized_text'].apply(_lowercase_tokens)

We remove stop words here.

In [22]:
# Extend the stop word list with tokens we also want removed.
# The membership check makes the cell safe to re-run: an unconditional append
# would add another copy of each entry on every execution.
# NOTE(review): a lone space is unlikely to be produced by the tokenizer — confirm this entry is needed.
for extra_stop_word in ("'s", ' '):
    if extra_stop_word not in stop:
        stop.append(extra_stop_word)


In [23]:
# Snapshot the stop words into a set: membership tests against a set are
# constant-time, whereas testing every token against the list scans the whole
# list each time. The tokens that survive are exactly the same.
stop_lookup = set(stop)

# Keep only the tokens that are not stop words, preserving their order.
clean_wines['tokenized_text'] = clean_wines['tokenized_text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_lookup]
)

Resetting the index to help with the merge.

In [24]:
# Earlier filters removed rows, leaving gaps in the index. Renumber it 0..n-1
# (drop=True discards the old index instead of keeping it as a column) so that
# it lines up with the freshly built descriptors frame in the index-based merge.
clean_wines = clean_wines.reset_index(drop=True)

Creating a separate dataframe that explodes each individual string in the list to a separate column.

In [25]:
# One list of tokens per wine, in row order.
token_rows = clean_wines['tokenized_text'].tolist()

# Spread each list across columns (first token in column 0, second in column 1, ...),
# leaving missing values where a row has fewer tokens than the longest one,
# then label the columns desc_0, desc_1, ...
descriptors = pd.DataFrame(token_rows).add_prefix('desc_')

Resetting the index to help with the merge.

In [26]:
# Renumber the index 0..n-1 without keeping the old one as a column.
# NOTE(review): a frame built from a plain list should already carry this
# default index, so this is presumably a defensive no-op before the merge.
descriptors = descriptors.reset_index(drop=True)

Merging the two dataframes on the reset index.

In [27]:
# Attach the token columns to the wine rows by matching index values on both
# sides (default inner join, so only index values present in both frames remain).
expanded_df = clean_wines.merge(
    descriptors,
    left_index=True,
    right_index=True,
)

Dropping the tokenized and description columns.

In [28]:
# The token list and the original text are dropped now that the tokens have
# been spread across the desc_* columns.
text_columns = ['tokenized_text', 'description']
expanded_df = expanded_df.drop(columns=text_columns)

Through the tokenization and splitting the strings, I have a ton of leftover punctuation. This strips that from the dataframe.

In [29]:
# Character class matching any single ASCII punctuation character.
# string.punctuation contains regex metacharacters (backslash, ']', '^', '-'),
# so it must be escaped before being placed inside [...]: unescaped, the
# sequence `\]` is read as an escaped bracket, which leaves the backslash out
# of the class and can trigger a nested-set warning.
punctuation_pattern = '[' + re.escape(string.punctuation) + ']'

# Only the token columns hold tokenizer leftovers. Running the replace over the
# whole frame would also strip hyphens, apostrophes and periods from winery,
# variety and region names.
token_columns = [col for col in expanded_df.columns if str(col).startswith('desc_')]
if token_columns:
    expanded_df[token_columns] = expanded_df[token_columns].replace(
        punctuation_pattern, '', regex=True
    )

Exporting to CSV to use in Tableau.

In [30]:
# Optional export of the cleaned frame without the token columns (disabled).
#clean_wines.to_csv('../Data/cleaned_wine_data.csv',index=False)

# Write the expanded frame without the index column.
expanded_csv_path = '../Data/expanded_wine_data.csv'
expanded_df.to_csv(expanded_csv_path, index=False)