# Python Group Presentation

### Using NLP and Machine Learning to predict price of wines using wine reviews

#### Group Members 
Onno Ho,
Sabrina Lin,
Shaun Ang,
Natalie Rohr,
Shaun Whitmarsh,
Jemma Shin

### Import and Download the NLP Module 

nltk.download()

### Import Dataset, Clean, and View 

In [6]:
import nltk
import os
import pandas as pd
import numpy as np
import math as math

#view your current working directory 
print("Current Working Directory: " , os.getcwd())

#import data

data = pd.read_csv('winemag-data-130k-v2.csv') #imports dataset as 'data'
data.rename(columns={'Unnamed: 0':'Index Number'}, inplace=True) #renames first column
data.head() #views top few rows of the dataframe


Current Working Directory:  C:\Users\Shaun


Unnamed: 0,Index Number,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


### Lower case description

In [5]:
data['description'] = data['description'].str.lower()

### Tokenize Descriptive Text

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

data['description_tokenized'] = data['description'].apply(nltk.word_tokenize)
print(data['description_tokenized'])

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Shaun/nltk_data'
    - 'C:\\Users\\Shaun\\Anaconda3\\nltk_data'
    - 'C:\\Users\\Shaun\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Shaun\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Shaun\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


### Remove Stopwords

In [4]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

data['filtered_description'] = data['description_tokenized'].apply(lambda x: [item for item in x if item not in stop_words])
print(data['filtered_description'])

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\Shaun/nltk_data'
    - 'C:\\Users\\Shaun\\Anaconda3\\nltk_data'
    - 'C:\\Users\\Shaun\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Shaun\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Shaun\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


### Stem Words

In [None]:
from nltk.stem import SnowballStemmer

ps = SnowballStemmer("english")

data['stemmed_description'] = data['filtered_description'].apply(lambda x: [ps.stem(y) for y in x])
print(data['stemmed_description'])

### Add back cleaned description to the dataset, remove working columns

In [None]:
data['description_clean']= data['stemmed_description']

data = data.drop(columns=['description_tokenized', 'filtered_description','stemmed_description'])

data.head()

### Data Summary Stats

In [None]:
print("There are {} observations and {} features in this dataset. \n".format(data.shape[0],data.shape[1]))

print("There are {} types of wine in this dataset such as {}... \n".format(len(data.variety.unique()),
                                                                           ", ".join(data.variety.unique()[0:7])))

print("There are {} countries producing wine in this dataset such as {}... \n".format(len(data.country.unique()),
                                                                                      ", ".join(data.country.unique()[0:7])))

### Extract the Wine Vintage from the Name ('title') of the Wine

In [8]:
from dateutil.parser import parse

n_rows = len(data.index)

vintage = []

for x in range(n_rows):
    try:
        vintage.append(parse(data.at[x,'title'], fuzzy=True).year)
    except ValueError:
        vintage.append(None)
        continue
        
data['vintage'] = pd.Series(vintage) 



In [9]:
data.head()

Unnamed: 0,Index Number,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,vintage
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013.0
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011.0
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013.0
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013.0
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012.0


### Clean up wine variety into 6 common wines and others

In [73]:
n_rows = len(data.index)

variety_cleaned = []

for x in range(n_rows):
    y = data.at[x,'variety']
    if 'Chardonnay' == y:
        z = 'Chardonnay'
    elif 'Sauvignon Blanc' == y:
        z = 'Sauvignon Blanc'
    elif 'Riesling' == y:
        z = 'Riesling'
    elif 'Cabernet Sauvignon' == y:
        z = 'Cabernet Sauvignon'
    elif 'Merlot' == y:
        z = 'Merlot'
    elif 'Pinot Noir' == y:
        z = 'Pinot Noir'
    else:
        z = 'Others'
    variety_cleaned.append(z)

data['variety_cleaned'] = pd.Series(variety_cleaned)
data.head()

Unnamed: 0,Index Number,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,vintage,variety_cleaned
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013.0,Others
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011.0,Others
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013.0,Others
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013.0,Riesling
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012.0,Pinot Noir


In [74]:
np.count_nonzero(data['variety_cleaned'] == 'Chardonnay')
#11753 Chardonnay

11753

In [75]:
np.count_nonzero(data['variety_cleaned'] == 'Sauvignon Blanc')
#4967 Sauvignon Blanc

4967

In [76]:
np.count_nonzero(data['variety_cleaned'] == 'Riesling')
#5189 Riesling

5189

In [77]:
np.count_nonzero(data['variety_cleaned'] == 'Cabernet Sauvignon')
#9472 Riesling

9472

In [78]:
np.count_nonzero(data['variety_cleaned'] == 'Merlot')
#3102 Merlot

3102

In [79]:
np.count_nonzero(data['variety_cleaned'] == 'Pinot Noir')
#13272 Pinot Noir

13272

In [80]:
np.count_nonzero(data['variety_cleaned'] == 'Others')
#82216 Others

82216

### Remove NA Values in Price 

In [None]:
data = data.dropna(subset=['price'])

### Optimization

In [None]:
from gurobipy import *

data.head()

data1 = data.values

title, country, points, price, variety, vintage = multidict({item[11]: (item[1],item[4],item[5],item[12],item[15]) for item in data.values})

b = 12 #number of bottles

m = Model('wine_box')

x = m.addVars(title, vtype=GRB.INTEGER,lb=0, name = 'wine') 

m.setObjective(quicksum((price[i]*x[i]) for i in title), GRB.MINIMIZE)

#1) There are 12 bottles 

#2) First 6 bottles will be the 6 most common varieties. Chardonnay, Sauvignon Blanc, Riesling, Cabernet Sauvignon, Merlot, Pinot Noir

#3) No 3 bottles can come from the same country

#4) No 3 bottles can come from the same vintage year

#5) The 12 bottles should have an average score of more than 88.44 points (which is above the mean)

#m.optimize()

# print optimal solutions
#for v in m.getVars():
 #   print('%s %g' % (v.varName, v.x))
    
#print optimal value
#print('Obj: %g' % m.objVal)

In [None]:
country