# Reduce Wine Dataset for Dashboard

In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Load the joined US wine dataset from the project's Data folder, print its
# (rows, columns) shape, and preview the first five rows.
US_wine_data_df = pd.read_csv("Data/US_wine_data_joined.csv")
print(US_wine_data_df.shape)
US_wine_data_df.head()

(53374, 12)


Unnamed: 0,description,points,price,province,region_1,region_2,title,variety,winery,region,Latitude,Longitude
0,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Willamette Valley,44.942554,-122.933762
1,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,Lake Michigan Shore,42.22087,-86.369469
2,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Willamette Valley,44.942554,-122.933762
3,"Soft, supple plum envelopes an oaky structure ...",87,19.0,California,Napa Valley,Napa,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Napa Valley,38.502469,-122.265389
4,"Slightly reduced, this wine offers a chalky, t...",87,34.0,California,Alexander Valley,Sonoma,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini,Alexander Valley,38.612965,-122.769435


In [3]:
# region_1 and region_2 are not needed for the dashboard: the "region" column
# will suffice and has latitudes and longitudes.
dashboard_wine_data_df = US_wine_data_df.drop(
    columns=["region_1", "region_2"]
)
print(dashboard_wine_data_df.shape)
dashboard_wine_data_df.tail(3)

(53374, 10)


Unnamed: 0,description,points,price,province,title,variety,winery,region,Latitude,Longitude
53371,This opens with herbaceous dollops of thyme an...,90,35.0,California,Hendry 2012 Blocks 7 & 22 Zinfandel (Napa Valley),Zinfandel,Hendry,Napa Valley,38.502469,-122.265389
53372,This Zinfandel from the eastern section of Nap...,90,22.0,California,Houdini 2011 Zinfandel (Chiles Valley),Zinfandel,Houdini,Chiles Valley,38.532564,-122.326147
53373,Citation is given as much as a decade of bottl...,90,75.0,Oregon,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation,Oregon,43.804133,-120.554201


In [4]:
# Look at info: column names, non-null counts, and dtypes of the reduced frame.
dashboard_wine_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53374 entries, 0 to 53373
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   description  53374 non-null  object 
 1   points       53374 non-null  int64  
 2   price        53374 non-null  float64
 3   province     53374 non-null  object 
 4   title        53374 non-null  object 
 5   variety      53374 non-null  object 
 6   winery       53374 non-null  object 
 7   region       53374 non-null  object 
 8   Latitude     53374 non-null  float64
 9   Longitude    53374 non-null  float64
dtypes: float64(3), int64(1), object(6)
memory usage: 4.1+ MB


In [5]:
# Create price bins.
# pd.cut uses right-closed intervals by default, so these edges produce
# (0, 15], (15, 30], (30, 60], (60, 100], (100, 500], (500, 5000].
# A price outside (0, 5000] matches no bin and is left as NaN.
price_bins = [0, 15, 30, 60, 100, 500, 5000]
price_bins_names = ["<$15","$15-30","$30-60","$60-100", "100-500","too much"]

# Make a new column in dataframe for price bins.
dashboard_wine_data_df["price_bins"] = pd.cut(dashboard_wine_data_df["price"], price_bins, labels=price_bins_names)

# Show how many wines fall in each bin, in bin order. This is printed explicitly
# because a notebook only auto-displays the last expression of a cell.
# dropna=False is passed so that unbinned (NaN) prices are counted too.
print(dashboard_wine_data_df["price_bins"].value_counts(sort=False, dropna=False))

# Check dataframe
dashboard_wine_data_df.tail()

Unnamed: 0,description,points,price,province,title,variety,winery,region,Latitude,Longitude,price_bins
53369,Hailing from one of the more popular vineyards...,90,20.0,California,Birichino 2013 Jurassic Park Vineyard Old Vine...,Chenin Blanc,Birichino,Santa Ynez Valley,34.584154,-120.097369,$15-30
53370,There's no bones about the use of oak in this ...,90,35.0,California,Flora Springs 2013 Barrel Fermented Chardonnay...,Chardonnay,Flora Springs,Napa Valley,38.502469,-122.265389,$30-60
53371,This opens with herbaceous dollops of thyme an...,90,35.0,California,Hendry 2012 Blocks 7 & 22 Zinfandel (Napa Valley),Zinfandel,Hendry,Napa Valley,38.502469,-122.265389,$30-60
53372,This Zinfandel from the eastern section of Nap...,90,22.0,California,Houdini 2011 Zinfandel (Chiles Valley),Zinfandel,Houdini,Chiles Valley,38.532564,-122.326147,$15-30
53373,Citation is given as much as a decade of bottl...,90,75.0,Oregon,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation,Oregon,43.804133,-120.554201,$60-100


In [6]:
# Count wines per region; these counts decide which regions get binned.
region_counts = dashboard_wine_data_df["region"].value_counts()
region_counts

Napa Valley                               4475
Columbia Valley (WA)                      4109
Russian River Valley                      3090
California                                2629
Paso Robles                               2327
                                          ... 
Sonoma-Napa-Monterey                         1
Hawaii                                       1
Monterey-San Luis Obispo-Santa Barbara       1
Mendocino-Amador                             1
California-Oregon                            1
Name: region, Length: 248, dtype: int64

In [7]:
# Reduce regions list using same cut-off that was used for machine learning model:
# every region with 300 or fewer wines is collapsed into a single "Other" label.
replace_region = list(region_counts[region_counts <= 300].index)

# Replace in dataframe.
# A single vectorized pass relabels all rare regions at once, instead of
# scanning the whole column once per rare region.
dashboard_wine_data_df["region"] = dashboard_wine_data_df["region"].mask(
    dashboard_wine_data_df["region"].isin(replace_region), "Other"
)

# Check to make sure binning was successful
dashboard_wine_data_df.region.value_counts()

Other                        9396
Napa Valley                  4475
Columbia Valley (WA)         4109
Russian River Valley         3090
California                   2629
Paso Robles                  2327
Willamette Valley            2296
Finger Lakes                 1556
Sonoma Coast                 1467
Sonoma County                1245
Walla Walla Valley (WA)      1061
Carneros                      999
Santa Barbara County          994
Yakima Valley                 984
Sta. Rita Hills               971
Dry Creek Valley              936
Santa Ynez Valley             806
Santa Lucia Highlands         802
Lodi                          799
Alexander Valley              780
Central Coast                 737
Santa Maria Valley            701
Sonoma Valley                 660
Anderson Valley               646
North Fork of Long Island     596
Horse Heaven Hills            554
Dundee Hills                  554
Virginia                      552
Santa Cruz Mountains          547
Oregon        

In [8]:
# Remove regions where region count <= 300 (the rows labelled "Other").
# .copy() makes the filtered result an independent frame, so the column
# assignments made in later cells do not trigger pandas' SettingWithCopyWarning.
dashboard_wine_data_df = dashboard_wine_data_df[dashboard_wine_data_df.region != "Other"].copy()
print(dashboard_wine_data_df.shape)
dashboard_wine_data_df.head()

(43978, 11)


Unnamed: 0,description,points,price,province,title,variety,winery,region,Latitude,Longitude,price_bins
0,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Willamette Valley,44.942554,-122.933762,<$15
2,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Willamette Valley,44.942554,-122.933762,$60-100
3,"Soft, supple plum envelopes an oaky structure ...",87,19.0,California,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Napa Valley,38.502469,-122.265389,$15-30
4,"Slightly reduced, this wine offers a chalky, t...",87,34.0,California,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini,Alexander Valley,38.612965,-122.769435,$30-60
5,Building on 150 years and six generations of w...,87,12.0,California,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou,Central Coast,29.539237,-95.065481,<$15


In [9]:
# Count wines per variety; these counts decide which varieties get binned.
variety_counts = dashboard_wine_data_df["variety"].value_counts()
variety_counts

Pinot Noir                  8569
Chardonnay                  5884
Cabernet Sauvignon          5450
Syrah                       2551
Red Blend                   2416
                            ... 
Silvaner                       1
Valvin Muscat                  1
Clairette                      1
Syrah-Merlot                   1
Roussanne-Grenache Blanc       1
Name: variety, Length: 217, dtype: int64

In [10]:
# Reduce variety list using same cut-off that was used for machine learning model:
# every variety with 300 or fewer wines is collapsed into a single "Other" label.
replace_variety = list(variety_counts[variety_counts <= 300].index)

# Replace in dataframe.
# A single vectorized pass relabels all rare varieties at once, instead of
# scanning the whole column once per rare variety.
dashboard_wine_data_df["variety"] = dashboard_wine_data_df["variety"].mask(
    dashboard_wine_data_df["variety"].isin(replace_variety), "Other"
)

# Check to make sure binning was successful
dashboard_wine_data_df.variety.value_counts()

Pinot Noir                  8569
Chardonnay                  5884
Cabernet Sauvignon          5450
Other                       3633
Syrah                       2551
Red Blend                   2416
Zinfandel                   2278
Merlot                      1898
Sauvignon Blanc             1691
Bordeaux-style Red Blend    1433
Riesling                    1332
Cabernet Franc               766
Pinot Gris                   754
Rosé                         742
Viognier                     593
Rhône-style Red Blend        585
Petite Sirah                 584
White Blend                  515
Sparkling Blend              510
Malbec                       410
Grenache                     407
Sangiovese                   347
Gewürztraminer               322
Pinot Grigio                 308
Name: variety, dtype: int64

In [11]:
# Remove varieties where variety count <= 300 (the rows labelled "Other").
# .copy() makes the filtered result an independent frame, so adding columns to
# it in later cells does not trigger pandas' SettingWithCopyWarning.
dashboard_wine_data_df = dashboard_wine_data_df[dashboard_wine_data_df.variety != "Other"].copy()
print(dashboard_wine_data_df.shape)
dashboard_wine_data_df.head()

(40345, 11)


Unnamed: 0,description,points,price,province,title,variety,winery,region,Latitude,Longitude,price_bins
0,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Willamette Valley,44.942554,-122.933762,<$15
2,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Willamette Valley,44.942554,-122.933762,$60-100
3,"Soft, supple plum envelopes an oaky structure ...",87,19.0,California,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Napa Valley,38.502469,-122.265389,$15-30
4,"Slightly reduced, this wine offers a chalky, t...",87,34.0,California,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini,Alexander Valley,38.612965,-122.769435,$30-60
5,Building on 150 years and six generations of w...,87,12.0,California,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou,Central Coast,29.539237,-95.065481,<$15


In [12]:
# Create wine categories/types column, seeded with each wine's variety.
# assign() returns a new frame rather than writing a column into the filtered
# frame produced by the previous step, which avoids SettingWithCopyWarning.
dashboard_wine_data_df = dashboard_wine_data_df.assign(type=dashboard_wine_data_df["variety"])

In [13]:
# Categorize varieties into wine types: "Red", "White" or "Pink".

rose = ["Rosé"]
red = ["Pinot Noir", "Cabernet Sauvignon", "Syrah", "Red Blend", "Zinfandel", "Merlot","Bordeaux-style Red Blend", 
       "Cabernet Franc", "Rhône-style Red Blend", "Petite Sirah", "Malbec", "Grenache", "Sangiovese"]
# "White Blend" is a white wine, so it is classified here and not with the reds.
white = ["Chardonnay", "Sauvignon Blanc","Riesling","Pinot Gris","Viognier", "Sparkling Blend", "Gewürztraminer", 
         "Pinot Grigio", "White Blend"]

# Build one variety -> type lookup so the "type" column is rewritten in a
# single replace pass. Varieties missing from the lookup keep their value.
type_by_variety = {
    **dict.fromkeys(white, "White"),
    **dict.fromkeys(rose, "Pink"),
    **dict.fromkeys(red, "Red"),
}

# Nested-dict form: only values in the "type" column are replaced.
dashboard_wine_data_df = dashboard_wine_data_df.replace({"type": type_by_variety})
dashboard_wine_data_df.head()


Unnamed: 0,description,points,price,province,title,variety,winery,region,Latitude,Longitude,price_bins,type
0,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Willamette Valley,44.942554,-122.933762,<$15,White
2,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Willamette Valley,44.942554,-122.933762,$60-100,Red
3,"Soft, supple plum envelopes an oaky structure ...",87,19.0,California,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Napa Valley,38.502469,-122.265389,$15-30,Red
4,"Slightly reduced, this wine offers a chalky, t...",87,34.0,California,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini,Alexander Valley,38.612965,-122.769435,$30-60,Red
5,Building on 150 years and six generations of w...,87,12.0,California,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou,Central Coast,29.539237,-95.065481,<$15,White


In [14]:
# Put the columns in the order the wine finder displays them:
# location first, then the wine's identity, then score/price, then the review.
dashboard_wine_data_df = dashboard_wine_data_df[
    [
        "province", "region", "Latitude", "Longitude",
        "winery", "variety", "type", "title",
        "points", "price", "price_bins",
        "description",
    ]
]
print(dashboard_wine_data_df.shape)
dashboard_wine_data_df.head(1)

(40345, 12)


Unnamed: 0,province,region,Latitude,Longitude,winery,variety,type,title,points,price,price_bins,description
0,Oregon,Willamette Valley,44.942554,-122.933762,Rainstorm,Pinot Gris,White,Rainstorm 2013 Pinot Gris (Willamette Valley),87,14.0,<$15,"Tart and snappy, the flavors of lime flesh and..."


In [15]:
# Save reduced wine dataframe to the project's Data folder.
# index=False keeps the DataFrame index out of the file, so the CSV contains
# only the data columns.
dashboard_wine_data_df.to_csv("Data/dashboard_wine_data.csv", index=False)