In [18]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [19]:
# Load the wine data from Data/US_wine_data_ml.csv into a DataFrame
US_wine_data_df_ml = pd.read_csv(filepath_or_buffer="Data/US_wine_data_ml.csv")

In [20]:
# Count the distinct values in each column to see how many categories each one holds
US_wine_data_df_ml.nunique()

points         21
region_1      266
variety       257
price_bins      6
dtype: int64

In [21]:
# Tally how often each variety occurs; these counts are used for binning
variety_counts = US_wine_data_df_ml["variety"].value_counts()
variety_counts

Pinot Noir                   9857
Cabernet Sauvignon           7280
Chardonnay                   6773
Syrah                        3232
Red Blend                    2948
                             ... 
Merlot-Petite Verdot            1
Pinot Gris-Gewürztraminer       1
Malbec-Cabernet Sauvignon       1
Sangiovese Grosso               1
Pinot Nero                      1
Name: variety, Length: 257, dtype: int64

In [26]:
# Determine which values to replace.  Per the plot, we first tried <500, but after
# looking at the results we decided to be more inclusive, since some mainstream
# wines would otherwise be binned in "Other".  Therefore we lowered the threshold
# for "Other" to 300.
replace_variety = list(variety_counts[variety_counts <= 300].index)

# Replace in dataframe: relabel every rare variety in a single vectorized pass
# rather than calling .replace() once per variety (each call rescans the column).
is_rare_variety = US_wine_data_df_ml["variety"].isin(replace_variety)
US_wine_data_df_ml["variety"] = US_wine_data_df_ml["variety"].where(~is_rare_variety, "Other")

# Check to make sure binning was successful
US_wine_data_df_ml["variety"].value_counts()

Pinot Noir                  9857
Cabernet Sauvignon          7280
Chardonnay                  6773
Other                       4569
Syrah                       3232
Red Blend                   2948
Zinfandel                   2705
Merlot                      2300
Sauvignon Blanc             2154
Bordeaux-style Red Blend    1817
Riesling                    1745
Cabernet Franc               999
Rosé                         900
Pinot Gris                   863
Viognier                     786
Petite Sirah                 745
Rhône-style Red Blend        690
Sparkling Blend              678
White Blend                  634
Malbec                       533
Grenache                     520
Sangiovese                   429
Gewürztraminer               405
Pinot Grigio                 360
Tempranillo                  343
Name: variety, dtype: int64

In [27]:
# Tally how often each region_1 value occurs; these counts are used for binning
region_1_counts = US_wine_data_df_ml["region_1"].value_counts()
region_1_counts

Other                        9568
Napa Valley                  4475
Columbia Valley (WA)         4109
Russian River Valley         3090
California                   2629
Paso Robles                  2327
Willamette Valley            2296
Finger Lakes                 1556
Sonoma Coast                 1467
Sonoma County                1245
Walla Walla Valley (WA)      1061
Carneros                      999
Santa Barbara County          994
Yakima Valley                 984
Sta. Rita Hills               971
Dry Creek Valley              936
Santa Ynez Valley             806
Santa Lucia Highlands         802
Lodi                          799
Alexander Valley              780
Central Coast                 737
Red Mountain                  719
Santa Maria Valley            701
Sonoma Valley                 660
Anderson Valley               646
North Fork of Long Island     596
Dundee Hills                  554
Horse Heaven Hills            554
Virginia                      552
Santa Cruz Mou

In [28]:
# From the plot the curve breaks around 500, but the cutoff actually applied
# here is 300: every region_1 value with 300 or fewer rows is binned as "Other".
replace_region_1 = list(region_1_counts[region_1_counts <= 300].index)

# Replace in dataframe: relabel every rare region in a single vectorized pass
# rather than calling .replace() once per region (each call rescans the column).
is_rare_region_1 = US_wine_data_df_ml["region_1"].isin(replace_region_1)
US_wine_data_df_ml["region_1"] = US_wine_data_df_ml["region_1"].where(~is_rare_region_1, "Other")

# Check to make sure binning was successful
US_wine_data_df_ml["region_1"].value_counts()

Other                        9568
Napa Valley                  4475
Columbia Valley (WA)         4109
Russian River Valley         3090
California                   2629
Paso Robles                  2327
Willamette Valley            2296
Finger Lakes                 1556
Sonoma Coast                 1467
Sonoma County                1245
Walla Walla Valley (WA)      1061
Carneros                      999
Santa Barbara County          994
Yakima Valley                 984
Sta. Rita Hills               971
Dry Creek Valley              936
Santa Ynez Valley             806
Santa Lucia Highlands         802
Lodi                          799
Alexander Valley              780
Central Coast                 737
Red Mountain                  719
Santa Maria Valley            701
Sonoma Valley                 660
Anderson Valley               646
North Fork of Long Island     596
Dundee Hills                  554
Horse Heaven Hills            554
Virginia                      552
Santa Cruz Mou

## Encode Categorical Data

In [12]:
# Create variable to hold categorical columns for OneHotEncoder.
# These are the columns passed to the encoder and later dropped from the
# dataframe once their one-hot columns have been merged in.
wine_cat = ["variety", "region_1"]

In [30]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` (the old
# name was removed later), so try the new spelling first and fall back to the
# old one on releases that do not know it.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(US_wine_data_df_ml[wine_cat])

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
feature_names_fn = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Build the encoded dataframe with the encoded variable names as columns.
# Reusing the source index keeps rows aligned when the two frames are later
# merged on their indexes.
encode_df = pd.DataFrame(
    encoded_values,
    index=US_wine_data_df_ml.index,
    columns=feature_names_fn(wine_cat),
)

# A bare `encode_df.shape` in the middle of a cell is not displayed, so print it.
print(encode_df.shape)
encode_df.head()

Unnamed: 0,variety_Bordeaux-style Red Blend,variety_Cabernet Franc,variety_Cabernet Sauvignon,variety_Chardonnay,variety_Gewürztraminer,variety_Grenache,variety_Malbec,variety_Merlot,variety_Other,variety_Petite Sirah,...,region_1_Sonoma Coast,region_1_Sonoma County,region_1_Sonoma Valley,region_1_Sta. Rita Hills,region_1_Virginia,region_1_Wahluke Slope,region_1_Walla Walla Valley (WA),region_1_Washington,region_1_Willamette Valley,region_1_Yakima Valley
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Attach the one-hot columns by index, then remove the original categorical columns
US_wine_data_df_ml = (
    US_wine_data_df_ml
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=wine_cat)
)
print(US_wine_data_df_ml.shape)
US_wine_data_df_ml.head()

(54265, 73)


Unnamed: 0,points,price_bins,variety_Bordeaux-style Red Blend,variety_Cabernet Franc,variety_Cabernet Sauvignon,variety_Chardonnay,variety_Gewürztraminer,variety_Grenache,variety_Malbec,variety_Merlot,...,region_1_Sonoma Coast,region_1_Sonoma County,region_1_Sonoma Valley,region_1_Sta. Rita Hills,region_1_Virginia,region_1_Wahluke Slope,region_1_Walla Walla Valley (WA),region_1_Washington,region_1_Willamette Valley,region_1_Yakima Valley
0,87,<$15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,87,<$15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,87,$60-100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,87,$15-30,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,87,$30-60,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Write the preprocessed dataframe to Data/US_wine_data_enc.csv (without the index)
# for future reference.
US_wine_data_df_ml.to_csv(path_or_buf="Data/US_wine_data_enc.csv", index=False)

In [33]:
# Reload the encoded data from Data/US_wine_data_enc.csv
US_wine_data_df_ml = pd.read_csv(filepath_or_buffer="Data/US_wine_data_enc.csv")

## Split Preprocessed Data

In [37]:
# Separate the preprocessed data into target and feature arrays
target_column = "price_bins"

# Target: the price bin label of each row
y = US_wine_data_df_ml[target_column].values

# Features: every remaining column
X = US_wine_data_df_ml.drop(columns=[target_column]).values

In [38]:
# Split the preprocessed data into a training and testing dataset.
# test_size=0.25 is what scikit-learn uses when neither size is given; it is
# spelled out here so the split ratio is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [39]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training features only (fit() returns the scaler itself)
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits
# NOTE(review): the EasyEnsembleClassifier cell in this notebook fits on the
# unscaled X_train; confirm whether these scaled arrays are meant to be used.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [40]:
# Show the first five scaled training rows
print(X_train_scaled[:5])

[[ 1.7447257  -0.18748622 -0.13627336 -0.39263251 -0.37863285 -0.08675177
  -0.09669522 -0.10100397 -0.2118444  -0.30298795 -0.11693615 -0.07971786
  -0.12407186  2.12635075 -0.24094144 -0.1137647  -0.18114385 -0.12878626
  -0.08646379 -0.2036718  -0.11332066 -0.25468221 -0.07734215 -0.12294403
  -0.1119785  -0.22727754 -0.12107719 -0.08617488 -0.10738964 -0.22475533
  -0.13701973 -0.11801104 -0.08002942 -0.28744384 -0.13305798 -0.10100397
  -0.08559422 -0.07456046 -0.17204439 -0.10075542 -0.08958218 -0.12139021
  -0.08156985 -0.08486296 -0.0794051  -0.07877596 -0.30231076 -0.08187454
  -0.10633115 -0.09232853 -0.46208804 -0.21006536 -0.11519662 -0.08456874
   4.05794933 -0.08617488 -0.13877747 -0.10087977 -0.12284102 -0.11332066
  -0.12294403 -0.08703883 -0.16674872 -0.15384664 -0.11141474 -0.13467458
  -0.1016228  -0.08018476 -0.14178246 -0.08972143 -0.20750193 -0.13636686]
 [ 1.42378935 -0.18748622 -0.13627336 -0.39263251 -0.37863285 -0.08675177
  10.34177232 -0.10100397 -0.2118444 

# Machine Learning

## EasyEnsembleClassifier

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import EasyEnsembleClassifier

In [46]:
# Build an EasyEnsembleClassifier and train it on the training split
# (fit() returns the fitted estimator, so construction and fitting are chained)
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

# Predict price bins for the held-out test split
y_pred = eec.predict(X_test)
eec

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [47]:
# Calculate the balanced accuracy score
# NOTE(review): the result is stored under the name `accuracy_score`, which
# rebinds the function of the same name imported from sklearn.metrics earlier
# in the notebook; the name is kept because a later cell reads it.
from sklearn.metrics import balanced_accuracy_score
accuracy_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_score

0.39787685396188066

In [48]:
# Show the confusion matrix of true test labels against predicted labels
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1992, 1186,   86,   10, 1379,  725],
       [1064, 1177,  599,   48,  566, 1611],
       [ 112,  150,  234,   46,   69,  566],
       [  17,   13,   18,   27,    9,  190],
       [ 661,  166,    7,    2,  814,   21],
       [   0,    0,    0,    0,    0,    2]], dtype=int64)

In [51]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score

# The headline metric is the balanced accuracy, not plain accuracy, so compute
# it here and label it as such. Computing it in this cell also avoids relying
# on a notebook variable named `accuracy_score`, which collides with the
# sklearn.metrics function of the same name.
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy Score : {balanced_acc}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Accuracy Score : 0.39787685396188066
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

     $15-30       0.52      0.37      0.77      0.43      0.54      0.27      5378
     $30-60       0.44      0.23      0.82      0.30      0.44      0.18      5065
    $60-100       0.25      0.20      0.94      0.22      0.43      0.17      1177
    100-500       0.20      0.10      0.99      0.13      0.31      0.09       274
       <$15       0.29      0.49      0.83      0.36      0.64      0.39      1671
   too much       0.00      1.00      0.77      0.00      0.88      0.79         2

avg / total       0.43      0.31      0.82      0.35      0.50      0.24     13567

