In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [2]:
import pandas as pd
import numpy as np

Import our white and red wine data from the csvs

# Read the CSV and Perform Basic Data Cleaning

In [5]:
red_wine_df = pd.read_csv("Resources/winequality-red.csv", sep=';')
white_wine_df = pd.read_csv("Resources/winequality-white.csv", sep=';')
white_wine_df.info()
red_wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64

## Unbalanced (Denormalized) Data for White Wine Data (4898 data points) and Red Wine Data (1599 data points) 

We want to use all the white and red wine data baseline condition we can use:

In [6]:
#if we want our only features of the model to consist of all white and red wine data, we use this data frame:
baseline_red_df = red_wine_df.copy()
baseline_white_df = white_wine_df.copy()
baseline_red_df["color"] = "red"
baseline_white_df["color"] = "white"
baseline_wine_df = pd.concat([baseline_red_df, baseline_white_df])
color_wine_df = pd.get_dummies(baseline_wine_df,columns=['color'])
colors_df = color_wine_df.drop(columns=["color_red"]).copy()
dummy_colors_df = colors_df.rename(columns={"color_white":"color"})
dummy_colors_df.to_csv("unbalanced-red-white-wine.csv")

In [31]:
all_wine_df = baseline_wine_df.drop(columns="color")

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193
9,5,5,5,5,5,5,5,5,5,5,5


In [5]:
dummy_colors_df.groupby('quality').count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,30,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193,193
9,5,5,5,5,5,5,5,5,5,5,5,5


In [6]:
#Suppose we want to take specific ranges in only_balanced_df, which does not include colors
range_colors_df = dummy_colors_df.loc[\
(dummy_colors_df["quality"] <= 8 ) & \
(dummy_colors_df["quality"] >= 3 )  \
] 


In [7]:
range_colors_df.groupby('quality').count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,30,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193,193


In [8]:
# no_colors_df = range_colors_df.drop(columns="color")
# no_colors_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [9]:
no_colors_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [10]:
no_colors_df.groupby("quality").count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193


In [11]:

new_col_bal_df = no_colors_df.copy()
new_col_bal_df["fsd/tsd"] = new_col_bal_df["free sulfur dioxide"]/new_col_bal_df["total sulfur dioxide"]
new_col_bal_df["alc/den"] = new_col_bal_df["alcohol"]/new_col_bal_df["density"]
new_col_bal_df["sug/den"] = new_col_bal_df["residual sugar"]/new_col_bal_df["density"]
col_bal_df = new_col_bal_df.drop(columns=['free sulfur dioxide','total sulfur dioxide',"alcohol","density","residual sugar"])
col_bal_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,pH,sulphates,quality,fsd/tsd,alc/den,sug/den
0,7.4,0.70,0.00,0.076,3.51,0.56,5,0.323529,9.420726,1.904189
1,7.8,0.88,0.00,0.098,3.20,0.68,5,0.373134,9.831461,2.608347
2,7.8,0.76,0.04,0.092,3.26,0.65,5,0.277778,9.829488,2.306921
3,11.2,0.28,0.56,0.075,3.16,0.58,6,0.283333,9.819639,1.903808
4,7.4,0.70,0.00,0.076,3.51,0.56,5,0.323529,9.420726,1.904189
...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,0.039,3.27,0.50,6,0.260870,11.300119,1.614303
4894,6.6,0.32,0.36,0.047,3.15,0.46,5,0.339286,9.649211,8.041009
4895,6.5,0.24,0.19,0.041,2.99,0.46,6,0.270270,9.470651,1.209019
4896,5.5,0.29,0.30,0.022,3.34,0.38,7,0.181818,12.946424,1.112583


In [12]:
new_col_bal_df.groupby("quality").count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,fsd/tsd,alc/den,sug/den
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3,30,30,30,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193,193,193,193


Add buckets to given range of quality values

### Moved quality values into buckets

Created bins for balanced data called: "terrible (0), mediocre (1), and great (2)"
The quality scores range from 0 to 4, 5 to 6, and 7 to 10. We will call them 0,1,2.

In [78]:
bin_balanced_df = new_col_bal_df.copy()
bins = [0, 4, 5, 6, 8]
group_names = [1, 2, 3, 4]
bin_balanced_df["bin_quality"] = pd.cut(bin_balanced_df["quality"], bins, labels=group_names)
bin_qual_df = bin_balanced_df.drop(columns=["quality",'residual sugar','free sulfur dioxide','free sulfur dioxide','density', 'alcohol'])
# bin_qual_df = bin_balanced_df.drop(columns=["quality"])

bin_qual_df
# columns=['bin_quality']
# pd.get_dummies(df, drop_first=True)
# bin_balanced_df.sample(30)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,pH,sulphates,fsd/tsd,alc/den,sug/den,bin_quality
0,7.4,0.70,0.00,0.076,34.0,3.51,0.56,0.323529,9.420726,1.904189,2
1,7.8,0.88,0.00,0.098,67.0,3.20,0.68,0.373134,9.831461,2.608347,2
2,7.8,0.76,0.04,0.092,54.0,3.26,0.65,0.277778,9.829488,2.306921,2
3,11.2,0.28,0.56,0.075,60.0,3.16,0.58,0.283333,9.819639,1.903808,3
4,7.4,0.70,0.00,0.076,34.0,3.51,0.56,0.323529,9.420726,1.904189,2
...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,0.039,92.0,3.27,0.50,0.260870,11.300119,1.614303,3
4894,6.6,0.32,0.36,0.047,168.0,3.15,0.46,0.339286,9.649211,8.041009,2
4895,6.5,0.24,0.19,0.041,111.0,2.99,0.46,0.270270,9.470651,1.209019,3
4896,5.5,0.29,0.30,0.022,110.0,3.34,0.38,0.181818,12.946424,1.112583,4


In [14]:
bin_qual_df.groupby("bin_quality").count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,fsd/tsd,alc/den,sug/den
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,246,246,246,246,246,246,246,246,246,246,246,246,246,246
2,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
3,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
4,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
5,193,193,193,193,193,193,193,193,193,193,193,193,193,193


Now we want to add colors with the convert quality scores to bins to see what happens

In [15]:
bins_colors_df = dummy_colors_df.copy()
bins = [0, 4, 5, 6, 7, 8]
group_names = [1, 2, 3, 4, 5]
bins_colors_df["bin_quality"] = pd.cut(bins_colors_df["quality"], bins, labels=group_names)
new_bins_colors_df = bins_colors_df.drop(columns="quality")
new_bins_colors_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color,bin_quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,2
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0,2
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0,3
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1,3
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,1,2
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1,3
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1,4


In [16]:
new_bins_colors_df.groupby("bin_quality").count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,246,246,246,246,246,246,246,246,246,246,246,246
2,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
3,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
4,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
5,193,193,193,193,193,193,193,193,193,193,193,193


In [64]:
# no_colors_df.groupby("quality").count()

In [27]:
dummy_colors_df.groupby("quality").count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,30,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193,193
9,5,5,5,5,5,5,5,5,5,5,5,5


# Select your features (columns)

In [47]:
# Set features. Drop quality, y value. 
X = all_wine_df.drop(columns=['quality'])

In [48]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


# Create a Train Test Split

Use `quality` for the y values

In [49]:
from sklearn.model_selection import train_test_split
#80% train, 20% test. y is this one column
y = all_wine_df['quality']
#random state 42 will have same picks for x test and y test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [50]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1539,7.2,0.39,0.32,1.8,0.065,34.0,60.0,0.99714,3.46,0.78,9.9
1109,10.8,0.47,0.43,2.1,0.171,27.0,66.0,0.9982,3.17,0.76,10.8
100,8.3,0.61,0.3,2.1,0.084,11.0,50.0,0.9972,3.4,0.61,10.2
3878,6.5,0.35,0.31,10.2,0.069,58.0,170.0,0.99692,3.18,0.49,9.4
4817,5.8,0.385,0.25,3.7,0.031,38.0,122.0,0.99128,3.2,0.63,11.2


# Train the Model using Random Forest 



In [51]:
#Train the model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [52]:
#Test the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         9
           4       0.78      0.12      0.21        57
           5       0.67      0.75      0.71       518
           6       0.67      0.73      0.70       738
           7       0.69      0.57      0.62       259
           8       0.87      0.30      0.45        43
           9       0.00      0.00      0.00         1

    accuracy                           0.67      1625
   macro avg       0.52      0.35      0.38      1625
weighted avg       0.68      0.67      0.66      1625



  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
feature_list = list(X.columns)
feature_imp = pd.Series(rfc.feature_importances_,index=feature_list).sort_values(ascending=False)
print("The Accuracy Score =", accuracy_score(y_test, predictions))
feature_imp

The Accuracy Score = 0.6732307692307692


alcohol                 0.126464
volatile acidity        0.101247
density                 0.100758
total sulfur dioxide    0.089202
chlorides               0.086595
sulphates               0.086433
residual sugar          0.084941
pH                      0.084713
free sulfur dioxide     0.084622
citric acid             0.079955
fixed acidity           0.075069
dtype: float64

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'unbalanced-with-color.sav'
joblib.dump(predictions, filename)