In [3]:
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [5]:
red_wine_df = pd.read_csv("Resources/winequality-red.csv", sep=';')
white_wine_df = pd.read_csv("Resources/winequality-white.csv", sep=';')
white_wine_df.info()
red_wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64

### Balanced (Normalized) Data for White Wine Data (1599 data points) and Red Wine Data (1599 data points) 

Take a random sample of 1599 white wines from the total of 4898 white wines, so that the red and white wine data is balanced.

In [7]:
random_white_df = white_wine_df.sample(1599,random_state=42).reset_index(drop=True)

Only balanced values are baseline condition we can use:

In [8]:
#if we want our only features of the model to have balanced white wine and red wine data, we use this data frame:
baseline_red_df = red_wine_df.copy()
baseline_white_df = random_white_df.copy()
only_balanced_df = pd.concat([baseline_red_df, baseline_white_df])
#only_balanced_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.7,0.34,0.31,16.4,0.051,20.0,146.0,0.99834,3.06,0.54,9.1,5
1595,9.3,0.31,0.49,1.3,0.042,34.0,147.0,0.99480,3.11,0.46,9.8,5
1596,6.4,0.17,0.27,6.7,0.036,88.0,223.0,0.99480,3.28,0.35,10.2,6
1597,7.5,0.29,0.36,15.7,0.050,29.0,124.0,0.99680,3.06,0.54,10.4,5


Check the count for each quality score in `only_balanced_df`

In [24]:
only_balanced_count = only_balanced_df.groupby('quality').count()
#only_balanced_count

Take a specific ranges of quality scores in only_balanced_df (which does not include colors) called `range_balanced_df`


In [16]:
range_balanced_df = only_balanced_df.loc[\
(only_balanced_df["quality"] <= 7 ) & \
(only_balanced_df["quality"] >= 4 )  \
] 

#range_balanced_df

To check the counts for each quality score in `range_balanced_df`

In [25]:
ranged_balanced_count = range_balanced_df.groupby('quality').count()
#ranged_balanced_count

Added Color with get dummies (Or in case we want to remove it)

In [29]:
#if we want to include wine colors as features in our model:
color_red_df = red_wine_df.copy()
color_white_df = random_white_df.copy()
color_red_df["color"] = "red"
color_white_df["color"] = "white"
colors_df = pd.concat([color_red_df, color_white_df])

#if we only want to use the balanced data and include the colors of wine:
#0 is red, 1 is white


dummy_colors_df = pd.get_dummies(colors_df, columns=['color'])
dummy_drop_colors_df = dummy_colors_df.drop(columns=["color_red"]).copy()
dummy_rename_colors_df = dummy_drop_colors_df.rename(columns={"color_white":"colors"})

#dummy_rename_colors_df

If we want to hypertune further, and choose a specfic range of quality scores, so our model predicts a smaller range of quality scores

In [27]:
#For our model that includes our `colors` feature, if we want include a range of quality scores:
range_colors_df = dummy_rename_colors_df.loc[\
(dummy_rename_colors_df["quality"] <= 6 ) & \
(dummy_rename_colors_df["quality"] >= 3 )  \
] 
#range_colors_df

Quality score count of `range_colors_df`

In [28]:
range_colors_count = range_colors_df.groupby('quality').count()
#range_colors_count

## Created another dataframe that moves quality scores into bins

Created FOUR bins for balanced data called: "terrible (1), mediocre (2), great (3) and terrific (4)"
The quality scores range from 0 to 4, 5, 6, and 7 to 10. We will call the bins 1,2,3,4

In [33]:
four_bins_balanced_df = only_balanced_df.copy()
bins = [0, 4, 5, 6, 10]
group_names = [1, 2, 3, 4]
four_bins_balanced_df["bin_quality"] = pd.cut(four_bins_balanced_df["quality"], bins, labels=group_names)
four_bins_df = four_bins_balanced_df.drop(columns="quality")
four_bins_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,bin_quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,2
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,3
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.7,0.34,0.31,16.4,0.051,20.0,146.0,0.99834,3.06,0.54,9.1,2
1595,9.3,0.31,0.49,1.3,0.042,34.0,147.0,0.99480,3.11,0.46,9.8,2
1596,6.4,0.17,0.27,6.7,0.036,88.0,223.0,0.99480,3.28,0.35,10.2,3
1597,7.5,0.29,0.36,15.7,0.050,29.0,124.0,0.99680,3.06,0.54,10.4,2


In [34]:
four_bins_count = four_bins_df.groupby("bin_quality").count()
four_bins_count

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,114,114,114,114,114,114,114,114,114,114,114
2,1146,1146,1146,1146,1146,1146,1146,1146,1146,1146,1146
3,1363,1363,1363,1363,1363,1363,1363,1363,1363,1363,1363
4,575,575,575,575,575,575,575,575,575,575,575


Created THREE bins for balanced data called: "terrible (1), mediocre (2), and great (3)"
The quality scores range from 0 to 4, 5 to 6, and 7 to 10. We will call them 1,2,3.

In [53]:
three_bins_balanced_df = only_balanced_df.copy()
bins = [0, 4, 6, 10]
group_names = [1, 2, 3]
three_bins_balanced_df["bin_quality"] = pd.cut(three_bins_balanced_df["quality"], bins, labels=group_names)
three_bins_df = three_bins_balanced_df.drop(columns="quality")
three_bins_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,bin_quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,2
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,2
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.7,0.34,0.31,16.4,0.051,20.0,146.0,0.99834,3.06,0.54,9.1,2
1595,9.3,0.31,0.49,1.3,0.042,34.0,147.0,0.99480,3.11,0.46,9.8,2
1596,6.4,0.17,0.27,6.7,0.036,88.0,223.0,0.99480,3.28,0.35,10.2,2
1597,7.5,0.29,0.36,15.7,0.050,29.0,124.0,0.99680,3.06,0.54,10.4,2


In [54]:
three_bins_df.groupby("bin_quality").count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,114,114,114,114,114,114,114,114,114,114,114
2,2509,2509,2509,2509,2509,2509,2509,2509,2509,2509,2509
3,575,575,575,575,575,575,575,575,575,575,575


Finally, we binned it by the lowest count of lowest quality score to further balance the data:

In [52]:
new_bins_df = three_bins_df.copy()

In [None]:
bad_df = new_bins_df.loc[bin_balanced_df['bin_quality']==0,:]
good_df = new_bins_df.loc[bin_balanced_df['bin_quality']==1,:]
great_df = bin_balanced_df.loc[bin_balanced_df['bin_quality']==2,:]

# Select your features (columns)

In [46]:
# Set features. Drop quality, y value. 
X = four_bins_df.drop(columns=['bin_quality'])
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


# Create a Train Test Split

Use `quality` for the y values

In [47]:
from sklearn.model_selection import train_test_split
#80% train, 20% test. y is this one column
y = four_bins_df['bin_quality']
#random state 42 will have same picks for x test and y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1435,6.8,0.26,0.4,7.5,0.046,45.0,179.0,0.99583,3.2,0.49,9.3
977,5.9,0.26,0.21,12.5,0.034,36.0,152.0,0.9972,3.28,0.43,9.5
533,10.3,0.27,0.24,2.1,0.072,15.0,33.0,0.9956,3.22,0.66,12.8
1061,9.1,0.4,0.5,1.8,0.071,7.0,16.0,0.99462,3.21,0.69,12.5
1027,7.4,0.18,0.36,13.1,0.056,72.0,163.0,1.0,3.42,0.35,9.1


# Train the Model using Random Forest 



In [49]:
#Train the model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
#Test the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.33      0.05      0.09        20
           2       0.68      0.70      0.69       243
           3       0.60      0.63      0.62       270
           4       0.65      0.62      0.63       107

    accuracy                           0.64       640
   macro avg       0.56      0.50      0.51       640
weighted avg       0.63      0.64      0.63       640



In [51]:

feature_list = list(X.columns)
feature_imp = pd.Series(rfc.feature_importances_,index=feature_list).sort_values(ascending=False)
print("The Accuracy Score =", accuracy_score(y_test, predictions))
feature_imp

The Accuracy Score = 0.6375


alcohol                 0.144521
volatile acidity        0.106860
density                 0.097289
sulphates               0.091001
total sulfur dioxide    0.087956
chlorides               0.086613
residual sugar          0.080677
free sulfur dioxide     0.079426
citric acid             0.078305
pH                      0.076500
fixed acidity           0.070853
dtype: float64

# Save the Model

In [52]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
# import joblib
# filename = 'Saved Models/balanced_colors_rfc.sav'
# joblib.dump(predictions, filename)

['Saved Models/balanced_colors_rfc.sav']