In [1]:
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [3]:
red_wine_df = pd.read_csv("Resources/winequality-red.csv", sep=';')
white_wine_df = pd.read_csv("Resources/winequality-white.csv", sep=';')
# white_wine_df.info()
# red_wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64

### Unbalanced Data for both White Wine Data (4898 data points) and Red Wine Data (1599 data points) 

The model consists of all white and red wine data

In [4]:
baseline_red_df = red_wine_df.copy()
baseline_white_df = white_wine_df.copy()
baseline_red_df["color"] = "red"
baseline_white_df["color"] = "white"

baseline_wine_df = pd.concat([baseline_red_df, baseline_white_df])
all_wine_df = baseline_wine_df.drop(columns="color")

color_wine_df = pd.get_dummies(baseline_wine_df,columns=['color'])
colors_df = color_wine_df.drop(columns=["color_red"]).copy()

dummy_colors_df = colors_df.rename(columns={"color_white":"color"})
#all_wine_df

To test specific ranges in all_wine_df, not including colors, use: `range_unbalanced_df`

In [31]:
range_unbalanced_df = dummy_colors_df.loc[\
(dummy_colors_df["quality"] <= 8 ) & \
(dummy_colors_df["quality"] >= 3 )  \
]

Check the count for each quality score in `range_unbalanced_df`

In [6]:
unbalanced_count_df = range_unbalanced_df.groupby('quality').count()
#unbalanced_count_df

To test specific ranges in dummy_colors_df, including colors, use: `range_colors_df`

In [7]:
range_colors_df = dummy_colors_df.loc[\
(dummy_colors_df["quality"] <= 8 ) & \
(dummy_colors_df["quality"] >= 3 )  \
]

Check the count for each quality score in `range_colors_df`

In [8]:
colors_count_df = range_colors_df.groupby('quality').count()
#colors_count_df

## Feature Engineering:  Moving quality scores into buckets

Created FOUR bins for unbalanced data called: "terrible (1), mediocre (2), great (3),and terrific (4)"
The quality scores range from 0 to 4, 5, 6, and 7 to 9. We will call the bins 1,2,3,4.

In [32]:
four_bins_df = all_wine_df.copy()
bins = [0, 4, 5, 6, 9]
group_names = [1, 2, 3, 4]
four_bins_df["bin_quality"] = pd.cut(four_bins_df["quality"], bins, labels=group_names)
four_qual_bins_df = four_bins_df.drop(columns="quality")
#four_qual_bins_df

Check the count for each quality score in `four_qual_bins_df`

In [12]:
four_bins_count = four_qual_bins_df.groupby("bin_quality").count()
four_bins_count

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,246,246,246,246,246,246,246,246,246,246,246
2,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
3,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
4,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277


Created THREE bins for unbalanced data called: "terrible (1), mediocre (2), and great (3)"
The quality scores range from 0 to 4, 5 to 6, and 7 to 10. We will call the bins 1,2,3.

In [35]:
three_bins_df = all_wine_df.copy()
bins = [0, 4, 6, 9]
group_names = [1, 2, 3]
three_bins_df["bin_quality"] = pd.cut(three_bins_df["quality"], bins, labels=group_names)
three_bins_qual_df = three_bins_df.drop(columns=["quality"])
three_bins_qual_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,bin_quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,2
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,2
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,2
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,2
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,2
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,3


Check the count for each quality score in `three_bins_qual_df`

In [36]:
three_bins_count = three_bins_qual_df.groupby("bin_quality").count()
three_bins_count

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,246,246,246,246,246,246,246,246,246,246,246
2,4974,4974,4974,4974,4974,4974,4974,4974,4974,4974,4974
3,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277


## Finally, we binned it by the lowest count of lowest quality score to further balance the data:

In [40]:
new_bins_df = three_bins_qual_df.copy()

In [41]:
bad_df = new_bins_df.loc[new_bins_df['bin_quality']==1,:]
good_df = new_bins_df.loc[new_bins_df['bin_quality']==2,:]
great_df = new_bins_df.loc[new_bins_df['bin_quality']==3,:]

In [42]:
normalized_good_df = good_df.sample(n=246,random_state=42)
normalized_great_df = great_df.sample(n=246,random_state=42)

In [60]:
bin_normal_df = pd.concat([bad_df,normalized_good_df,normalized_great_df])
bin_normal_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,bin_quality
18,7.4,0.59,0.08,4.4,0.086,6.0,29.0,0.9974,3.38,0.5,9.0,1
38,5.7,1.13,0.09,1.5,0.172,7.0,19.0,0.994,3.5,0.48,9.8,1
41,8.8,0.61,0.3,2.8,0.088,17.0,46.0,0.9976,3.26,0.51,9.3,1
45,4.6,0.52,0.15,2.1,0.054,8.0,65.0,0.9934,3.9,0.56,13.1,1
73,8.3,0.675,0.26,2.1,0.084,11.0,43.0,0.9976,3.31,0.53,9.2,1


# Select your features (columns)

Drop bin_quality column, if dataframes `four_qual_bins_df` or `three_bins_qual_df` are used. Use the following:


In [20]:
# X = four_qual_bins_df.drop(columns=['bin_quality'])
# X = three_bins_qual_df.drop(columns=['bin_quality'])

In [54]:
from sklearn.model_selection import train_test_split
# Set features.
X = bin_normal_df.drop(columns=['bin_quality'])


In [55]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
18,7.4,0.59,0.08,4.4,0.086,6.0,29.0,0.9974,3.38,0.5,9.0
38,5.7,1.13,0.09,1.5,0.172,7.0,19.0,0.994,3.5,0.48,9.8
41,8.8,0.61,0.3,2.8,0.088,17.0,46.0,0.9976,3.26,0.51,9.3
45,4.6,0.52,0.15,2.1,0.054,8.0,65.0,0.9934,3.9,0.56,13.1
73,8.3,0.675,0.26,2.1,0.084,11.0,43.0,0.9976,3.31,0.53,9.2


# Create a Train Test Split

Use `quality` for the y values

In [56]:
#80% train, 20% test. y is this one column
y = bin_normal_df['bin_quality']

#random state 42 will have same picks for x test and y test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [57]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
445,7.1,0.32,0.32,11.0,0.038,16.0,66.0,0.9937,3.24,0.4,11.5
1369,6.6,0.61,0.0,1.6,0.069,4.0,8.0,0.99396,3.33,0.37,10.4
3618,7.1,0.27,0.24,12.6,0.044,48.0,118.0,0.99726,3.04,0.56,10.0
3692,7.1,0.2,0.27,9.6,0.037,19.0,105.0,0.99444,3.04,0.37,10.5
3503,6.9,0.33,0.31,4.2,0.04,21.0,93.0,0.9896,3.18,0.48,13.4


# Train the Model using Random Forest 



In [58]:
#Train the model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [59]:
#Test the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.69      0.72      0.71        69
           2       0.58      0.50      0.54        62
           3       0.70      0.78      0.74        54

    accuracy                           0.66       185
   macro avg       0.66      0.67      0.66       185
weighted avg       0.66      0.66      0.66       185



In [53]:
#Feature importances as a percentage

feature_list = list(X.columns)
feature_imp = pd.Series(rfc.feature_importances_,index=feature_list).sort_values(ascending=False)
feature_imp

alcohol                 0.124435
free sulfur dioxide     0.121318
volatile acidity        0.105437
total sulfur dioxide    0.102567
density                 0.089854
sulphates               0.085789
chlorides               0.085232
residual sugar          0.078190
pH                      0.070551
citric acid             0.069795
fixed acidity           0.066832
dtype: float64

In [29]:
#The Accuracy score

print("The Accuracy Score =", accuracy_score(y_test, predictions))

The Accuracy Score = 0.688


In [30]:
#Our predictions vs actual predictions

print(predictions[:10])
print(y_test[:10])

[3 2 4 2 2 3 2 3 2 4]
1504    4
1419    2
3162    4
3091    3
2433    3
1297    3
174     2
3985    3
561     2
4347    4
Name: bin_quality, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]


# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

# import joblib
# filename = 'Saved Models/fdasfmdlafsalfms.sav'
# joblib.dump(predictions, filename)