In [1]:
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [3]:
red_wine_df = pd.read_csv("Resources/winequality-red.csv", sep=';')
white_wine_df = pd.read_csv("Resources/winequality-white.csv", sep=';')
# white_wine_df.info()
# red_wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64

### Unbalanced Data for both White Wine Data (4898 data points) and Red Wine Data (1599 data points) 

The model consists of all white and red wine data

In [4]:
baseline_red_df = red_wine_df.copy()
baseline_white_df = white_wine_df.copy()
baseline_red_df["color"] = "red"
baseline_white_df["color"] = "white"

baseline_wine_df = pd.concat([baseline_red_df, baseline_white_df])
all_wine_df = baseline_wine_df.drop(columns="color")

color_wine_df = pd.get_dummies(baseline_wine_df,columns=['color'])
colors_df = color_wine_df.drop(columns=["color_red"]).copy()

dummy_colors_df = colors_df.rename(columns={"color_white":"color"})
#all_wine_df

To test specific ranges in all_wine_df, not including colors, use: `range_unbalanced_df`

In [31]:
range_unbalanced_df = dummy_colors_df.loc[\
(dummy_colors_df["quality"] <= 8 ) & \
(dummy_colors_df["quality"] >= 3 )  \
]

Check the count for each quality score in `range_unbalanced_df`

In [6]:
unbalanced_count_df = range_unbalanced_df.groupby('quality').count()
#unbalanced_count_df

To test specific ranges in dummy_colors_df, including colors, use: `range_colors_df`

In [7]:
range_colors_df = dummy_colors_df.loc[\
(dummy_colors_df["quality"] <= 8 ) & \
(dummy_colors_df["quality"] >= 3 )  \
]

Check the count for each quality score in `range_colors_df`

In [8]:
colors_count_df = range_colors_df.groupby('quality').count()
#colors_count_df

## Feature Engineering:  Moving quality scores into buckets

Created THREE bins for unbalanced data called: "terrible (1), mediocre (2), and great (3)"
The quality scores range from 0 to 4, 5 to 6, and 7 to 10. We will call them 1,2,3.

In [9]:
three_bins_df = all_wine_df.copy()
bins = [0, 4, 6, 9]
group_names = [1, 2, 3]
three_bins_df["bin_quality"] = pd.cut(three_bins_df["quality"], bins, labels=group_names)
three_bins_qual_df = three_bins_df.drop(columns=["quality"])
#three_bins_qual_df

Check the count for each quality score in `three_bins_qual_df`

In [10]:
three_bins_count = three_bins_qual_df.groupby("bin_quality").count()
three_bins_count

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,246,246,246,246,246,246,246,246,246,246,246
2,4974,4974,4974,4974,4974,4974,4974,4974,4974,4974,4974
3,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277


Created FOUR bins for balanced data called: "terrible (1), mediocre (2), great (3),and terrific (4)"
The quality scores range from 0 to 4, 5, 6, and 7 to 9. We will call them 1,2,3,4.

In [32]:
four_bins_df = all_wine_df.copy()
bins = [0, 4, 5, 6, 9]
group_names = [1, 2, 3, 4]
four_bins_df["bin_quality"] = pd.cut(four_bins_df["quality"], bins, labels=group_names)
four_qual_bins_df = four_bins_df.drop(columns="quality")
#four_qual_bins_df

Check the count for each quality score in `four_qual_bins_df`

In [12]:
four_bins_count = four_qual_bins_df.groupby("bin_quality").count()
four_bins_count

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
bin_quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,246,246,246,246,246,246,246,246,246,246,246
2,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
3,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
4,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277,1277


# Select your features (columns)

Drop bin_quality column, if dataframes `four_qual_bins_df` or `three_bins_qual_df` are used. Use the following:


In [20]:
# X = four_qual_bins_df.drop(columns=['bin_quality'])
# X = three_bins_qual_df.drop(columns=['bin_quality'])

In [21]:
from sklearn.model_selection import train_test_split
# Set features.
X = four_qual_bins_df.drop(columns=['bin_quality'])


In [22]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


# Create a Train Test Split

Use `quality` for the y values

In [24]:
#80% train, 20% test. y is this one column
y = four_qual_bins_df['bin_quality']

#random state 42 will have same picks for x test and y test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [25]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1539,7.2,0.39,0.32,1.8,0.065,34.0,60.0,0.99714,3.46,0.78,9.9
1109,10.8,0.47,0.43,2.1,0.171,27.0,66.0,0.9982,3.17,0.76,10.8
100,8.3,0.61,0.3,2.1,0.084,11.0,50.0,0.9972,3.4,0.61,10.2
3878,6.5,0.35,0.31,10.2,0.069,58.0,170.0,0.99692,3.18,0.49,9.4
4817,5.8,0.385,0.25,3.7,0.031,38.0,122.0,0.99128,3.2,0.63,11.2


# Train the Model using Random Forest 



In [26]:
#Train the model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
#Test the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.75      0.14      0.23        66
           2       0.68      0.75      0.72       518
           3       0.68      0.72      0.70       738
           4       0.72      0.62      0.67       303

    accuracy                           0.69      1625
   macro avg       0.71      0.56      0.58      1625
weighted avg       0.69      0.69      0.68      1625



In [28]:
#Feature importances as a percentage

feature_list = list(X.columns)
feature_imp = pd.Series(rfc.feature_importances_,index=feature_list).sort_values(ascending=False)
feature_imp

alcohol                 0.130144
density                 0.108376
volatile acidity        0.099717
total sulfur dioxide    0.089539
chlorides               0.087336
sulphates               0.085311
free sulfur dioxide     0.084068
residual sugar          0.081501
pH                      0.080723
citric acid             0.079249
fixed acidity           0.074036
dtype: float64

In [29]:
#The Accuracy score

print("The Accuracy Score =", accuracy_score(y_test, predictions))

The Accuracy Score = 0.688


In [30]:
#Our predictions vs actual predictions

print(predictions[:10])
print(y_test[:10])

[3 2 4 2 2 3 2 3 2 4]
1504    4
1419    2
3162    4
3091    3
2433    3
1297    3
174     2
3985    3
561     2
4347    4
Name: bin_quality, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]


# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

# import joblib
# filename = 'Saved Models/fdasfmdlafsalfms.sav'
# joblib.dump(predictions, filename)