In [136]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [137]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

In [138]:
import pandas as pd
import numpy as np

Import our white and red wine data from the csvs

In [139]:
red_wine_df = pd.read_csv("winequality-red.csv", sep=';')
white_wine_df = pd.read_csv("winequality-white.csv", sep=';')
print(white_wine_df.count())
print(red_wine_df.count())

fixed acidity           4898
volatile acidity        4898
citric acid             4898
residual sugar          4898
chlorides               4898
free sulfur dioxide     4898
total sulfur dioxide    4898
density                 4898
pH                      4898
sulphates               4898
alcohol                 4898
quality                 4898
dtype: int64
fixed acidity           1599
volatile acidity        1599
citric acid             1599
residual sugar          1599
chlorides               1599
free sulfur dioxide     1599
total sulfur dioxide    1599
density                 1599
pH                      1599
sulphates               1599
alcohol                 1599
quality                 1599
dtype: int64


## Balanced (Normalized) Data for White Wine Data (1599 data points) and Red Wine Data (1599 data points) 

In [140]:
#take a random sample of white wines to match the red wine data
random_white_df = white_wine_df.sample(1599,random_state=1).reset_index(drop=True)

Only balanced values are baseline condition we can use:

In [156]:
#if we want our only features of the model to have balanced white wine and red wine data, we use this data frame:
baseline_red_df = red_wine_df.copy()
baseline_white_df = random_white_df.copy()
only_balanced_df = pd.concat([baseline_red_df, baseline_white_df])
only_balanced_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,5.4,0.29,0.38,1.2,0.029,31.0,132.0,0.98895,3.28,0.36,12.4,6
1595,6.3,0.41,0.33,4.7,0.023,28.0,110.0,0.99100,3.30,0.38,12.5,7
1596,6.8,0.29,0.32,1.8,0.032,18.0,130.0,0.99095,3.05,0.62,11.2,6
1597,6.3,0.36,0.28,2.5,0.035,18.0,73.0,0.98868,3.10,0.47,12.8,7


In [193]:
#Suppose we want to take specific ranges in only_balanced_df, which does not include colors
range_balanced_df = only_balanced_df.loc[\
(only_balanced_df["quality"] <= 7 ) & \
(only_balanced_df["quality"] >= 5 )  \
] 


In [194]:
range_balanced_df.groupby('quality').count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,1164,1164,1164,1164,1164,1164,1164,1164,1164,1164,1164
6,1338,1338,1338,1338,1338,1338,1338,1338,1338,1338,1338
7,516,516,516,516,516,516,516,516,516,516,516


In [159]:
#if we want to include wine colors as features in our model:
color_red_df = red_wine_df.copy()
color_white_df = random_white_df.copy()
color_red_df["color"] = "red"
color_white_df["color"] = "white"
colors_df = pd.concat([color_red_df, color_white_df])

Added Color with get dummies (Or in case we want to remove it)

In [145]:
#if we only want to use the balanced data and include the colors of wine:
dummy_colors_df = pd.get_dummies(colors_df, columns=['color'])
dummy_drop_colors_df = dummy_colors_df.drop(columns=["color_red"]).copy()
dummy_rename_colors_df = dummy_drop_colors_df.rename(columns={"color_white":"colors"})
dummy_rename_colors_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,colors
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,5.4,0.29,0.38,1.2,0.029,31.0,132.0,0.98895,3.28,0.36,12.4,6,1
1595,6.3,0.41,0.33,4.7,0.023,28.0,110.0,0.99100,3.30,0.38,12.5,7,1
1596,6.8,0.29,0.32,1.8,0.032,18.0,130.0,0.99095,3.05,0.62,11.2,6,1
1597,6.3,0.36,0.28,2.5,0.035,18.0,73.0,0.98868,3.10,0.47,12.8,7,1


If we want to hypertune further, and choose a specfic range of quality scores, so our model predicts a smaller range of quality scores

In [186]:
#For our model that includes our `colors` feature:
range_colors_df = dummy_rename_colors_df.loc[\
(dummy_rename_colors_df["quality"] <= 7 ) & \
(dummy_rename_colors_df["quality"] >= 5 )  \
] 


In [187]:
#To show which range we are using
range_colors_df.groupby('quality').count()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,colors
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5,1164,1164,1164,1164,1164,1164,1164,1164,1164,1164,1164,1164
6,1338,1338,1338,1338,1338,1338,1338,1338,1338,1338,1338,1338
7,516,516,516,516,516,516,516,516,516,516,516,516


Add buckets to given range of quality values

### Moved quality values into buckets

# Unbalanced (Denormalized) Data for White Wine Data (4898 data points) and Red Wine Data (1599 data points) 

# Read the CSV and Perform Basic Data Cleaning

In [247]:
only_balanced_df.alcohol = only_balanced_df.alcohol.astype(int)

In [255]:
dummy_rename_colors_df.alcohol = dummy_rename_colors_df.alcohol.astype(int)

In [248]:
only_balanced_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                   int64
quality                   int64
dtype: object

In [256]:
dummy_rename_colors_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                   int64
quality                   int64
colors                    uint8
dtype: object

# Select your features (columns)

In [264]:
# Set features. Drop quality, y value. 
X = dummy_rename_colors_df.drop(columns='colors')
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9,5


# Create a Train Test Split

Use `quality` for the y values

In [265]:
from sklearn.model_selection import train_test_split
#80% train, 20% test. y is this one column
y = dummy_rename_colors_df['colors']
#random state 42 will have same picks for x test and y test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [266]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1251,7.5,0.58,0.14,2.2,0.077,27.0,60.0,0.9963,3.28,0.59,9,5
1080,10.3,0.27,0.56,1.4,0.047,3.0,8.0,0.99471,3.16,0.51,11,6
272,6.5,0.32,0.3,2.3,0.051,20.0,127.0,0.98964,3.13,0.52,12,6
1187,7.0,0.43,0.3,2.0,0.085,6.0,39.0,0.99346,3.33,0.46,11,6
212,11.6,0.44,0.64,2.1,0.059,5.0,15.0,0.998,3.21,0.67,10,6


# Train the Model using Random Forest 



In [267]:
#Train the model
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [268]:
#Test the model
from sklearn.metrics import confusion_matrix, classification_report
predictions = rfc.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       418
           1       0.99      1.00      0.99       382

    accuracy                           0.99       800
   macro avg       0.99      0.99      0.99       800
weighted avg       0.99      0.99      0.99       800



# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'rfcredwhitewine567.sav'
joblib.dump(predictions, filename)